Unverified Commit ecd0d06a authored by twang, committed by GitHub

[Feature] Support monocular 3D detection on nuScenes (#392)

* Support nuscenes mono3d json info generation

* Support nuscenes mono3d dataset class

* Support attribute and bbox2d prediction in bbox3dnms and bbox3d2result

* Rename dataset class and add comments to 'attrs'

* Support mono3d related pipelines

* Fix unittest for loading 3D annotations

* Add unit test for nuscenes mono3d dataset

* Rename the sample result file

* Upload sample data for mono3d unit test

* Upload sample data for mono3d unit test

* Upload sample image for unit test

* Delete tests/data/nuscenes/samples/LIDAR_TOP/CAM_BACK_LEFT directory

* Add files via upload

* Remove unnecessary 'f'

* Remove unnecessary \ in arguments

* Remove check for pycocotools version because it is already done in CocoDataset

* Remove unnecessary comma, add TODO and change init of attrs in format_results

* Merge RandomFlip3D and RandomFlipMono3D

* Add pytest to check whether cuda is available in the unit test

* Add visualization TODO

* Remove useless init in loading mono3d images
parent 3a5a2010
@@ -335,21 +335,40 @@ def rotation_points_single_angle(points, angle, axis=0):
return points @ rot_mat_T, rot_mat_T
def points_cam2img(points_3d, proj_mat):
def points_cam2img(points_3d, proj_mat, with_depth=False):
"""Project points in camera coordinates to image coordinates.
Args:
points_3d (np.ndarray): Points in shape (N, 3)
proj_mat (np.ndarray): Transformation matrix between coordinates, with shape (3, 3), (3, 4) or (4, 4).
with_depth (bool): Whether to keep depth in the output.
Returns:
np.ndarray: Points in image coordinates with shape [N, 2], or [N, 3] with the depth appended as the last column when `with_depth` is True.
"""
points_shape = list(points_3d.shape)
points_shape[-1] = 1
assert len(proj_mat.shape) == 2, 'The dimension of the projection'\
f' matrix should be 2 instead of {len(proj_mat.shape)}.'
d1, d2 = proj_mat.shape[:2]
assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
d1 == 4 and d2 == 4), 'The shape of the projection matrix'\
f' ({d1}*{d2}) is not supported.'
if d1 == 3:
proj_mat_expanded = np.eye(4, dtype=proj_mat.dtype)
proj_mat_expanded[:d1, :d2] = proj_mat
proj_mat = proj_mat_expanded
points_4 = np.concatenate([points_3d, np.ones(points_shape)], axis=-1)
point_2d = points_4 @ proj_mat.T
point_2d_res = point_2d[..., :2] / point_2d[..., 2:3]
if with_depth:
points_2d_depth = np.concatenate([point_2d_res, point_2d[..., 2:3]],
axis=-1)
return points_2d_depth
return point_2d_res
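
A minimal usage sketch of the extended helper (intrinsic values made up for illustration):

import numpy as np

# Hypothetical pinhole intrinsics: focal length 1000, principal point (800, 450).
K = np.array([[1000., 0., 800.],
              [0., 1000., 450.],
              [0., 0., 1.]])
pts = np.array([[0.5, 0.2, 10.0]])  # one point in camera coordinates, shape (1, 3)
uvd = points_cam2img(pts, K, with_depth=True)
# uvd == [[850., 470., 10.]]: u = fx * x / z + cx, v = fy * y / z + cy, depth = z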
@@ -124,11 +124,11 @@ def points_cam2img(points_3d, proj_mat):
points_num = list(points_3d.shape)[:-1]
points_shape = np.concatenate([points_num, [1]], axis=0).tolist()
assert len(proj_mat.shape) == 2, f'The dimension of the projection'\
f'matrix should be 2 instead of {len(proj_mat.shape)}.'
assert len(proj_mat.shape) == 2, 'The dimension of the projection'\
f' matrix should be 2 instead of {len(proj_mat.shape)}.'
d1, d2 = proj_mat.shape[:2]
assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
d1 == 4 and d2 == 4), f'The shape of the projection matrix'\
d1 == 4 and d2 == 4), 'The shape of the projection matrix'\
f' ({d1}*{d2}) is not supported.'
if d1 == 3:
proj_mat_expanded = torch.eye(
@@ -46,13 +46,15 @@ def bbox3d2roi(bbox_list):
return rois
def bbox3d2result(bboxes, scores, labels):
def bbox3d2result(bboxes, scores, labels, attrs=None):
"""Convert detection results to a list of numpy arrays.
Args:
bboxes (torch.Tensor): Bounding boxes with shape of (n, 5).
scores (torch.Tensor): Scores with shape of (n, ).
labels (torch.Tensor): Labels with shape of (n, ).
attrs (torch.Tensor, optional): Attributes with shape of (n, ). \
Defaults to None.
Returns:
dict[str, torch.Tensor]: Bounding box results in cpu mode.
@@ -60,8 +62,14 @@ def bbox3d2result(bboxes, scores, labels):
- boxes_3d (torch.Tensor): 3D boxes.
- scores (torch.Tensor): Prediction scores.
- labels_3d (torch.Tensor): Box labels.
- attrs_3d (torch.Tensor, optional): Box attributes.
"""
return dict(
result_dict = dict(
boxes_3d=bboxes.to('cpu'),
scores_3d=scores.cpu(),
labels_3d=labels.cpu())
if attrs is not None:
result_dict['attrs_3d'] = attrs.cpu()
return result_dict
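
A short sketch of the extended interface (tensor values hypothetical):

import torch
from mmdet3d.core.bbox import CameraInstance3DBoxes

boxes = CameraInstance3DBoxes(torch.rand(2, 9), box_dim=9)  # two detections
scores = torch.tensor([0.9, 0.4])
labels = torch.tensor([0, 7])
attrs = torch.tensor([6, 2])  # attribute indices, e.g. vehicle.parked / pedestrian.moving
res = bbox3d2result(boxes, scores, labels, attrs=attrs)
assert set(res) == {'boxes_3d', 'scores_3d', 'labels_3d', 'attrs_3d'}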
@@ -11,7 +11,9 @@ def box3d_multiclass_nms(mlvl_bboxes,
score_thr,
max_num,
cfg,
mlvl_dir_scores=None):
mlvl_dir_scores=None,
mlvl_attr_scores=None,
mlvl_bboxes2d=None):
"""Multi-class nms for 3D boxes.
Args:
@@ -27,10 +29,15 @@ def box3d_multiclass_nms(mlvl_bboxes,
cfg (dict): Configuration dict of NMS.
mlvl_dir_scores (torch.Tensor, optional): Multi-level scores
of direction classifier. Defaults to None.
mlvl_attr_scores (torch.Tensor, optional): Multi-level scores
of attribute classifier. Defaults to None.
mlvl_bboxes2d (torch.Tensor, optional): Multi-level 2D bounding
boxes. Defaults to None.
Returns:
tuple[torch.Tensor]: Return results after nms, including 3D \
bounding boxes, scores, labels and direction scores.
bounding boxes, scores, labels, direction scores, attribute \
scores (optional) and 2D bounding boxes (optional).
"""
# do multi class nms
# the fg class id range: [0, num_classes-1]
@@ -39,6 +46,8 @@ def box3d_multiclass_nms(mlvl_bboxes,
scores = []
labels = []
dir_scores = []
attr_scores = []
bboxes2d = []
for i in range(0, num_classes):
# get bboxes and scores of this class
cls_inds = mlvl_scores[:, i] > score_thr
@@ -65,6 +74,12 @@ def box3d_multiclass_nms(mlvl_bboxes,
if mlvl_dir_scores is not None:
_mlvl_dir_scores = mlvl_dir_scores[cls_inds]
dir_scores.append(_mlvl_dir_scores[selected])
if mlvl_attr_scores is not None:
_mlvl_attr_scores = mlvl_attr_scores[cls_inds]
attr_scores.append(_mlvl_attr_scores[selected])
if mlvl_bboxes2d is not None:
_mlvl_bboxes2d = mlvl_bboxes2d[cls_inds]
bboxes2d.append(_mlvl_bboxes2d[selected])
if bboxes:
bboxes = torch.cat(bboxes, dim=0)
@@ -72,6 +87,10 @@ def box3d_multiclass_nms(mlvl_bboxes,
labels = torch.cat(labels, dim=0)
if mlvl_dir_scores is not None:
dir_scores = torch.cat(dir_scores, dim=0)
if mlvl_attr_scores is not None:
attr_scores = torch.cat(attr_scores, dim=0)
if mlvl_bboxes2d is not None:
bboxes2d = torch.cat(bboxes2d, dim=0)
if bboxes.shape[0] > max_num:
_, inds = scores.sort(descending=True)
inds = inds[:max_num]
@@ -80,12 +99,31 @@ def box3d_multiclass_nms(mlvl_bboxes,
scores = scores[inds]
if mlvl_dir_scores is not None:
dir_scores = dir_scores[inds]
if mlvl_attr_scores is not None:
attr_scores = attr_scores[inds]
if mlvl_bboxes2d is not None:
bboxes2d = bboxes2d[inds]
else:
bboxes = mlvl_scores.new_zeros((0, mlvl_bboxes.size(-1)))
scores = mlvl_scores.new_zeros((0, ))
labels = mlvl_scores.new_zeros((0, ), dtype=torch.long)
dir_scores = mlvl_scores.new_zeros((0, ))
return bboxes, scores, labels, dir_scores
if mlvl_dir_scores is not None:
dir_scores = mlvl_scores.new_zeros((0, ))
if mlvl_attr_scores is not None:
attr_scores = mlvl_scores.new_zeros((0, ))
if mlvl_bboxes2d is not None:
bboxes2d = mlvl_scores.new_zeros((0, 4))
results = (bboxes, scores, labels)
if mlvl_dir_scores is not None:
results = results + (dir_scores, )
if mlvl_attr_scores is not None:
results = results + (attr_scores, )
if mlvl_bboxes2d is not None:
results = results + (bboxes2d, )
return results
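
The function now returns a variable-length tuple that grows with the optional inputs; a hedged sketch of the calling contract (all inputs hypothetical):

# With every optional input provided, six entries come back, in this order:
outs = box3d_multiclass_nms(
    mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_scores,
    score_thr, max_num, nms_cfg,
    mlvl_dir_scores=dir_scores,
    mlvl_attr_scores=attr_scores,
    mlvl_bboxes2d=bboxes2d)
bboxes, scores, labels, dir_scores, attr_scores, bboxes2d = outs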
def aligned_3d_nms(boxes, scores, classes, thresh):
@@ -5,6 +5,7 @@ from .custom_3d_seg import Custom3DSegDataset
from .kitti_dataset import KittiDataset
from .lyft_dataset import LyftDataset
from .nuscenes_dataset import NuScenesDataset
from .nuscenes_mono_dataset import NuScenesMonoDataset
from .pipelines import (BackgroundPointsFilter, GlobalRotScaleTrans,
IndoorPointSample, LoadAnnotations3D,
LoadPointsFromFile, LoadPointsFromMultiSweeps,
@@ -19,9 +20,9 @@ from .waymo_dataset import WaymoDataset
__all__ = [
'KittiDataset', 'GroupSampler', 'DistributedGroupSampler',
'build_dataloader', 'RepeatFactorDataset', 'DATASETS', 'build_dataset',
'CocoDataset', 'NuScenesDataset', 'LyftDataset', 'ObjectSample',
'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans', 'PointShuffle',
'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
'CocoDataset', 'NuScenesDataset', 'NuScenesMonoDataset', 'LyftDataset',
'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
'LoadPointsFromFile', 'NormalizePointsColor', 'IndoorPointSample',
'LoadAnnotations3D', 'SUNRGBDDataset', 'ScanNetDataset',
'ScanNetSegDataset', 'SemanticKITTIDataset', 'Custom3DDataset',
import copy
import mmcv
import numpy as np
import pyquaternion
import tempfile
import torch
from nuscenes.utils.data_classes import Box as NuScenesBox
from os import path as osp
from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr
from mmdet.datasets import DATASETS, CocoDataset
from ..core.bbox import CameraInstance3DBoxes, get_box_type
@DATASETS.register_module()
class NuScenesMonoDataset(CocoDataset):
r"""Monocular 3D detection on NuScenes Dataset.
This class serves as the API for experiments on the NuScenes Dataset.
Please refer to `NuScenes Dataset <https://www.nuscenes.org/download>`_
for data downloading.
Args:
ann_file (str): Path of annotation file.
data_root (str): Path of dataset root.
load_interval (int, optional): Interval of loading the dataset. It is
used to uniformly sample the dataset. Defaults to 1.
with_velocity (bool, optional): Whether include velocity prediction
into the experiments. Defaults to True.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
box_type_3d (str, optional): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
in its original format and then convert it to `box_type_3d`.
Defaults to 'Camera' in this class. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
eval_version (str, optional): Configuration version of evaluation.
Defaults to 'detection_cvpr_2019'.
use_valid_flag (bool): Whether to use the `use_valid_flag` key in the
info file as a mask to filter `gt_boxes` and `gt_names`. Defaults to False.
version (str, optional): Dataset version. Defaults to 'v1.0-trainval'.
"""
CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
DefaultAttribute = {
'car': 'vehicle.parked',
'pedestrian': 'pedestrian.moving',
'trailer': 'vehicle.parked',
'truck': 'vehicle.parked',
'bus': 'vehicle.moving',
'motorcycle': 'cycle.without_rider',
'construction_vehicle': 'vehicle.parked',
'bicycle': 'cycle.without_rider',
'barrier': '',
'traffic_cone': '',
}
def __init__(self,
data_root,
load_interval=1,
with_velocity=True,
modality=None,
box_type_3d='Camera',
eval_version='detection_cvpr_2019',
use_valid_flag=False,
version='v1.0-trainval',
**kwargs):
super().__init__(**kwargs)
self.data_root = data_root
self.load_interval = load_interval
self.with_velocity = with_velocity
self.modality = modality
self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
self.eval_version = eval_version
self.use_valid_flag = use_valid_flag
self.bbox_code_size = 9
self.version = version
if self.eval_version is not None:
from nuscenes.eval.detection.config import config_factory
self.eval_detection_configs = config_factory(self.eval_version)
if self.modality is None:
self.modality = dict(
use_camera=True,
use_lidar=False,
use_radar=False,
use_map=False,
use_external=False)
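
A hedged config-style sketch of how the dataset could be wired into a config (paths and pipeline are placeholders; the annotation filename follows the `_mono3d.coco.json` suffix produced by the converter changes below):

data = dict(
    train=dict(
        type='NuScenesMonoDataset',
        data_root='data/nuscenes/',
        ann_file='data/nuscenes/nuscenes_infos_train_mono3d.coco.json',
        img_prefix='data/nuscenes/',
        box_type_3d='Camera',
        pipeline=train_pipeline))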
def pre_pipeline(self, results):
"""Initialization before data preparation.
Args:
results (dict): Dict before data preprocessing.
- img_fields (list): Image fields.
- bbox3d_fields (list): 3D bounding boxes fields.
- pts_mask_fields (list): Mask fields of points.
- pts_seg_fields (list): Mask fields of point segments.
- bbox_fields (list): Fields of bounding boxes.
- mask_fields (list): Fields of masks.
- seg_fields (list): Segment fields.
- box_type_3d (str): 3D box type.
- box_mode_3d (str): 3D box mode.
"""
results['img_prefix'] = self.img_prefix
results['seg_prefix'] = self.seg_prefix
results['proposal_file'] = self.proposal_file
results['img_fields'] = []
results['bbox3d_fields'] = []
results['pts_mask_fields'] = []
results['pts_seg_fields'] = []
results['bbox_fields'] = []
results['mask_fields'] = []
results['seg_fields'] = []
results['box_type_3d'] = self.box_type_3d
results['box_mode_3d'] = self.box_mode_3d
def _parse_ann_info(self, img_info, ann_info):
"""Parse bbox annotation.
Args:
img_info (list[dict]): Image info.
ann_info (list[dict]): Annotation info of an image.
Returns:
dict: A dict containing the following keys: bboxes, labels, \
gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, \
depths, bboxes_ignore, masks, seg_map
"""
gt_bboxes = []
gt_labels = []
attr_labels = []
gt_bboxes_ignore = []
gt_masks_ann = []
gt_bboxes_cam3d = []
centers2d = []
depths = []
for i, ann in enumerate(ann_info):
if ann.get('ignore', False):
continue
x1, y1, w, h = ann['bbox']
inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
if inter_w * inter_h == 0:
continue
if ann['area'] <= 0 or w < 1 or h < 1:
continue
if ann['category_id'] not in self.cat_ids:
continue
bbox = [x1, y1, x1 + w, y1 + h]
if ann.get('iscrowd', False):
gt_bboxes_ignore.append(bbox)
else:
gt_bboxes.append(bbox)
gt_labels.append(self.cat2label[ann['category_id']])
attr_labels.append(ann['attribute_id'])
gt_masks_ann.append(ann.get('segmentation', None))
# 3D annotations in camera coordinates
bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(1, -1)
# change orientation to local yaw
bbox_cam3d[0, 6] = -np.arctan2(
bbox_cam3d[0, 0], bbox_cam3d[0, 2]) + bbox_cam3d[0, 6]
velo_cam3d = np.array(ann['velo_cam3d']).reshape(1, 2)
nan_mask = np.isnan(velo_cam3d[:, 0])
velo_cam3d[nan_mask] = [0.0, 0.0]
bbox_cam3d = np.concatenate([bbox_cam3d, velo_cam3d], axis=-1)
gt_bboxes_cam3d.append(bbox_cam3d.squeeze())
# 2.5D annotations in camera coordinates
center2d = ann['center2d'][:2]
depth = ann['center2d'][2]
centers2d.append(center2d)
depths.append(depth)
if gt_bboxes:
gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
gt_labels = np.array(gt_labels, dtype=np.int64)
attr_labels = np.array(attr_labels, dtype=np.int64)
else:
gt_bboxes = np.zeros((0, 4), dtype=np.float32)
gt_labels = np.array([], dtype=np.int64)
attr_labels = np.array([], dtype=np.int64)
if gt_bboxes_cam3d:
gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32)
centers2d = np.array(centers2d, dtype=np.float32)
depths = np.array(depths, dtype=np.float32)
else:
gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size),
dtype=np.float32)
centers2d = np.zeros((0, 2), dtype=np.float32)
depths = np.zeros((0), dtype=np.float32)
gt_bboxes_cam3d = CameraInstance3DBoxes(
gt_bboxes_cam3d,
box_dim=gt_bboxes_cam3d.shape[-1],
origin=(0.5, 0.5, 0.5))
gt_labels_3d = copy.deepcopy(gt_labels)
if gt_bboxes_ignore:
gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
else:
gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
seg_map = img_info['filename'].replace('jpg', 'png')
ann = dict(
bboxes=gt_bboxes,
labels=gt_labels,
gt_bboxes_3d=gt_bboxes_cam3d,
gt_labels_3d=gt_labels_3d,
attr_labels=attr_labels,
centers2d=centers2d,
depths=depths,
bboxes_ignore=gt_bboxes_ignore,
masks=gt_masks_ann,
seg_map=seg_map)
return ann
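
A quick check of the local-yaw conversion above, which offsets the global yaw by the viewing angle -arctan2(x, z) so that the stored orientation is relative to the ray from the camera to the object:

import numpy as np

# Hypothetical object 45 degrees off the optical axis with global yaw 0.
x, z, global_yaw = 5.0, 5.0, 0.0
local_yaw = -np.arctan2(x, z) + global_yaw
assert np.isclose(local_yaw, -np.pi / 4)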
def get_attr_name(self, attr_idx, label_name):
"""Get attribute from predicted index.
This is a workaround to predict attribute when the predicted velocity
is not reliable. We map the predicted attribute index to the one
in the attribute set. If it is consistent with the category, we will
keep it. Otherwise, we will use the default attribute.
Args:
attr_idx (int): Attribute index.
label_name (str): Predicted category name.
Returns:
str: Predicted attribute name.
"""
# TODO: Simplify the variable name
AttrMapping_rev2 = [
'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving',
'pedestrian.standing', 'pedestrian.sitting_lying_down',
'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None'
]
if label_name in ('car', 'bus', 'truck', 'trailer', 'construction_vehicle'):
    if AttrMapping_rev2[attr_idx] in ('vehicle.moving', 'vehicle.parked',
                                      'vehicle.stopped'):
        return AttrMapping_rev2[attr_idx]
elif label_name == 'pedestrian':
    if AttrMapping_rev2[attr_idx] in ('pedestrian.moving',
                                      'pedestrian.standing',
                                      'pedestrian.sitting_lying_down'):
        return AttrMapping_rev2[attr_idx]
elif label_name in ('bicycle', 'motorcycle'):
    if AttrMapping_rev2[attr_idx] in ('cycle.with_rider',
                                      'cycle.without_rider'):
        return AttrMapping_rev2[attr_idx]
return NuScenesMonoDataset.DefaultAttribute[label_name]
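
For example (with `dataset` an instance of this class), attribute index 5 maps to 'vehicle.moving', which is consistent with 'car' but not with 'traffic_cone':

dataset.get_attr_name(5, 'car')           # -> 'vehicle.moving'
dataset.get_attr_name(5, 'traffic_cone')  # -> '' (the default attribute)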
def _format_bbox(self, results, jsonfile_prefix=None):
"""Convert the results to the standard format.
Args:
results (list[dict]): Testing results of the dataset.
jsonfile_prefix (str): The prefix of the output jsonfile.
You can specify the output directory/filename by
modifying the jsonfile_prefix. Default: None.
Returns:
str: Path of the output json file.
"""
nusc_annos = {}
mapped_class_names = self.CLASSES
print('Start to convert detection format...')
CAM_NUM = 6
for sample_id, det in enumerate(mmcv.track_iter_progress(results)):
if sample_id % CAM_NUM == 0:
boxes_per_frame = []
attrs_per_frame = []
# need to merge results from images of the same sample
annos = []
boxes, attrs = output_to_nusc_box(det)
sample_token = self.data_infos[sample_id]['token']
boxes, attrs = cam_nusc_box_to_global(self.data_infos[sample_id],
boxes, attrs,
mapped_class_names,
self.eval_detection_configs,
self.eval_version)
boxes_per_frame.extend(boxes)
attrs_per_frame.extend(attrs)
# Remove redundant predictions caused by overlap of images
if (sample_id + 1) % CAM_NUM != 0:
continue
boxes = global_nusc_box_to_cam(
self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame,
mapped_class_names, self.eval_detection_configs,
self.eval_version)
cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes)
# box nms 3d over 6 images in a frame
# TODO: move this global setting into config
nms_cfg = dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=4096,
nms_thr=0.05,
score_thr=0.01,
min_bbox_size=0,
max_per_frame=500)
from mmcv import Config
nms_cfg = Config(nms_cfg)
cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev)
boxes3d = cam_boxes3d.tensor
# generate attr scores from attr labels
attrs = labels.new_tensor([attr for attr in attrs_per_frame])
boxes3d, scores, labels, attrs = box3d_multiclass_nms(
boxes3d,
cam_boxes3d_for_nms,
scores,
nms_cfg.score_thr,
nms_cfg.max_per_frame,
nms_cfg,
mlvl_attr_scores=attrs)
cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9)
det = bbox3d2result(cam_boxes3d, scores, labels, attrs)
boxes, attrs = output_to_nusc_box(det)
boxes, attrs = cam_nusc_box_to_global(
self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs,
mapped_class_names, self.eval_detection_configs,
self.eval_version)
for i, box in enumerate(boxes):
name = mapped_class_names[box.label]
attr = self.get_attr_name(attrs[i], name)
nusc_anno = dict(
sample_token=sample_token,
translation=box.center.tolist(),
size=box.wlh.tolist(),
rotation=box.orientation.elements.tolist(),
velocity=box.velocity[:2].tolist(),
detection_name=name,
detection_score=box.score,
attribute_name=attr)
annos.append(nusc_anno)
# other views results of the same frame should be concatenated
if sample_token in nusc_annos:
nusc_annos[sample_token].extend(annos)
else:
nusc_annos[sample_token] = annos
nusc_submissions = {
'meta': self.modality,
'results': nusc_annos,
}
mmcv.mkdir_or_exist(jsonfile_prefix)
res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
print('Results written to', res_path)
mmcv.dump(nusc_submissions, res_path)
return res_path
def _evaluate_single(self,
result_path,
logger=None,
metric='bbox',
result_name='img_bbox'):
"""Evaluation for a single model in nuScenes protocol.
Args:
result_path (str): Path of the result file.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
metric (str): Metric name used for evaluation. Default: 'bbox'.
result_name (str): Result name in the metric prefix.
Default: 'img_bbox'.
Returns:
dict: Dictionary of evaluation details.
"""
from nuscenes import NuScenes
from nuscenes.eval.detection.evaluate import NuScenesEval
output_dir = osp.join(*osp.split(result_path)[:-1])
nusc = NuScenes(
version=self.version, dataroot=self.data_root, verbose=False)
eval_set_map = {
'v1.0-mini': 'mini_val',
'v1.0-trainval': 'val',
}
nusc_eval = NuScenesEval(
nusc,
config=self.eval_detection_configs,
result_path=result_path,
eval_set=eval_set_map[self.version],
output_dir=output_dir,
verbose=False)
nusc_eval.main(render_curves=True)
# record metrics
metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json'))
detail = dict()
metric_prefix = f'{result_name}_NuScenes'
for name in self.CLASSES:
for k, v in metrics['label_aps'][name].items():
val = float('{:.4f}'.format(v))
detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
for k, v in metrics['label_tp_errors'][name].items():
val = float('{:.4f}'.format(v))
detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
return detail
def format_results(self, results, jsonfile_prefix=None, **kwargs):
"""Format the results to json (standard format for COCO evaluation).
Args:
results (list[tuple | numpy.ndarray]): Testing results of the
dataset.
jsonfile_prefix (str | None): The prefix of json files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
Returns:
tuple: (result_files, tmp_dir), result_files is a dict containing \
the json filepaths, tmp_dir is the temporary directory created \
for saving json files when jsonfile_prefix is not specified.
"""
assert isinstance(results, list), 'results must be a list'
assert len(results) == len(self), (
'The length of results is not equal to the dataset len: {} != {}'.
format(len(results), len(self)))
if jsonfile_prefix is None:
tmp_dir = tempfile.TemporaryDirectory()
jsonfile_prefix = osp.join(tmp_dir.name, 'results')
else:
tmp_dir = None
if not isinstance(results[0], dict):
result_files = self._format_bbox(results, jsonfile_prefix)
else:
result_files = dict()
for name in results[0]:
# not evaluate 2D predictions on nuScenes
if '2d' in name:
continue
print(f'\nFormatting bboxes of {name}')
results_ = [out[name] for out in results]
tmp_file_ = osp.join(jsonfile_prefix, name)
result_files.update(
{name: self._format_bbox(results_, tmp_file_)})
return result_files, tmp_dir
def evaluate(self,
results,
metric='bbox',
logger=None,
jsonfile_prefix=None,
result_names=['img_bbox'],
show=False,
out_dir=None):
"""Evaluation in nuScenes protocol.
Args:
results (list[dict]): Testing results of the dataset.
metric (str | list[str]): Metrics to be evaluated.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
jsonfile_prefix (str | None): The prefix of json files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
show (bool): Whether to visualize.
Default: False.
out_dir (str): Path to save the visualization results.
Default: None.
Returns:
dict[str, float]: Results of each evaluation metric.
"""
result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
if isinstance(result_files, dict):
results_dict = dict()
for name in result_names:
print('Evaluating bboxes of {}'.format(name))
ret_dict = self._evaluate_single(result_files[name])
results_dict.update(ret_dict)
elif isinstance(result_files, str):
results_dict = self._evaluate_single(result_files)
if tmp_dir is not None:
tmp_dir.cleanup()
if show:
self.show(results, out_dir)
return results_dict
def show(self, results, out_dir):
"""Results visualization.
Args:
results (list[dict]): List of bounding boxes results.
out_dir (str): Output directory of visualization result.
"""
# TODO: support mono3d visualization
pass
def output_to_nusc_box(detection):
"""Convert the output to the box class in the nuScenes.
Args:
detection (dict): Detection results.
- boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
- scores_3d (torch.Tensor): Detection scores.
- labels_3d (torch.Tensor): Predicted box labels.
- attrs_3d (torch.Tensor, optional): Predicted attributes.
Returns:
tuple (list[:obj:`NuScenesBox`], np.ndarray | None): List of standard NuScenesBoxes and the corresponding attribute labels (None if no attributes are predicted).
"""
box3d = detection['boxes_3d']
scores = detection['scores_3d'].numpy()
labels = detection['labels_3d'].numpy()
attrs = None
if 'attrs_3d' in detection:
attrs = detection['attrs_3d'].numpy()
box_gravity_center = box3d.gravity_center.numpy()
box_dims = box3d.dims.numpy()
box_yaw = box3d.yaw.numpy()
box_list = []
for i in range(len(box3d)):
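# q1 encodes the box yaw; q2 rotates 90 degrees about the camera x-axis so
# that the yaw axis lines up with the vertical axis NuScenesBox expects.
# The velocity lies in the camera x-z (ground) plane, hence the zero middle
# component.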
q1 = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2)
quat = q2 * q1
velocity = (box3d.tensor[i, 7], 0.0, box3d.tensor[i, 8])
box = NuScenesBox(
box_gravity_center[i],
box_dims[i],
quat,
label=labels[i],
score=scores[i],
velocity=velocity)
box_list.append(box)
return box_list, attrs
def cam_nusc_box_to_global(info,
boxes,
attrs,
classes,
eval_configs,
eval_version='detection_cvpr_2019'):
"""Convert the box from camera to global coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
attrs (list[int]): Attribute labels corresponding to `boxes`.
classes (list[str]): Mapped classes in the evaluation.
eval_configs (object): Evaluation configuration object.
eval_version (str): Evaluation version.
Default: 'detection_cvpr_2019'
Returns:
tuple (list, list): NuScenesBoxes in the global coordinate and the
attribute labels of the boxes kept after range filtering.
"""
box_list = []
attr_list = []
for (box, attr) in zip(boxes, attrs):
# Move box to ego vehicle coord system
box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']))
box.translate(np.array(info['cam2ego_translation']))
# filter det in ego.
cls_range_map = eval_configs.class_range
radius = np.linalg.norm(box.center[:2], 2)
det_range = cls_range_map[classes[box.label]]
if radius > det_range:
continue
# Move box to global coord system
box.rotate(pyquaternion.Quaternion(info['ego2global_rotation']))
box.translate(np.array(info['ego2global_translation']))
box_list.append(box)
attr_list.append(attr)
return box_list, attr_list
def global_nusc_box_to_cam(info,
boxes,
classes,
eval_configs,
eval_version='detection_cvpr_2019'):
"""Convert the box from global to camera coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
classes (list[str]): Mapped classes in the evaluation.
eval_configs (object): Evaluation configuration object.
eval_version (str): Evaluation version.
Default: 'detection_cvpr_2019'
Returns:
list: List of standard NuScenesBoxes in the camera
coordinate.
"""
box_list = []
for box in boxes:
# Move box to ego vehicle coord system
box.translate(-np.array(info['ego2global_translation']))
box.rotate(
pyquaternion.Quaternion(info['ego2global_rotation']).inverse)
# filter det in ego.
cls_range_map = eval_configs.class_range
radius = np.linalg.norm(box.center[:2], 2)
det_range = cls_range_map[classes[box.label]]
if radius > det_range:
continue
# Move box to camera coord system
box.translate(-np.array(info['cam2ego_translation']))
box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']).inverse)
box_list.append(box)
return box_list
def nusc_box_to_cam_box3d(boxes):
"""Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`.
Args:
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
Returns:
tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor): \
Converted 3D bounding boxes, scores and labels.
"""
locs = torch.Tensor([b.center for b in boxes]).view(-1, 3)
dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3)
rots = torch.Tensor([b.orientation.yaw_pitch_roll[0]
for b in boxes]).view(-1, 1)
velocity = torch.Tensor([b.velocity[:2] for b in boxes]).view(-1, 2)
boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda()
cam_boxes3d = CameraInstance3DBoxes(
boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5))
scores = torch.Tensor([b.score for b in boxes]).cuda()
labels = torch.LongTensor([b.label for b in boxes]).cuda()
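# box3d_multiclass_nms expects per-class scores with a trailing background
# column, so scatter each box score into a one-hot (N, 10 + 1) matrix at its
# predicted label.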
nms_scores = scores.new_zeros(scores.shape[0], 10 + 1)
indices = labels.new_tensor(list(range(scores.shape[0])))
nms_scores[indices, labels] = scores
return cam_boxes3d, nms_scores, labels
@@ -51,7 +51,8 @@ class DefaultFormatBundle(object):
results['img'] = DC(to_tensor(img), stack=True)
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_labels_3d', 'pts_instance_mask', 'pts_semantic_mask'
'gt_labels_3d', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers2d', 'depths'
]:
if key not in results:
continue
@@ -134,11 +135,11 @@ class Collect3D(object):
keys,
meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
'pad_shape', 'scale_factor', 'flip',
'pcd_horizontal_flip', 'pcd_vertical_flip',
'box_mode_3d', 'box_type_3d', 'img_norm_cfg',
'rect', 'Trv2c', 'P2', 'pcd_trans', 'sample_idx',
'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
'transformation_3d_flow')):
'cam_intrinsic', 'pcd_horizontal_flip',
'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
'img_norm_cfg', 'rect', 'Trv2c', 'P2', 'pcd_trans',
'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
'pts_filename', 'transformation_3d_flow')):
self.keys = keys
self.meta_keys = meta_keys
@@ -222,6 +223,11 @@ class DefaultFormatBundle3D(DefaultFormatBundle):
if 'gt_names_3d' in results:
results['gt_names_3d'] = results['gt_names_3d'][
gt_bboxes_3d_mask]
if 'centers2d' in results:
results['centers2d'] = results['centers2d'][
gt_bboxes_3d_mask]
if 'depths' in results:
results['depths'] = results['depths'][gt_bboxes_3d_mask]
if 'gt_bboxes_mask' in results:
gt_bboxes_mask = results['gt_bboxes_mask']
if 'gt_bboxes' in results:
@@ -230,6 +236,7 @@ class DefaultFormatBundle3D(DefaultFormatBundle):
if self.with_label:
if 'gt_names' in results and len(results['gt_names']) == 0:
results['gt_labels'] = np.array([], dtype=np.int64)
results['attr_labels'] = np.array([], dtype=np.int64)
elif 'gt_names' in results and isinstance(
results['gt_names'][0], list):
# gt_labels might be a list of list in multi-view setting
@@ -3,7 +3,7 @@ import numpy as np
from mmdet3d.core.points import BasePoints, get_points_type
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import LoadAnnotations
from mmdet.datasets.pipelines import LoadAnnotations, LoadImageFromFile
@PIPELINES.register_module()
@@ -65,6 +65,30 @@ class LoadMultiViewImageFromFiles(object):
f"color_type='{self.color_type}')"
@PIPELINES.register_module()
class LoadImageFromFileMono3D(LoadImageFromFile):
"""Load an image from file in monocular 3D object detection. Compared to 2D
detection, additional camera parameters need to be loaded.
Args:
kwargs (dict): Arguments are the same as those in \
:class:`LoadImageFromFile`.
"""
def __call__(self, results):
"""Call functions to load image and get image meta information.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded image and meta information.
"""
super().__call__(results)
results['cam_intrinsic'] = results['img_info']['cam_intrinsic']
return results
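
Sketch of the loader in a mono3D pipeline, mirroring the unit test below; it behaves like `LoadImageFromFile` and additionally exposes the intrinsics, which `Collect3D` can carry through the new 'cam_intrinsic' meta key:

pipeline = [
    dict(type='LoadImageFromFileMono3D'),
    dict(
        type='LoadAnnotations3D',
        with_bbox=True,
        with_label=True,
        with_attr_label=True,
        with_bbox_3d=True,
        with_label_3d=True,
        with_bbox_depth=True),
]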
@PIPELINES.register_module()
class LoadPointsFromMultiSweeps(object):
"""Load points from multiple sweeps.
@@ -426,6 +450,8 @@ class LoadAnnotations3D(LoadAnnotations):
Defaults to True.
with_label_3d (bool, optional): Whether to load 3D labels.
Defaults to True.
with_attr_label (bool, optional): Whether to load attribute label.
Defaults to False.
with_mask_3d (bool, optional): Whether to load 3D instance masks
for points. Defaults to False.
with_seg_3d (bool, optional): Whether to load 3D semantic masks.
@@ -438,6 +464,8 @@ class LoadAnnotations3D(LoadAnnotations):
Defaults to False.
with_seg (bool, optional): Whether to load 2D semantic masks.
Defaults to False.
with_bbox_depth (bool, optional): Whether to load 2.5D boxes.
Defaults to False.
poly2mask (bool, optional): Whether to convert polygon annotations
to bitmasks. Defaults to True.
seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks.
@@ -450,12 +478,14 @@ class LoadAnnotations3D(LoadAnnotations):
def __init__(self,
with_bbox_3d=True,
with_label_3d=True,
with_attr_label=False,
with_mask_3d=False,
with_seg_3d=False,
with_bbox=False,
with_label=False,
with_mask=False,
with_seg=False,
with_bbox_depth=False,
poly2mask=True,
seg_3d_dtype='int',
file_client_args=dict(backend='disk')):
@@ -467,7 +497,9 @@ class LoadAnnotations3D(LoadAnnotations):
poly2mask,
file_client_args=file_client_args)
self.with_bbox_3d = with_bbox_3d
self.with_bbox_depth = with_bbox_depth
self.with_label_3d = with_label_3d
self.with_attr_label = with_attr_label
self.with_mask_3d = with_mask_3d
self.with_seg_3d = with_seg_3d
self.seg_3d_dtype = seg_3d_dtype
@@ -485,6 +517,19 @@ class LoadAnnotations3D(LoadAnnotations):
results['bbox3d_fields'].append('gt_bboxes_3d')
return results
def _load_bboxes_depth(self, results):
"""Private function to load 2.5D bounding box annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 2.5D bounding box annotations.
"""
results['centers2d'] = results['ann_info']['centers2d']
results['depths'] = results['ann_info']['depths']
return results
def _load_labels_3d(self, results):
"""Private function to load label annotations.
@@ -497,6 +542,18 @@ class LoadAnnotations3D(LoadAnnotations):
results['gt_labels_3d'] = results['ann_info']['gt_labels_3d']
return results
def _load_attr_labels(self, results):
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded attribute label annotations.
"""
results['attr_labels'] = results['ann_info']['attr_labels']
return results
def _load_masks_3d(self, results):
"""Private function to load 3D mask annotations.
@@ -564,8 +621,14 @@ class LoadAnnotations3D(LoadAnnotations):
results = self._load_bboxes_3d(results)
if results is None:
return None
if self.with_bbox_depth:
results = self._load_bboxes_depth(results)
if results is None:
return None
if self.with_label_3d:
results = self._load_labels_3d(results)
if self.with_attr_label:
results = self._load_attr_labels(results)
if self.with_mask_3d:
results = self._load_masks_3d(results)
if self.with_seg_3d:
@@ -579,11 +642,13 @@ class LoadAnnotations3D(LoadAnnotations):
repr_str = self.__class__.__name__ + '(\n'
repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, '
repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, '
repr_str += f'{indent_str}with_attr_label={self.with_attr_label}, '
repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, '
repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, '
repr_str += f'{indent_str}with_bbox={self.with_bbox}, '
repr_str += f'{indent_str}with_label={self.with_label}, '
repr_str += f'{indent_str}with_mask={self.with_mask}, '
repr_str += f'{indent_str}with_seg={self.with_seg}, '
repr_str += f'{indent_str}with_bbox_depth={self.with_bbox_depth}, '
repr_str += f'{indent_str}poly2mask={self.poly2mask})'
return repr_str
@@ -65,8 +65,17 @@ class RandomFlip3D(RandomFlip):
np.array([], dtype=np.float32))
assert len(input_dict['bbox3d_fields']) == 1
for key in input_dict['bbox3d_fields']:
input_dict['points'] = input_dict[key].flip(
direction, points=input_dict['points'])
if 'points' in input_dict:
input_dict['points'] = input_dict[key].flip(
direction, points=input_dict['points'])
else:
input_dict[key].flip(direction)
if 'centers2d' in input_dict:
assert self.sync_2d is True and direction == 'horizontal', \
'Only support sync_2d=True and horizontal flip with images'
w = input_dict['img_shape'][1]
input_dict['centers2d'][..., 0] = \
w - input_dict['centers2d'][..., 0]
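
A quick numeric check of the mirroring rule, using a sample annotation from the test data (image width 1600):

import numpy as np

w = 1600
centers2d = np.array([[1099.391, 544.636]])
centers2d[..., 0] = w - centers2d[..., 0]
# -> [[500.609, 544.636]], matching expected_centers2d in the unit test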
def __call__(self, input_dict):
"""Call function to flip points, values in the ``bbox3d_fields`` and \
{"images": [{"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "id": "86e6806d626b4711a6d0f5015b090116", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.6924185592174665, -0.7031619420114925, -0.11648342771943819, 0.11203317912370753], "cam2ego_translation": [1.03569100218, 0.484795032713, 1.59097014818], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1256.7414812095406, 0.0, 792.1125740759628], [0.0, 1256.7414812095406, 492.7757465151356], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_FRONT/n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg", "id": "020d7b4f858147558106c504f7f31bef", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.4998015430569128, -0.5030316162024876, 0.4997798114386805, -0.49737083824542755], "cam2ego_translation": [1.70079118954, 0.0159456324149, 1.51095763913], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1266.417203046554, 0.0, 816.2670197447984], [0.0, 1266.417203046554, 491.50706579294757], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_FRONT_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_RIGHT__1531883530420339.jpg", "id": "16d39ff22a8545b0a4ee3236a0fe1c20", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.2060347966337182, -0.2026940577919598, 0.6824507824531167, -0.6713610884174485], "cam2ego_translation": [1.5508477543, -0.493404796419, 1.49574800619], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1260.8474446004698, 0.0, 807.968244525554], [0.0, 1260.8474446004698, 495.3344268742088], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "id": "24332e9c554a406f880430f17771b608", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.6757265034669446, -0.6736266522251881, 0.21214015046209478, -0.21122827103904068], "cam2ego_translation": [1.52387798135, 0.494631336551, 1.50932822144], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1272.5979470598488, 0.0, 826.6154927353808], [0.0, 1272.5979470598488, 479.75165386361925], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_BACK/n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg", "id": "aab35aeccbda42de82b2ff5c278a0d48", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.5037872666382278, -0.49740249788611096, -0.4941850223835201, 0.5045496097725578], "cam2ego_translation": [0.0283260309358, 0.00345136761476, 1.57910346144], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[809.2209905677063, 0.0, 829.2196003259838], [0.0, 809.2209905677063, 481.77842384512485], [0.0, 0.0, 1.0]], "width": 1600, 
"height": 900}, {"file_name": "samples/CAM_BACK_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg", "id": "ec7096278e484c9ebe6894a2ad5682e9", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.12280980120078765, -0.132400842670559, -0.7004305821388234, 0.690496031265798], "cam2ego_translation": [1.0148780988, -0.480568219723, 1.56239545128], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1259.5137405846733, 0.0, 807.2529053838625], [0.0, 1259.5137405846733, 501.19579884916527], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}], "annotations": [{"file_name": "samples/CAM_FRONT/n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg", "image_id": "020d7b4f858147558106c504f7f31bef", "area": 85383.89600714693, "category_name": "truck", "category_id": 1, "bbox": [0.0, 357.732750319127, 342.56437261895206, 249.24920053528984], "iscrowd": 0, "bbox_cam3d": [-10.356295829208502, -0.06394600736590471, 18.785737229926998, 2.312, 7.516, 3.093, -0.5996975863361309], "velo_cam3d": [0.05742557272436208, 0.06990201482350666], "center2d": [118.11016609440316, 487.19622492451936, 18.785737229926998], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 0}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "image_id": "24332e9c554a406f880430f17771b608", "area": 76274.38331683438, "category_name": "truck", "category_id": 1, "bbox": [1305.1296604171719, 350.75901341602525, 294.87033958282814, 258.6709243959383], "iscrowd": 0, "bbox_cam3d": [9.795917040815693, 0.07538275380197612, 19.033148401567978, 2.312, 7.516, 3.093, -1.5546044317874126], "velo_cam3d": [0.09022854769195846, -0.0065096147400431695], "center2d": [1481.5919397578637, 484.79190972187814, 19.033148401567978], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 1}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "image_id": "24332e9c554a406f880430f17771b608", "area": 5248.9339273703135, "category_name": "truck", "category_id": 1, "bbox": [808.1218983320856, 436.2076328554, 75.28483638734929, 69.72099800235912], "iscrowd": 0, "bbox_cam3d": [0.7896581102503435, -0.32866532307883706, 58.48166239420381, 2.877, 6.372, 2.978, 1.641180695066564], "velo_cam3d": [0.009938485543455734, 0.0010084200213775884], "center2d": [843.7989524532317, 472.5996886441534, 58.48166239420381], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 2}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "image_id": "24332e9c554a406f880430f17771b608", "area": 25266.816070927107, "category_name": "truck", "category_id": 1, "bbox": [1133.5883785276196, 424.4436001005383, 202.5256666350731, 124.75858734712807], "iscrowd": 0, "bbox_cam3d": [9.39338221449255, 0.19762751304835102, 30.01455814405707, 2.156, 6.227, 2.601, -1.4587684025759116], "velo_cam3d": [0.0, 0.0], "center2d": [1224.88885277412, 488.1309332180172, 30.01455814405707], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 3}, {"file_name": "samples/CAM_BACK/n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg", "image_id": "aab35aeccbda42de82b2ff5c278a0d48", "area": 31981.88483023472, "category_name": "car", "category_id": 0, 
"bbox": [652.8710695836726, 487.2457293359287, 256.3734471348506, 124.74725907715583], "iscrowd": 0, "bbox_cam3d": [-0.48041137691585667, 0.8426032188612489, 12.27160016308813, 1.871, 4.478, 1.456, -2.0402647554154876], "velo_cam3d": [-2.4043357184501866, -4.232358489028598], "center2d": [797.5400340802389, 537.3418550489371, 12.27160016308813], "attribute_name": "vehicle.moving", "attribute_id": 5, "segmentation": [], "id": 4}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 1874.1656394574547, "category_name": "traffic_cone", "category_id": 8, "bbox": [1084.536273989852, 513.7567766430512, 30.043100006470013, 62.382565016720605], "iscrowd": 0, "bbox_cam3d": [3.745641322414848, 0.6321604510604618, 15.319339525420224, 0.3, 0.291, 0.734, 1.4550554479430875], "velo_cam3d": [0.028202672296939114, -0.001622377193634249], "center2d": [1099.3910188026568, 544.635832278593, 15.319339525420224], "attribute_name": "None", "attribute_id": 8, "segmentation": [], "id": 5}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 1641.3529623313364, "category_name": "traffic_cone", "category_id": 8, "bbox": [823.5058461203419, 512.0451382733748, 27.545987206560085, 59.58591899514306], "iscrowd": 0, "bbox_cam3d": [0.558956408408079, 0.6054486006477211, 15.607344275188172, 0.315, 0.338, 0.712, 1.5596704833049395], "velo_cam3d": [0.07717355032092023, -0.0013264953734539453], "center2d": [837.1211093045397, 541.5279466177432, 15.607344275188172], "attribute_name": "None", "attribute_id": 8, "segmentation": [], "id": 6}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 11464.868967812941, "category_name": "pedestrian", "category_id": 7, "bbox": [1091.57108913607, 427.8805195896188, 76.29701915190844, 150.2662763926101], "iscrowd": 0, "bbox_cam3d": [3.953820859983739, 0.11100574170732268, 14.75668416993455, 0.739, 0.563, 1.711, 1.4550554479430875], "velo_cam3d": [0.10262495353364391, -0.0064695610507391095], "center2d": [1128.8366393735657, 502.22946380348515, 14.75668416993455], "attribute_name": "pedestrian.sitting_lying_down", "attribute_id": 4, "segmentation": [], "id": 7}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 10887.814254422945, "category_name": "pedestrian", "category_id": 7, "bbox": [1160.5755663065963, 427.76823935577545, 72.04645850373822, 151.1221298109749], "iscrowd": 0, "bbox_cam3d": [4.7798492054669035, 0.1162134030605403, 14.880252178422799, 0.665, 0.544, 1.739, 1.4550554479430875], "velo_cam3d": [0.08665208940588605, -0.12554131041835265], "center2d": [1195.8043058026105, 502.5907820768639, 14.880252178422799], "attribute_name": "pedestrian.sitting_lying_down", "attribute_id": 4, "segmentation": [], "id": 8}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 1840.396836351825, "category_name": "traffic_cone", "category_id": 8, "bbox": [976.5016497372175, 515.0039595028874, 30.627062877370918, 60.09054292018379], "iscrowd": 0, "bbox_cam3d": [2.4596094747766615, 0.6404788797338883, 15.49228428713527, 0.338, 0.309, 0.712, 
1.461625206011101], "velo_cam3d": [0.02389033738396964, -0.0027892907804445547], "center2d": [991.6372663187118, 544.7316983348808, 15.49228428713527], "attribute_name": "None", "attribute_id": 8, "segmentation": [], "id": 9}, {"file_name": "samples/CAM_BACK_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg", "image_id": "ec7096278e484c9ebe6894a2ad5682e9", "area": 130637.82232697189, "category_name": "car", "category_id": 0, "bbox": [806.290660237549, 470.86948127698895, 564.486943265249, 231.42753589888787], "iscrowd": 0, "bbox_cam3d": [2.041080764231013, 0.5400087467741127, 10.16381197333443, 1.638, 4.25, 1.44, 2.3008777344302445], "velo_cam3d": [-3.11975390859937, 4.71824099865795], "center2d": [1060.1864774468488, 568.1144351228712, 10.16381197333443], "attribute_name": "vehicle.moving", "attribute_id": 5, "segmentation": [], "id": 10}], "categories": [{"id": 0, "name": "car"}, {"id": 1, "name": "truck"}, {"id": 2, "name": "trailer"}, {"id": 3, "name": "bus"}, {"id": 4, "name": "construction_vehicle"}, {"id": 5, "name": "bicycle"}, {"id": 6, "name": "motorcycle"}, {"id": 7, "name": "pedestrian"}, {"id": 8, "name": "traffic_cone"}, {"id": 9, "name": "barrier"}]}
\ No newline at end of file
import mmcv
import numpy as np
import pytest
import torch
from mmdet3d.datasets import NuScenesMonoDataset
def test_getitem():
np.random.seed(0)
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
img_norm_cfg = dict(
mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=1.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
nus_dataset = NuScenesMonoDataset(
ann_file='tests/data/nuscenes/nus_infos_mono3d.coco.json',
pipeline=pipeline,
data_root='tests/data/nuscenes/',
img_prefix='tests/data/nuscenes/',
test_mode=False)
data = nus_dataset[0]
img_metas = data['img_metas']._data
filename = img_metas['filename']
img_shape = img_metas['img_shape']
pad_shape = img_metas['pad_shape']
flip = img_metas['flip']
bboxes = data['gt_bboxes']._data
attrs = data['attr_labels']._data
labels3d = data['gt_labels_3d']._data
labels = data['gt_labels']._data
centers2d = data['centers2d']._data
depths = data['depths']._data
expected_filename = 'tests/data/nuscenes/samples/CAM_BACK_LEFT/' + \
'n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg'
expected_img_shape = (900, 1600, 3)
expected_pad_shape = (928, 1600, 3)
expected_flip = True
expected_bboxes = torch.tensor([[485.4207, 513.7568, 515.4637, 576.1393],
[748.9482, 512.0452, 776.4941, 571.6310],
[432.1318, 427.8805, 508.4290, 578.1468],
[367.3779, 427.7682, 439.4244, 578.8904],
[592.8713, 515.0040, 623.4984, 575.0945]])
expected_attr_labels = torch.tensor([8, 8, 4, 4, 8])
expected_labels = torch.tensor([8, 8, 7, 7, 8])
expected_centers2d = torch.tensor([[500.6090, 544.6358],
[762.8789, 541.5280],
[471.1633, 502.2295],
[404.1957, 502.5908],
[608.3627, 544.7317]])
expected_depths = torch.tensor(
[15.3193, 15.6073, 14.7567, 14.8803, 15.4923])
assert filename == expected_filename
assert img_shape == expected_img_shape
assert pad_shape == expected_pad_shape
assert flip == expected_flip
assert torch.allclose(bboxes, expected_bboxes, 1e-5)
assert torch.all(attrs == expected_attr_labels)
assert torch.all(labels == expected_labels)
assert torch.all(labels3d == expected_labels)
assert torch.allclose(centers2d, expected_centers2d, 1e-5)
assert torch.allclose(depths, expected_depths, 1e-5)
def test_format_results():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
root_path = 'tests/data/nuscenes/'
ann_file = 'tests/data/nuscenes/nus_infos_mono3d.coco.json'
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
nus_dataset = NuScenesMonoDataset(
ann_file=ann_file,
pipeline=pipeline,
data_root=root_path,
test_mode=True)
results = mmcv.load('tests/data/nuscenes/mono3d_sample_results.pkl')
result_files, tmp_dir = nus_dataset.format_results(results)
result_data = mmcv.load(result_files['img_bbox'])
assert len(result_data['results'].keys()) == 1
assert len(result_data['results']['e93e98b63d3b40209056d129dc53ceee']) == 8
det = result_data['results']['e93e98b63d3b40209056d129dc53ceee'][0]
expected_token = 'e93e98b63d3b40209056d129dc53ceee'
expected_trans = torch.tensor(
[1018.753821915645, 605.190386124652, 0.7266818822266328])
expected_size = torch.tensor([1.6380000114440918, 4.25, 1.440000057220459])
expected_rotation = torch.tensor([
-0.9924980733795628, -0.013604682549109839, 0.01027292674776989,
-0.12106590736714223
])
expected_detname = 'car'
expected_attr = 'vehicle.moving'
assert det['sample_token'] == expected_token
assert torch.allclose(
torch.tensor(det['translation']), expected_trans, 1e-5)
assert torch.allclose(torch.tensor(det['size']), expected_size, 1e-5)
assert torch.allclose(
torch.tensor(det['rotation']), expected_rotation, 1e-5)
assert det['detection_name'] == expected_detname
assert det['attribute_name'] == expected_attr
@@ -130,10 +130,11 @@ def test_load_annotations3D():
scannet_pts_semantic_mask = scannet_results['pts_semantic_mask']
repr_str = repr(scannet_load_annotations3D)
expected_repr_str = 'LoadAnnotations3D(\n with_bbox_3d=True, ' \
'with_label_3d=True, with_mask_3d=True, ' \
'with_seg_3d=True, with_bbox=False, ' \
'with_label=False, with_mask=False, ' \
'with_seg=False, poly2mask=True)'
'with_label_3d=True, with_attr_label=False, ' \
'with_mask_3d=True, with_seg_3d=True, ' \
'with_bbox=False, with_label=False, ' \
'with_mask=False, with_seg=False, ' \
'with_bbox_depth=False, poly2mask=True)'
assert repr_str == expected_repr_str
assert scannet_gt_boxes.tensor.shape == (27, 7)
assert scannet_gt_labels.shape == (27, )
@@ -55,6 +55,9 @@ def nuscenes_data_prep(root_path,
root_path, info_prefix, version=version, max_sweeps=max_sweeps)
if version == 'v1.0-test':
info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl')
nuscenes_converter.export_2d_annotation(
root_path, info_test_path, version=version)
return
info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl')
@@ -9,12 +9,18 @@ from pyquaternion import Quaternion
from shapely.geometry import MultiPoint, box
from typing import List, Tuple, Union
from mmdet3d.core.bbox.box_np_ops import points_cam2img
from mmdet3d.datasets import NuScenesDataset
nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
'pedestrian.moving', 'pedestrian.standing',
'pedestrian.sitting_lying_down', 'vehicle.moving',
'vehicle.parked', 'vehicle.stopped', 'None')
def create_nuscenes_infos(root_path,
info_prefix,
@@ -324,13 +330,14 @@ def obtain_sensor2top(nusc,
return sweep
def export_2d_annotation(root_path, info_path, version):
def export_2d_annotation(root_path, info_path, version, mono3d=True):
"""Export 2d annotation from the info file and raw data.
Args:
root_path (str): Root path of the raw data.
info_path (str): Path of the info file.
version (str): Dataset version.
mono3d (bool): Whether to export mono3d annotation. Default: True.
"""
# get bbox annotations for camera
camera_types = [
@@ -356,12 +363,20 @@ def export_2d_annotation(root_path, info_path, version):
coco_infos = get_2d_boxes(
nusc,
cam_info['sample_data_token'],
visibilities=['', '1', '2', '3', '4'])
visibilities=['', '1', '2', '3', '4'],
mono3d=mono3d)
(height, width, _) = mmcv.imread(cam_info['data_path']).shape
coco_2d_dict['images'].append(
dict(
file_name=cam_info['data_path'],
file_name=cam_info['data_path'].split('data/nuscenes/')
[-1],
id=cam_info['sample_data_token'],
token=info['token'],
cam2ego_rotation=cam_info['sensor2ego_rotation'],
cam2ego_translation=cam_info['sensor2ego_translation'],
ego2global_rotation=info['ego2global_rotation'],
ego2global_translation=info['ego2global_translation'],
cam_intrinsic=cam_info['cam_intrinsic'],
width=width,
height=height))
for coco_info in coco_infos:
@@ -372,16 +387,24 @@ def export_2d_annotation(root_path, info_path, version):
coco_info['id'] = coco_ann_id
coco_2d_dict['annotations'].append(coco_info)
coco_ann_id += 1
mmcv.dump(coco_2d_dict, f'{info_path[:-4]}.coco.json')
if mono3d:
json_prefix = f'{info_path[:-4]}_mono3d'
else:
json_prefix = f'{info_path[:-4]}'
mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json')
def get_2d_boxes(nusc, sample_data_token: str,
visibilities: List[str]) -> List[OrderedDict]:
def get_2d_boxes(nusc,
sample_data_token: str,
visibilities: List[str],
mono3d=True):
"""Get the 2D annotation records for a given `sample_data_token`.
Args:
sample_data_token: Sample data token belonging to a camera keyframe.
visibilities: Visibility filter.
sample_data_token (str): Sample data token belonging to a camera \
keyframe.
visibilities (list[str]): Visibility filter.
mono3d (bool): Whether to get boxes with mono3d annotation.
Return:
list[dict]: List of 2D annotation record that belongs to the input
@@ -456,6 +479,43 @@ def get_2d_boxes(nusc, sample_data_token: str,
# Generate dictionary record to be included in the .json file.
repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
sample_data_token, sd_rec['filename'])
# If mono3d=True, add 3D annotations in camera coordinates
if mono3d and (repro_rec is not None):
loc = box.center.tolist()
dim = box.wlh.tolist()
rot = [box.orientation.yaw_pitch_roll[0]]
global_velo2d = nusc.box_velocity(box.token)[:2]
global_velo3d = np.array([*global_velo2d, 0.0])
e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
cam_velo3d = global_velo3d @ np.linalg.inv(
e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
velo = cam_velo3d[0::2].tolist()
repro_rec['bbox_cam3d'] = loc + dim + rot
repro_rec['velo_cam3d'] = velo
center3d = np.array(loc).reshape([1, 3])
center2d = points_cam2img(
center3d, camera_intrinsic, with_depth=True)
repro_rec['center2d'] = center2d.squeeze().tolist()
# normalized center2d + depth
# samples with depth <= 0 will be removed
if repro_rec['center2d'][2] <= 0:
continue
ann_token = nusc.get('sample_annotation',
box.token)['attribute_tokens']
if len(ann_token) == 0:
attr_name = 'None'
else:
attr_name = nusc.get('attribute', ann_token[0])['name']
attr_id = nus_attributes.index(attr_name)
repro_rec['attribute_name'] = attr_name
repro_rec['attribute_id'] = attr_id
repro_recs.append(repro_rec)
return repro_recs
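
A small sanity check of the velocity transform above (velocities are row vectors, so each inverse rotation is applied transposed): assuming the ego frame is rotated 90 degrees about z relative to the global frame and the camera frame coincides with the ego frame, a global velocity along +x becomes -y in the camera frame.

import numpy as np
from pyquaternion import Quaternion

e2g_r_mat = Quaternion(axis=[0, 0, 1], radians=np.pi / 2).rotation_matrix
c2e_r_mat = np.eye(3)  # camera frame == ego frame, for simplicity
v_global = np.array([1.0, 0.0, 0.0])
v_cam = v_global @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
assert np.allclose(v_cam, [0.0, -1.0, 0.0])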