Unverified commit ecd0d06a authored by twang, committed by GitHub

[Feature] Support monocular 3D detection on nuScenes (#392)

* Support nuscenes mono3d json info generation

* Support nuscenes mono3d dataset class

* Support attribute and bbox2d prediction in bbox3dnms and bbox3d2result

* Rename dataset class and add comments to 'attrs'

* Support mono3d related pipelines

* Fix unittest for loading 3D annotations

* Add unit test for nuscenes mono3d dataset

* Rename the sample result file

* Upload sample data for mono3d unit test

* Upload sample data for mono3d unit test

* Upload sample image for unit test

* Delete tests/data/nuscenes/samples/LIDAR_TOP/CAM_BACK_LEFT directory

* Add files via upload

* Remove unnecessary 'f'

* Remove unnecessary \ in arguments

* Remove check for pycocotools version because it is already done in CocoDataset

* Remove unnecessary comma, add TODO and change init of attrs in format_results

* Merge RandomFlip3D and RandomFlipMono3D

* Add pytest to check whether cuda is available in the unit test

* Add visualization TODO

* Remove useless init in loading mono3d images
parent 3a5a2010
@@ -335,21 +335,40 @@ def rotation_points_single_angle(points, angle, axis=0):
return points @ rot_mat_T, rot_mat_T
def points_cam2img(points_3d, proj_mat):
def points_cam2img(points_3d, proj_mat, with_depth=False):
"""Project points in camera coordinates to image coordinates.
Args:
points_3d (np.ndarray): Points in shape (N, 3)
proj_mat (np.ndarray): Projection matrix from camera to image coordinates, of shape 3x3, 3x4 or 4x4.
with_depth (bool): Whether to keep depth in the output.
Returns:
np.ndarray: Points in image coordinates with shape [N, 2]. If ``with_depth`` is True, the depth is appended as a third column, giving shape [N, 3].
"""
points_shape = list(points_3d.shape)
points_shape[-1] = 1
assert len(proj_mat.shape) == 2, 'The dimension of the projection'\
f' matrix should be 2 instead of {len(proj_mat.shape)}.'
d1, d2 = proj_mat.shape[:2]
assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
d1 == 4 and d2 == 4), 'The shape of the projection matrix'\
f' ({d1}*{d2}) is not supported.'
if d1 == 3:
proj_mat_expanded = np.eye(4, dtype=proj_mat.dtype)
proj_mat_expanded[:d1, :d2] = proj_mat
proj_mat = proj_mat_expanded
points_4 = np.concatenate([points_3d, np.ones(points_shape)], axis=-1)
point_2d = points_4 @ proj_mat.T
point_2d_res = point_2d[..., :2] / point_2d[..., 2:3]
if with_depth:
points_2d_depth = np.concatenate([point_2d_res, point_2d[..., 2:3]],
axis=-1)
return points_2d_depth
return point_2d_res
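
A minimal usage sketch of the extended NumPy version, using the CAM_FRONT intrinsic matrix from the sample info file below (the import path matches the one used in nuscenes_converter.py):

import numpy as np
from mmdet3d.core.bbox.box_np_ops import points_cam2img

# 3x3 camera intrinsic; 3x4 and 4x4 projection matrices are also accepted
K = np.array([[1266.417, 0.0, 816.267],
              [0.0, 1266.417, 491.507],
              [0.0, 0.0, 1.0]])
pts = np.array([[0.0, 0.0, 10.0]])  # one point 10 m straight ahead of the camera

uv = points_cam2img(pts, K)                    # [[816.267, 491.507]]
uvd = points_cam2img(pts, K, with_depth=True)  # [[816.267, 491.507, 10.0]]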
@@ -124,11 +124,11 @@ def points_cam2img(points_3d, proj_mat):
points_num = list(points_3d.shape)[:-1]
points_shape = np.concatenate([points_num, [1]], axis=0).tolist()
assert len(proj_mat.shape) == 2, f'The dimension of the projection'\
f'matrix should be 2 instead of {len(proj_mat.shape)}.'
assert len(proj_mat.shape) == 2, 'The dimension of the projection'\
f' matrix should be 2 instead of {len(proj_mat.shape)}.'
d1, d2 = proj_mat.shape[:2]
assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
d1 == 4 and d2 == 4), f'The shape of the projection matrix'\
d1 == 4 and d2 == 4), 'The shape of the projection matrix'\
f' ({d1}*{d2}) is not supported.'
if d1 == 3:
proj_mat_expanded = torch.eye(
@@ -46,13 +46,15 @@ def bbox3d2roi(bbox_list):
return rois
def bbox3d2result(bboxes, scores, labels):
def bbox3d2result(bboxes, scores, labels, attrs=None):
"""Convert detection results to a list of numpy arrays.
Args:
bboxes (torch.Tensor): Bounding boxes with shape of (n, 5).
labels (torch.Tensor): Labels with shape of (n, ).
scores (torch.Tensor): Scores with shape of (n, ).
attrs (torch.Tensor, optional): Attributes with shape of (n, ). \
Defaults to None.
Returns:
dict[str, torch.Tensor]: Bounding box results in cpu mode.
@@ -60,8 +62,14 @@ def bbox3d2result(bboxes, scores, labels):
- boxes_3d (torch.Tensor): 3D boxes.
- scores (torch.Tensor): Prediction scores.
- labels_3d (torch.Tensor): Box labels.
- attrs_3d (torch.Tensor, optional): Box attributes.
"""
return dict(
result_dict = dict(
boxes_3d=bboxes.to('cpu'),
scores_3d=scores.cpu(),
labels_3d=labels.cpu())
if attrs is not None:
result_dict['attrs_3d'] = attrs.cpu()
return result_dict
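
A hedged sketch of packing mono3D head outputs with the new attrs argument; the tensors are illustrative and the import path is assumed to be the usual mmdet3d.core re-export:

import torch
from mmdet3d.core import bbox3d2result  # assumed re-export path

boxes = torch.rand(2, 9)       # illustrative 3D boxes with velocity
scores = torch.rand(2)
labels = torch.tensor([0, 7])  # e.g. 'car', 'pedestrian'
attrs = torch.tensor([5, 4])   # attribute ids, e.g. 'vehicle.moving'

res = bbox3d2result(boxes, scores, labels, attrs=attrs)
# keys: 'boxes_3d', 'scores_3d', 'labels_3d' and, since attrs was given, 'attrs_3d'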
@@ -11,7 +11,9 @@ def box3d_multiclass_nms(mlvl_bboxes,
score_thr,
max_num,
cfg,
mlvl_dir_scores=None):
mlvl_dir_scores=None,
mlvl_attr_scores=None,
mlvl_bboxes2d=None):
"""Multi-class nms for 3D boxes.
Args:
@@ -27,10 +29,15 @@ def box3d_multiclass_nms(mlvl_bboxes,
cfg (dict): Configuration dict of NMS.
mlvl_dir_scores (torch.Tensor, optional): Multi-level scores
of direction classifier. Defaults to None.
mlvl_attr_scores (torch.Tensor, optional): Multi-level scores
of attribute classifier. Defaults to None.
mlvl_bboxes2d (torch.Tensor, optional): Multi-level 2D bounding
boxes. Defaults to None.
Returns:
tuple[torch.Tensor]: Return results after nms, including 3D \
bounding boxes, scores, labels and direction scores.
bounding boxes, scores, labels, direction scores, attribute \
scores (optional) and 2D bounding boxes (optional).
"""
# do multi class nms
# the fg class id range: [0, num_classes-1]
@@ -39,6 +46,8 @@ def box3d_multiclass_nms(mlvl_bboxes,
scores = []
labels = []
dir_scores = []
attr_scores = []
bboxes2d = []
for i in range(0, num_classes):
# get bboxes and scores of this class
cls_inds = mlvl_scores[:, i] > score_thr
@@ -65,6 +74,12 @@ def box3d_multiclass_nms(mlvl_bboxes,
if mlvl_dir_scores is not None:
_mlvl_dir_scores = mlvl_dir_scores[cls_inds]
dir_scores.append(_mlvl_dir_scores[selected])
if mlvl_attr_scores is not None:
_mlvl_attr_scores = mlvl_attr_scores[cls_inds]
attr_scores.append(_mlvl_attr_scores[selected])
if mlvl_bboxes2d is not None:
_mlvl_bboxes2d = mlvl_bboxes2d[cls_inds]
bboxes2d.append(_mlvl_bboxes2d[selected])
if bboxes:
bboxes = torch.cat(bboxes, dim=0)
@@ -72,6 +87,10 @@ def box3d_multiclass_nms(mlvl_bboxes,
labels = torch.cat(labels, dim=0)
if mlvl_dir_scores is not None:
dir_scores = torch.cat(dir_scores, dim=0)
if mlvl_attr_scores is not None:
attr_scores = torch.cat(attr_scores, dim=0)
if mlvl_bboxes2d is not None:
bboxes2d = torch.cat(bboxes2d, dim=0)
if bboxes.shape[0] > max_num:
_, inds = scores.sort(descending=True)
inds = inds[:max_num]
@@ -80,12 +99,31 @@ def box3d_multiclass_nms(mlvl_bboxes,
scores = scores[inds]
if mlvl_dir_scores is not None:
dir_scores = dir_scores[inds]
if mlvl_attr_scores is not None:
attr_scores = attr_scores[inds]
if mlvl_bboxes2d is not None:
bboxes2d = bboxes2d[inds]
else:
bboxes = mlvl_scores.new_zeros((0, mlvl_bboxes.size(-1)))
scores = mlvl_scores.new_zeros((0, ))
labels = mlvl_scores.new_zeros((0, ), dtype=torch.long)
dir_scores = mlvl_scores.new_zeros((0, ))
return bboxes, scores, labels, dir_scores
if mlvl_dir_scores is not None:
dir_scores = mlvl_scores.new_zeros((0, ))
if mlvl_attr_scores is not None:
attr_scores = mlvl_scores.new_zeros((0, ))
if mlvl_bboxes2d is not None:
bboxes2d = mlvl_scores.new_zeros((0, 4))
results = (bboxes, scores, labels)
if mlvl_dir_scores is not None:
results = results + (dir_scores, )
if mlvl_attr_scores is not None:
results = results + (attr_scores, )
if mlvl_bboxes2d is not None:
results = results + (bboxes2d, )
return results
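
Since the tuple length now depends on which optional inputs were given, callers unpack accordingly. A fragment, not a standalone script, assuming the usual positional arguments (mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_scores) precede those shown in this hunk:

results = box3d_multiclass_nms(
    mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_scores,
    score_thr, max_num, cfg,
    mlvl_dir_scores=mlvl_dir_scores,
    mlvl_attr_scores=mlvl_attr_scores,
    mlvl_bboxes2d=mlvl_bboxes2d)
# with all three optional inputs provided, the tuple has six elements
bboxes, scores, labels, dir_scores, attr_scores, bboxes2d = results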
def aligned_3d_nms(boxes, scores, classes, thresh):
@@ -5,6 +5,7 @@ from .custom_3d_seg import Custom3DSegDataset
from .kitti_dataset import KittiDataset
from .lyft_dataset import LyftDataset
from .nuscenes_dataset import NuScenesDataset
from .nuscenes_mono_dataset import NuScenesMonoDataset
from .pipelines import (BackgroundPointsFilter, GlobalRotScaleTrans,
IndoorPointSample, LoadAnnotations3D,
LoadPointsFromFile, LoadPointsFromMultiSweeps,
@@ -19,9 +20,9 @@ from .waymo_dataset import WaymoDataset
__all__ = [
'KittiDataset', 'GroupSampler', 'DistributedGroupSampler',
'build_dataloader', 'RepeatFactorDataset', 'DATASETS', 'build_dataset',
'CocoDataset', 'NuScenesDataset', 'LyftDataset', 'ObjectSample',
'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans', 'PointShuffle',
'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
'CocoDataset', 'NuScenesDataset', 'NuScenesMonoDataset', 'LyftDataset',
'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
'LoadPointsFromFile', 'NormalizePointsColor', 'IndoorPointSample',
'LoadAnnotations3D', 'SUNRGBDDataset', 'ScanNetDataset',
'ScanNetSegDataset', 'SemanticKITTIDataset', 'Custom3DDataset',
@@ -51,7 +51,8 @@ class DefaultFormatBundle(object):
results['img'] = DC(to_tensor(img), stack=True)
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_labels_3d', 'pts_instance_mask', 'pts_semantic_mask'
'gt_labels_3d', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers2d', 'depths'
]:
if key not in results:
continue
@@ -134,11 +135,11 @@ class Collect3D(object):
keys,
meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
'pad_shape', 'scale_factor', 'flip',
'pcd_horizontal_flip', 'pcd_vertical_flip',
'box_mode_3d', 'box_type_3d', 'img_norm_cfg',
'rect', 'Trv2c', 'P2', 'pcd_trans', 'sample_idx',
'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
'transformation_3d_flow')):
'cam_intrinsic', 'pcd_horizontal_flip',
'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
'img_norm_cfg', 'rect', 'Trv2c', 'P2', 'pcd_trans',
'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
'pts_filename', 'transformation_3d_flow')):
self.keys = keys
self.meta_keys = meta_keys
@@ -222,6 +223,11 @@ class DefaultFormatBundle3D(DefaultFormatBundle):
if 'gt_names_3d' in results:
results['gt_names_3d'] = results['gt_names_3d'][
gt_bboxes_3d_mask]
if 'centers2d' in results:
results['centers2d'] = results['centers2d'][
gt_bboxes_3d_mask]
if 'depths' in results:
results['depths'] = results['depths'][gt_bboxes_3d_mask]
if 'gt_bboxes_mask' in results:
gt_bboxes_mask = results['gt_bboxes_mask']
if 'gt_bboxes' in results:
@@ -230,6 +236,7 @@ class DefaultFormatBundle3D(DefaultFormatBundle):
if self.with_label:
if 'gt_names' in results and len(results['gt_names']) == 0:
results['gt_labels'] = np.array([], dtype=np.int64)
results['attr_labels'] = np.array([], dtype=np.int64)
elif 'gt_names' in results and isinstance(
results['gt_names'][0], list):
# gt_labels might be a list of list in multi-view setting
@@ -3,7 +3,7 @@ import numpy as np
from mmdet3d.core.points import BasePoints, get_points_type
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import LoadAnnotations
from mmdet.datasets.pipelines import LoadAnnotations, LoadImageFromFile
@PIPELINES.register_module()
@@ -65,6 +65,30 @@ class LoadMultiViewImageFromFiles(object):
f"color_type='{self.color_type}')"
@PIPELINES.register_module()
class LoadImageFromFileMono3D(LoadImageFromFile):
"""Load an image from file in monocular 3D object detection. Compared to 2D
detection, additional camera parameters need to be loaded.
Args:
kwargs (dict): Arguments are the same as those in \
:class:`LoadImageFromFile`.
"""
def __call__(self, results):
"""Call functions to load image and get image meta information.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded image and meta information.
"""
super().__call__(results)
results['cam_intrinsic'] = results['img_info']['cam_intrinsic']
return results
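
A minimal sketch of what the new step consumes and adds, with img_info laid out as in the sample JSON below:

loader = LoadImageFromFileMono3D()
results = dict(
    img_prefix='tests/data/nuscenes/',
    img_info=dict(
        filename='samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800'
                 '__CAM_BACK_LEFT__1531883530447423.jpg',
        cam_intrinsic=[[1256.741, 0.0, 792.113],
                       [0.0, 1256.741, 492.776],
                       [0.0, 0.0, 1.0]]))
results = loader(results)
# on top of 'img', 'img_shape', etc. set by LoadImageFromFile,
# results now carries results['cam_intrinsic']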
@PIPELINES.register_module()
class LoadPointsFromMultiSweeps(object):
"""Load points from multiple sweeps.
@@ -426,6 +450,8 @@ class LoadAnnotations3D(LoadAnnotations):
Defaults to True.
with_label_3d (bool, optional): Whether to load 3D labels.
Defaults to True.
with_attr_label (bool, optional): Whether to load attribute label.
Defaults to False.
with_mask_3d (bool, optional): Whether to load 3D instance masks
for points. Defaults to False.
with_seg_3d (bool, optional): Whether to load 3D semantic masks.
@@ -438,6 +464,8 @@ class LoadAnnotations3D(LoadAnnotations):
Defaults to False.
with_seg (bool, optional): Whether to load 2D semantic masks.
Defaults to False.
with_bbox_depth (bool, optional): Whether to load 2.5D boxes (projected 2D centers with depths).
Defaults to False.
poly2mask (bool, optional): Whether to convert polygon annotations
to bitmasks. Defaults to True.
seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks.
@@ -450,12 +478,14 @@ class LoadAnnotations3D(LoadAnnotations):
def __init__(self,
with_bbox_3d=True,
with_label_3d=True,
with_attr_label=False,
with_mask_3d=False,
with_seg_3d=False,
with_bbox=False,
with_label=False,
with_mask=False,
with_seg=False,
with_bbox_depth=False,
poly2mask=True,
seg_3d_dtype='int',
file_client_args=dict(backend='disk')):
@@ -467,7 +497,9 @@ class LoadAnnotations3D(LoadAnnotations):
poly2mask,
file_client_args=file_client_args)
self.with_bbox_3d = with_bbox_3d
self.with_bbox_depth = with_bbox_depth
self.with_label_3d = with_label_3d
self.with_attr_label = with_attr_label
self.with_mask_3d = with_mask_3d
self.with_seg_3d = with_seg_3d
self.seg_3d_dtype = seg_3d_dtype
@@ -485,6 +517,19 @@ class LoadAnnotations3D(LoadAnnotations):
results['bbox3d_fields'].append('gt_bboxes_3d')
return results
def _load_bboxes_depth(self, results):
"""Private function to load 2.5D bounding box annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 2.5D bounding box annotations.
"""
results['centers2d'] = results['ann_info']['centers2d']
results['depths'] = results['ann_info']['depths']
return results
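
For reference, a sketch of the ann_info fields this helper expects; the shapes follow the unit test further down:

import numpy as np
ann_info = dict(
    centers2d=np.array([[1099.39, 544.64]], dtype=np.float32),  # (N, 2) projected 3D centers in pixels
    depths=np.array([15.32], dtype=np.float32))                 # (N, ) depths of those centers in meters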
def _load_labels_3d(self, results):
"""Private function to load label annotations.
@@ -497,6 +542,18 @@ class LoadAnnotations3D(LoadAnnotations):
results['gt_labels_3d'] = results['ann_info']['gt_labels_3d']
return results
def _load_attr_labels(self, results):
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded label annotations.
"""
results['attr_labels'] = results['ann_info']['attr_labels']
return results
def _load_masks_3d(self, results):
"""Private function to load 3D mask annotations.
@@ -564,8 +621,14 @@ class LoadAnnotations3D(LoadAnnotations):
results = self._load_bboxes_3d(results)
if results is None:
return None
if self.with_bbox_depth:
results = self._load_bboxes_depth(results)
if results is None:
return None
if self.with_label_3d:
results = self._load_labels_3d(results)
if self.with_attr_label:
results = self._load_attr_labels(results)
if self.with_mask_3d:
results = self._load_masks_3d(results)
if self.with_seg_3d:
@@ -579,11 +642,13 @@ class LoadAnnotations3D(LoadAnnotations):
repr_str = self.__class__.__name__ + '(\n'
repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, '
repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, '
repr_str += f'{indent_str}with_attr_label={self.with_attr_label}, '
repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, '
repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, '
repr_str += f'{indent_str}with_bbox={self.with_bbox}, '
repr_str += f'{indent_str}with_label={self.with_label}, '
repr_str += f'{indent_str}with_mask={self.with_mask}, '
repr_str += f'{indent_str}with_seg={self.with_seg}, '
repr_str += f'{indent_str}with_bbox_depth={self.with_bbox_depth}, '
repr_str += f'{indent_str}poly2mask={self.poly2mask})'
return repr_str
@@ -65,8 +65,17 @@ class RandomFlip3D(RandomFlip):
np.array([], dtype=np.float32))
assert len(input_dict['bbox3d_fields']) == 1
for key in input_dict['bbox3d_fields']:
input_dict['points'] = input_dict[key].flip(
direction, points=input_dict['points'])
if 'points' in input_dict:
input_dict['points'] = input_dict[key].flip(
direction, points=input_dict['points'])
else:
input_dict[key].flip(direction)
if 'centers2d' in input_dict:
assert self.sync_2d is True and direction == 'horizontal', \
'Only support sync_2d=True and horizontal flip with images'
w = input_dict['img_shape'][1]
input_dict['centers2d'][..., 0] = \
w - input_dict['centers2d'][..., 0]
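
Worked example of the flip above, using the first traffic cone in the sample data: with image width w = 1600 and a projected center x of 1099.391, the flipped x is 1600 - 1099.391 = 500.609, which matches expected_centers2d in the unit test below.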
def __call__(self, input_dict):
"""Call function to flip points, values in the ``bbox3d_fields`` and \
{"images": [{"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "id": "86e6806d626b4711a6d0f5015b090116", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.6924185592174665, -0.7031619420114925, -0.11648342771943819, 0.11203317912370753], "cam2ego_translation": [1.03569100218, 0.484795032713, 1.59097014818], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1256.7414812095406, 0.0, 792.1125740759628], [0.0, 1256.7414812095406, 492.7757465151356], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_FRONT/n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg", "id": "020d7b4f858147558106c504f7f31bef", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.4998015430569128, -0.5030316162024876, 0.4997798114386805, -0.49737083824542755], "cam2ego_translation": [1.70079118954, 0.0159456324149, 1.51095763913], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1266.417203046554, 0.0, 816.2670197447984], [0.0, 1266.417203046554, 491.50706579294757], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_FRONT_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_RIGHT__1531883530420339.jpg", "id": "16d39ff22a8545b0a4ee3236a0fe1c20", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.2060347966337182, -0.2026940577919598, 0.6824507824531167, -0.6713610884174485], "cam2ego_translation": [1.5508477543, -0.493404796419, 1.49574800619], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1260.8474446004698, 0.0, 807.968244525554], [0.0, 1260.8474446004698, 495.3344268742088], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "id": "24332e9c554a406f880430f17771b608", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.6757265034669446, -0.6736266522251881, 0.21214015046209478, -0.21122827103904068], "cam2ego_translation": [1.52387798135, 0.494631336551, 1.50932822144], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1272.5979470598488, 0.0, 826.6154927353808], [0.0, 1272.5979470598488, 479.75165386361925], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}, {"file_name": "samples/CAM_BACK/n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg", "id": "aab35aeccbda42de82b2ff5c278a0d48", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.5037872666382278, -0.49740249788611096, -0.4941850223835201, 0.5045496097725578], "cam2ego_translation": [0.0283260309358, 0.00345136761476, 1.57910346144], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[809.2209905677063, 0.0, 829.2196003259838], [0.0, 809.2209905677063, 481.77842384512485], [0.0, 0.0, 1.0]], "width": 1600, 
"height": 900}, {"file_name": "samples/CAM_BACK_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg", "id": "ec7096278e484c9ebe6894a2ad5682e9", "token": "e93e98b63d3b40209056d129dc53ceee", "cam2ego_rotation": [0.12280980120078765, -0.132400842670559, -0.7004305821388234, 0.690496031265798], "cam2ego_translation": [1.0148780988, -0.480568219723, 1.56239545128], "ego2global_rotation": [-0.7495886280607293, -0.0077695335695504636, 0.00829759813869316, -0.6618063711504101], "ego2global_translation": [1010.1328353833223, 610.8111652918716, 0.0], "cam_intrinsic": [[1259.5137405846733, 0.0, 807.2529053838625], [0.0, 1259.5137405846733, 501.19579884916527], [0.0, 0.0, 1.0]], "width": 1600, "height": 900}], "annotations": [{"file_name": "samples/CAM_FRONT/n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg", "image_id": "020d7b4f858147558106c504f7f31bef", "area": 85383.89600714693, "category_name": "truck", "category_id": 1, "bbox": [0.0, 357.732750319127, 342.56437261895206, 249.24920053528984], "iscrowd": 0, "bbox_cam3d": [-10.356295829208502, -0.06394600736590471, 18.785737229926998, 2.312, 7.516, 3.093, -0.5996975863361309], "velo_cam3d": [0.05742557272436208, 0.06990201482350666], "center2d": [118.11016609440316, 487.19622492451936, 18.785737229926998], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 0}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "image_id": "24332e9c554a406f880430f17771b608", "area": 76274.38331683438, "category_name": "truck", "category_id": 1, "bbox": [1305.1296604171719, 350.75901341602525, 294.87033958282814, 258.6709243959383], "iscrowd": 0, "bbox_cam3d": [9.795917040815693, 0.07538275380197612, 19.033148401567978, 2.312, 7.516, 3.093, -1.5546044317874126], "velo_cam3d": [0.09022854769195846, -0.0065096147400431695], "center2d": [1481.5919397578637, 484.79190972187814, 19.033148401567978], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 1}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "image_id": "24332e9c554a406f880430f17771b608", "area": 5248.9339273703135, "category_name": "truck", "category_id": 1, "bbox": [808.1218983320856, 436.2076328554, 75.28483638734929, 69.72099800235912], "iscrowd": 0, "bbox_cam3d": [0.7896581102503435, -0.32866532307883706, 58.48166239420381, 2.877, 6.372, 2.978, 1.641180695066564], "velo_cam3d": [0.009938485543455734, 0.0010084200213775884], "center2d": [843.7989524532317, 472.5996886441534, 58.48166239420381], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 2}, {"file_name": "samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg", "image_id": "24332e9c554a406f880430f17771b608", "area": 25266.816070927107, "category_name": "truck", "category_id": 1, "bbox": [1133.5883785276196, 424.4436001005383, 202.5256666350731, 124.75858734712807], "iscrowd": 0, "bbox_cam3d": [9.39338221449255, 0.19762751304835102, 30.01455814405707, 2.156, 6.227, 2.601, -1.4587684025759116], "velo_cam3d": [0.0, 0.0], "center2d": [1224.88885277412, 488.1309332180172, 30.01455814405707], "attribute_name": "vehicle.parked", "attribute_id": 6, "segmentation": [], "id": 3}, {"file_name": "samples/CAM_BACK/n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg", "image_id": "aab35aeccbda42de82b2ff5c278a0d48", "area": 31981.88483023472, "category_name": "car", "category_id": 0, 
"bbox": [652.8710695836726, 487.2457293359287, 256.3734471348506, 124.74725907715583], "iscrowd": 0, "bbox_cam3d": [-0.48041137691585667, 0.8426032188612489, 12.27160016308813, 1.871, 4.478, 1.456, -2.0402647554154876], "velo_cam3d": [-2.4043357184501866, -4.232358489028598], "center2d": [797.5400340802389, 537.3418550489371, 12.27160016308813], "attribute_name": "vehicle.moving", "attribute_id": 5, "segmentation": [], "id": 4}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 1874.1656394574547, "category_name": "traffic_cone", "category_id": 8, "bbox": [1084.536273989852, 513.7567766430512, 30.043100006470013, 62.382565016720605], "iscrowd": 0, "bbox_cam3d": [3.745641322414848, 0.6321604510604618, 15.319339525420224, 0.3, 0.291, 0.734, 1.4550554479430875], "velo_cam3d": [0.028202672296939114, -0.001622377193634249], "center2d": [1099.3910188026568, 544.635832278593, 15.319339525420224], "attribute_name": "None", "attribute_id": 8, "segmentation": [], "id": 5}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 1641.3529623313364, "category_name": "traffic_cone", "category_id": 8, "bbox": [823.5058461203419, 512.0451382733748, 27.545987206560085, 59.58591899514306], "iscrowd": 0, "bbox_cam3d": [0.558956408408079, 0.6054486006477211, 15.607344275188172, 0.315, 0.338, 0.712, 1.5596704833049395], "velo_cam3d": [0.07717355032092023, -0.0013264953734539453], "center2d": [837.1211093045397, 541.5279466177432, 15.607344275188172], "attribute_name": "None", "attribute_id": 8, "segmentation": [], "id": 6}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 11464.868967812941, "category_name": "pedestrian", "category_id": 7, "bbox": [1091.57108913607, 427.8805195896188, 76.29701915190844, 150.2662763926101], "iscrowd": 0, "bbox_cam3d": [3.953820859983739, 0.11100574170732268, 14.75668416993455, 0.739, 0.563, 1.711, 1.4550554479430875], "velo_cam3d": [0.10262495353364391, -0.0064695610507391095], "center2d": [1128.8366393735657, 502.22946380348515, 14.75668416993455], "attribute_name": "pedestrian.sitting_lying_down", "attribute_id": 4, "segmentation": [], "id": 7}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 10887.814254422945, "category_name": "pedestrian", "category_id": 7, "bbox": [1160.5755663065963, 427.76823935577545, 72.04645850373822, 151.1221298109749], "iscrowd": 0, "bbox_cam3d": [4.7798492054669035, 0.1162134030605403, 14.880252178422799, 0.665, 0.544, 1.739, 1.4550554479430875], "velo_cam3d": [0.08665208940588605, -0.12554131041835265], "center2d": [1195.8043058026105, 502.5907820768639, 14.880252178422799], "attribute_name": "pedestrian.sitting_lying_down", "attribute_id": 4, "segmentation": [], "id": 8}, {"file_name": "samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg", "image_id": "86e6806d626b4711a6d0f5015b090116", "area": 1840.396836351825, "category_name": "traffic_cone", "category_id": 8, "bbox": [976.5016497372175, 515.0039595028874, 30.627062877370918, 60.09054292018379], "iscrowd": 0, "bbox_cam3d": [2.4596094747766615, 0.6404788797338883, 15.49228428713527, 0.338, 0.309, 0.712, 
1.461625206011101], "velo_cam3d": [0.02389033738396964, -0.0027892907804445547], "center2d": [991.6372663187118, 544.7316983348808, 15.49228428713527], "attribute_name": "None", "attribute_id": 8, "segmentation": [], "id": 9}, {"file_name": "samples/CAM_BACK_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg", "image_id": "ec7096278e484c9ebe6894a2ad5682e9", "area": 130637.82232697189, "category_name": "car", "category_id": 0, "bbox": [806.290660237549, 470.86948127698895, 564.486943265249, 231.42753589888787], "iscrowd": 0, "bbox_cam3d": [2.041080764231013, 0.5400087467741127, 10.16381197333443, 1.638, 4.25, 1.44, 2.3008777344302445], "velo_cam3d": [-3.11975390859937, 4.71824099865795], "center2d": [1060.1864774468488, 568.1144351228712, 10.16381197333443], "attribute_name": "vehicle.moving", "attribute_id": 5, "segmentation": [], "id": 10}], "categories": [{"id": 0, "name": "car"}, {"id": 1, "name": "truck"}, {"id": 2, "name": "trailer"}, {"id": 3, "name": "bus"}, {"id": 4, "name": "construction_vehicle"}, {"id": 5, "name": "bicycle"}, {"id": 6, "name": "motorcycle"}, {"id": 7, "name": "pedestrian"}, {"id": 8, "name": "traffic_cone"}, {"id": 9, "name": "barrier"}]}
\ No newline at end of file
import mmcv
import numpy as np
import pytest
import torch
from mmdet3d.datasets import NuScenesMonoDataset
def test_getitem():
np.random.seed(0)
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
img_norm_cfg = dict(
mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=1.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
nus_dataset = NuScenesMonoDataset(
ann_file='tests/data/nuscenes/nus_infos_mono3d.coco.json',
pipeline=pipeline,
data_root='tests/data/nuscenes/',
img_prefix='tests/data/nuscenes/',
test_mode=False)
data = nus_dataset[0]
img_metas = data['img_metas']._data
filename = img_metas['filename']
img_shape = img_metas['img_shape']
pad_shape = img_metas['pad_shape']
flip = img_metas['flip']
bboxes = data['gt_bboxes']._data
attrs = data['attr_labels']._data
labels3d = data['gt_labels_3d']._data
labels = data['gt_labels']._data
centers2d = data['centers2d']._data
depths = data['depths']._data
expected_filename = 'tests/data/nuscenes/samples/CAM_BACK_LEFT/' + \
'n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg'
expected_img_shape = (900, 1600, 3)
expected_pad_shape = (928, 1600, 3)
expected_flip = True
expected_bboxes = torch.tensor([[485.4207, 513.7568, 515.4637, 576.1393],
[748.9482, 512.0452, 776.4941, 571.6310],
[432.1318, 427.8805, 508.4290, 578.1468],
[367.3779, 427.7682, 439.4244, 578.8904],
[592.8713, 515.0040, 623.4984, 575.0945]])
expected_attr_labels = torch.tensor([8, 8, 4, 4, 8])
expected_labels = torch.tensor([8, 8, 7, 7, 8])
expected_centers2d = torch.tensor([[500.6090, 544.6358],
[762.8789, 541.5280],
[471.1633, 502.2295],
[404.1957, 502.5908],
[608.3627, 544.7317]])
expected_depths = torch.tensor(
[15.3193, 15.6073, 14.7567, 14.8803, 15.4923])
assert filename == expected_filename
assert img_shape == expected_img_shape
assert pad_shape == expected_pad_shape
assert flip == expected_flip
assert torch.allclose(bboxes, expected_bboxes, 1e-5)
assert torch.all(attrs == expected_attr_labels)
assert torch.all(labels == expected_labels)
assert torch.all(labels3d == expected_labels)
assert torch.allclose(centers2d, expected_centers2d, 1e-5)
assert torch.allclose(depths, expected_depths, 1e-5)
def test_format_results():
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
root_path = 'tests/data/nuscenes/'
ann_file = 'tests/data/nuscenes/nus_infos_mono3d.coco.json'
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='LoadAnnotations3D',
with_bbox=True,
with_label=True,
with_attr_label=True,
with_bbox_3d=True,
with_label_3d=True,
with_bbox_depth=True),
dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
'gt_labels_3d', 'centers2d', 'depths'
]),
]
nus_dataset = NuScenesMonoDataset(
ann_file=ann_file,
pipeline=pipeline,
data_root=root_path,
test_mode=True)
results = mmcv.load('tests/data/nuscenes/mono3d_sample_results.pkl')
result_files, tmp_dir = nus_dataset.format_results(results)
result_data = mmcv.load(result_files['img_bbox'])
assert len(result_data['results'].keys()) == 1
assert len(result_data['results']['e93e98b63d3b40209056d129dc53ceee']) == 8
det = result_data['results']['e93e98b63d3b40209056d129dc53ceee'][0]
expected_token = 'e93e98b63d3b40209056d129dc53ceee'
expected_trans = torch.tensor(
[1018.753821915645, 605.190386124652, 0.7266818822266328])
expected_size = torch.tensor([1.6380000114440918, 4.25, 1.440000057220459])
expected_rotation = torch.tensor([
-0.9924980733795628, -0.013604682549109839, 0.01027292674776989,
-0.12106590736714223
])
expected_detname = 'car'
expected_attr = 'vehicle.moving'
assert det['sample_token'] == expected_token
assert torch.allclose(
torch.tensor(det['translation']), expected_trans, 1e-5)
assert torch.allclose(torch.tensor(det['size']), expected_size, 1e-5)
assert torch.allclose(
torch.tensor(det['rotation']), expected_rotation, 1e-5)
assert det['detection_name'] == expected_detname
assert det['attribute_name'] == expected_attr
@@ -130,10 +130,11 @@ def test_load_annotations3D():
scannet_pts_semantic_mask = scannet_results['pts_semantic_mask']
repr_str = repr(scannet_load_annotations3D)
expected_repr_str = 'LoadAnnotations3D(\n with_bbox_3d=True, ' \
'with_label_3d=True, with_mask_3d=True, ' \
'with_seg_3d=True, with_bbox=False, ' \
'with_label=False, with_mask=False, ' \
'with_seg=False, poly2mask=True)'
'with_label_3d=True, with_attr_label=False, ' \
'with_mask_3d=True, with_seg_3d=True, ' \
'with_bbox=False, with_label=False, ' \
'with_mask=False, with_seg=False, ' \
'with_bbox_depth=False, poly2mask=True)'
assert repr_str == expected_repr_str
assert scannet_gt_boxes.tensor.shape == (27, 7)
assert scannet_gt_labels.shape == (27, )
@@ -55,6 +55,9 @@ def nuscenes_data_prep(root_path,
root_path, info_prefix, version=version, max_sweeps=max_sweeps)
if version == 'v1.0-test':
info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl')
nuscenes_converter.export_2d_annotation(
root_path, info_test_path, version=version)
return
info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl')
@@ -9,12 +9,18 @@ from pyquaternion import Quaternion
from shapely.geometry import MultiPoint, box
from typing import List, Tuple, Union
from mmdet3d.core.bbox.box_np_ops import points_cam2img
from mmdet3d.datasets import NuScenesDataset
nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
'pedestrian.moving', 'pedestrian.standing',
'pedestrian.sitting_lying_down', 'vehicle.moving',
'vehicle.parked', 'vehicle.stopped', 'None')
def create_nuscenes_infos(root_path,
info_prefix,
@@ -324,13 +330,14 @@ def obtain_sensor2top(nusc,
return sweep
def export_2d_annotation(root_path, info_path, version):
def export_2d_annotation(root_path, info_path, version, mono3d=True):
"""Export 2d annotation from the info file and raw data.
Args:
root_path (str): Root path of the raw data.
info_path (str): Path of the info file.
version (str): Dataset version.
mono3d (bool): Whether to export mono3d annotation. Default: True.
"""
# get bbox annotations for camera
camera_types = [
@@ -356,12 +363,20 @@ def export_2d_annotation(root_path, info_path, version):
coco_infos = get_2d_boxes(
nusc,
cam_info['sample_data_token'],
visibilities=['', '1', '2', '3', '4'])
visibilities=['', '1', '2', '3', '4'],
mono3d=mono3d)
(height, width, _) = mmcv.imread(cam_info['data_path']).shape
coco_2d_dict['images'].append(
dict(
file_name=cam_info['data_path'],
file_name=cam_info['data_path'].split('data/nuscenes/')
[-1],
id=cam_info['sample_data_token'],
token=info['token'],
cam2ego_rotation=cam_info['sensor2ego_rotation'],
cam2ego_translation=cam_info['sensor2ego_translation'],
ego2global_rotation=info['ego2global_rotation'],
ego2global_translation=info['ego2global_translation'],
cam_intrinsic=cam_info['cam_intrinsic'],
width=width,
height=height))
for coco_info in coco_infos:
@@ -372,16 +387,24 @@ def export_2d_annotation(root_path, info_path, version):
coco_info['id'] = coco_ann_id
coco_2d_dict['annotations'].append(coco_info)
coco_ann_id += 1
mmcv.dump(coco_2d_dict, f'{info_path[:-4]}.coco.json')
if mono3d:
json_prefix = f'{info_path[:-4]}_mono3d'
else:
json_prefix = f'{info_path[:-4]}'
mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json')
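
For example, with a hypothetical info_path of 'data/nuscenes/nuscenes_infos_val.pkl', the dump goes to 'data/nuscenes/nuscenes_infos_val_mono3d.coco.json' when mono3d=True and to 'data/nuscenes/nuscenes_infos_val.coco.json' otherwise.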
def get_2d_boxes(nusc, sample_data_token: str,
visibilities: List[str]) -> List[OrderedDict]:
def get_2d_boxes(nusc,
sample_data_token: str,
visibilities: List[str],
mono3d=True):
"""Get the 2D annotation records for a given `sample_data_token`.
Args:
sample_data_token: Sample data token belonging to a camera keyframe.
visibilities: Visibility filter.
sample_data_token (str): Sample data token belonging to a camera \
keyframe.
visibilities (list[str]): Visibility filter.
mono3d (bool): Whether to get boxes with mono3d annotation.
Returns:
list[dict]: List of 2D annotation records that belong to the input
`sample_data_token`.
@@ -456,6 +479,43 @@
# Generate dictionary record to be included in the .json file.
repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
sample_data_token, sd_rec['filename'])
# If mono3d=True, add 3D annotations in camera coordinates
if mono3d and (repro_rec is not None):
loc = box.center.tolist()
dim = box.wlh.tolist()
rot = [box.orientation.yaw_pitch_roll[0]]
global_velo2d = nusc.box_velocity(box.token)[:2]
global_velo3d = np.array([*global_velo2d, 0.0])
e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
cam_velo3d = global_velo3d @ np.linalg.inv(
e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
velo = cam_velo3d[0::2].tolist()
repro_rec['bbox_cam3d'] = loc + dim + rot
repro_rec['velo_cam3d'] = velo
center3d = np.array(loc).reshape([1, 3])
center2d = points_cam2img(
center3d, camera_intrinsic, with_depth=True)
repro_rec['center2d'] = center2d.squeeze().tolist()
# center2d holds the normalized projected center (x, y) plus depth;
# samples with depth <= 0 are removed
if repro_rec['center2d'][2] <= 0:
continue
ann_token = nusc.get('sample_annotation',
box.token)['attribute_tokens']
if len(ann_token) == 0:
attr_name = 'None'
else:
attr_name = nusc.get('attribute', ann_token[0])['name']
attr_id = nus_attributes.index(attr_name)
repro_rec['attribute_name'] = attr_name
repro_rec['attribute_id'] = attr_id
repro_recs.append(repro_rec)
return repro_recs
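
A quick standalone check of the velocity transform above: for row vectors, v @ A.T equals A @ v, so the chained expression applies inv(R_cam2ego) @ inv(R_ego2global) to the global-frame velocity. Random orthogonal matrices stand in for the rotations:

import numpy as np
rng = np.random.default_rng(0)
v = rng.standard_normal(3)
R_e2g = np.linalg.qr(rng.standard_normal((3, 3)))[0]  # stand-in ego2global rotation
R_c2e = np.linalg.qr(rng.standard_normal((3, 3)))[0]  # stand-in cam2ego rotation
chained = v @ np.linalg.inv(R_e2g).T @ np.linalg.inv(R_c2e).T
direct = np.linalg.inv(R_c2e) @ np.linalg.inv(R_e2g) @ v
assert np.allclose(chained, direct)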