nuscenes.py

# Copyright 2021 Toyota Research Institute.  All rights reserved.
#import functools
from collections import OrderedDict

import numpy as np
import seaborn as sns
from torch.utils.data import Dataset
from tqdm import tqdm

#from detectron2.data import MetadataCatalog
from detectron2.structures.boxes import BoxMode
from nuscenes.eval.detection.utils import category_to_detection_name
from nuscenes.nuscenes import NuScenes
from nuscenes.utils.splits import create_splits_scenes

#from tridet.data import collect_dataset_dicts
from projects.mmdet3d_plugin.dd3d.structures.boxes3d import GenericBoxes3D
from projects.mmdet3d_plugin.dd3d.structures.pose import Pose
from projects.mmdet3d_plugin.dd3d.utils.geometry import project_points3d
from projects.mmdet3d_plugin.dd3d.utils.visualization import float_to_uint8_color

#  https://github.com/nutonomy/nuscenes-devkit/blob/9b209638ef3dee6d0cdc5ac700c493747f5b35fe/python-sdk/nuscenes/utils/splits.py#L189
#     - train/val/test: The standard splits of the nuScenes dataset (700/150/150 scenes).
#     - mini_train/mini_val: Train and val splits of the mini subset used for visualization and debugging (8/2 scenes).
#     - train_detect/train_track: Two halves of the train split used for separating the training sets of detector and
#         tracker if required
DATASET_NAME_TO_VERSION = {
    "nusc_train": "v1.0-trainval",
    "nusc_val": "v1.0-trainval",
    "nusc_val-subsample-8": "v1.0-trainval",
    "nusc_trainval": "v1.0-trainval",
    "nusc_test": "v1.0-test",
    "nusc_mini_train": "v1.0-mini",
    "nusc_mini_val": "v1.0-mini",
}

CAMERA_NAMES = ('CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT')

ATTRIBUTE_IDS = {
    'vehicle.moving': 0,
    'vehicle.parked': 1,
    'vehicle.stopped': 2,
    'pedestrian.moving': 0,
    'pedestrian.standing': 1,
    'pedestrian.sitting_lying_down': 2,
    'cycle.with_rider': 0,
    'cycle.without_rider': 1,
}

CATEGORY_IDS = OrderedDict({
    'barrier': 0,
    'bicycle': 1,
    'bus': 2,
    'car': 3,
    'construction_vehicle': 4,
    'motorcycle': 5,
    'pedestrian': 6,
    'traffic_cone': 7,
    'trailer': 8,
    'truck': 9,
})

COLORS = [float_to_uint8_color(clr) for clr in sns.color_palette("bright", n_colors=10)]
COLORMAP = OrderedDict({
    'barrier': COLORS[8],  # yellow
    'bicycle': COLORS[0],  # blue
    'bus': COLORS[6],  # pink
    'car': COLORS[2],  # green
    'construction_vehicle': COLORS[7],  # gray
    'motorcycle': COLORS[4],  # purple
    'pedestrian': COLORS[1],  # orange
    'traffic_cone': COLORS[3],  # red
    'trailer': COLORS[9],  # skyblue
    'truck': COLORS[5],  # brown
})

MAX_NUM_ATTRIBUTES = 3


def _compute_iou(box1, box2):
    """
    Parameters
    ----------
    box1, box2:
        (x1, y1, x2, y2)
    """
    xx1 = max(box1[0], box2[0])
    yy1 = max(box1[1], box2[1])
    xx2 = min(box1[2], box2[2])
    yy2 = min(box1[3], box2[3])
    if xx1 >= xx2 or yy1 >= yy2:
        return 0.
    inter = (xx2 - xx1) * (yy2 - yy1)
    a1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    a2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    return inter / (a1 + a2 - inter)


class NuscenesDataset(Dataset):
    def __init__(self, name, data_root, datum_names=CAMERA_NAMES, min_num_lidar_points=3, min_box_visibility=0.2, **unused):
        self.data_root = data_root
        assert name in DATASET_NAME_TO_VERSION
        version = DATASET_NAME_TO_VERSION[name]
        self.nusc = NuScenes(version=version, dataroot=data_root, verbose=True)

        self.datum_names = datum_names
        self.min_num_lidar_points = min_num_lidar_points
        self.min_box_visibility = min_box_visibility

        self.dataset_item_info = self._build_dataset_item_info(name)

        # Index instance tokens to their IDs
        self._instance_token_to_id = self._index_instance_tokens()

        # Construct the mapping from datum_token (image id) to index
        print("Generating the mapping from image id to idx...")
        self.datumtoken2idx = {}
        for idx, (datum_token, _, _, _, _) in enumerate(self.dataset_item_info):
            self.datumtoken2idx[datum_token] = idx
        print("Done.")

    def _build_dataset_item_info(self, name):
        scenes_in_split = self._get_split_scenes(name)

        dataset_items = []
        for _, scene_token in tqdm(scenes_in_split):
            scene = self.nusc.get('scene', scene_token)
            sample_token = scene['first_sample_token']
            for sample_idx in range(scene['nbr_samples']):
                if name.endswith('subsample-8') and sample_idx % 8 > 0:
                    # Sample-level subsampling.
                    continue

                sample = self.nusc.get('sample', sample_token)
                for datum_name, datum_token in sample['data'].items():
                    if datum_name not in self.datum_names:
                        continue
                    dataset_items.append((datum_token, sample_token, scene['name'], sample_idx, datum_name))
                sample_token = sample['next']
        return dataset_items

    def _get_split_scenes(self, name):
        scenes_in_splits = create_splits_scenes()
        if name == "nusc_trainval":
            scenes = scenes_in_splits["train"] + scenes_in_splits["val"]
        elif name == "nusc_val-subsample-8":
            scenes = scenes_in_splits["val"]
        else:
            assert name.startswith('nusc_'), f"Invalid dataset name: {name}"
            split = name[5:]
            assert split in scenes_in_splits, f"Invalid dataset: {split}"
            scenes = scenes_in_splits[split]

        # Mapping from scene name to token.
        name_to_token = {scene['name']: scene['token'] for scene in self.nusc.scene}
        return [(name, name_to_token[name]) for name in scenes]

    def __len__(self):
        return len(self.dataset_item_info)

    def _build_id(self, scene_name, sample_idx, datum_name):
        sample_id = f"{scene_name}_{sample_idx:03d}"
        image_id = f"{sample_id}_{datum_name}"
        return image_id, sample_id

    def _index_instance_tokens(self):
        """Index instance tokens for uniquely identifying instances across samples"""
        instance_token_to_id = {}
        for record in self.nusc.sample_annotation:
            instance_token = record['instance_token']
            if instance_token not in instance_token_to_id:
                next_instance_id = len(instance_token_to_id)
                instance_token_to_id[instance_token] = next_instance_id
        return instance_token_to_id

    def get_instance_annotations(self, annotation_list, K, image_shape, pose_WS):
        annotations = []
        for _ann in annotation_list:
            ann = self.nusc.get('sample_annotation', _ann.token)
            if ann['num_lidar_pts'] + ann['num_radar_pts'] < self.min_num_lidar_points:
                continue
            annotation = OrderedDict()

            # --------
            # Category
            # --------
            category = category_to_detection_name(ann['category_name'])
            if category is None:
                continue
            annotation['category_id'] = CATEGORY_IDS[category]

            # ------
            # 3D box
            # ------
            # NOTE: ann['rotation'], ann['translation'] is in global frame.
            pose_SO = Pose(wxyz=_ann.orientation, tvec=_ann.center)  # pose in sensor frame
            # DEBUG:
            # pose_WO_1 = Pose(np.array(ann['rotation']), np.array(ann['translation']))
            # pose_WO_2 = pose_WS * pose_SO
            # assert np.allclose(pose_WO_1.matrix, pose_WO_2.matrix)
            bbox3d = GenericBoxes3D(_ann.orientation, _ann.center, _ann.wlh)
            annotation['bbox3d'] = bbox3d.vectorize().tolist()[0]

            # --------------------------------------
            # 2D box -- project 8 corners of 3D bbox
            # --------------------------------------
            corners = project_points3d(bbox3d.corners.cpu().numpy().squeeze(0), K)
            l, t = corners[:, 0].min(), corners[:, 1].min()
            r, b = corners[:, 0].max(), corners[:, 1].max()

            x1 = max(0, l)
            y1 = max(0, t)
            x2 = min(image_shape[1], r)
            y2 = min(image_shape[0], b)

            iou = _compute_iou([l, t, r, b], [x1, y1, x2, y2])
            if iou < self.min_box_visibility:
                continue

            annotation['bbox'] = [x1, y1, x2, y2]
            annotation['bbox_mode'] = BoxMode.XYXY_ABS

            # --------
            # Track ID
            # --------
            annotation['track_id'] = self._instance_token_to_id[ann['instance_token']]

            # ---------
            # Attribute
            # ---------
            attr_tokens = ann['attribute_tokens']
            assert len(attr_tokens) < 2  # NOTE: Allow only single attrubute.
            attribute_id = MAX_NUM_ATTRIBUTES  # By default, MAX_NUM_ATTRIBUTES -- this is to be ignored in loss compute.
            if attr_tokens:
                attribute = self.nusc.get('attribute', attr_tokens[0])['name']
                attribute_id = ATTRIBUTE_IDS[attribute]
            annotation['attribute_id'] = attribute_id

            # -----
            # Speed
            # -----
            vel_global = self.nusc.box_velocity(ann['token'])
            speed = np.linalg.norm(vel_global)  # NOTE: This can be NaN.
            # DEBUG:
            # speed * Quaternion(ann['rotation']).rotation_matrix.T[0] ~= vel_global
            annotation['speed'] = speed

            annotations.append(annotation)

        return annotations

    def _get_ego_velocity(self, current, max_time_diff=1.5):
        """Velocity of ego-vehicle in m/s.
        """
        has_prev = current['prev'] != ''
        has_next = current['next'] != ''

        # Cannot estimate velocity for a single annotation.
        if not has_prev and not has_next:
            return np.array([np.nan, np.nan, np.nan])

        if has_prev:
            first = self.nusc.get('sample_data', current['prev'])
        else:
            first = current

        if has_next:
            last = self.nusc.get('sample_data', current['next'])
        else:
            last = current

        pos_first = self.nusc.get('ego_pose', first['ego_pose_token'])['translation']
        pos_last = self.nusc.get('ego_pose', last['ego_pose_token'])['translation']
        pos_diff = np.float32(pos_last) - np.float32(pos_first)

        time_last = 1e-6 * last['timestamp']
        time_first = 1e-6 * first['timestamp']
        time_diff = time_last - time_first

        if has_next and has_prev:
            # If doing centered difference, allow for up to double the max_time_diff.
            max_time_diff *= 2

        if time_diff > max_time_diff:
            # If time_diff is too big, don't return an estimate.
            return np.array([np.nan, np.nan, np.nan])
        else:
            return pos_diff / time_diff

    def __getitem__(self, idx):
        datum_token, sample_token, scene_name, sample_idx, datum_name = self.dataset_item_info[idx]
        datum = self.nusc.get('sample_data', datum_token)
        assert datum['is_key_frame']

        filename, _annotations, K = self.nusc.get_sample_data(datum_token)
        image_id, sample_id = self._build_id(scene_name, sample_idx, datum_name)
        height, width = datum['height'], datum['width']
        d2_dict = OrderedDict(
            file_name=filename,
            height=height,
            width=width,
            image_id=image_id,
            sample_id=sample_id,
            sample_token=sample_token
        )

        # Intrinsics
        d2_dict['intrinsics'] = list(K.flatten())

        # Get pose of the sensor (S) from vehicle (V) frame
        _pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token'])
        pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation']))

        # Get ego-pose of the vehicle (V) from global/world (W) frame
        _pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token'])
        pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation']))
        pose_WS = pose_WV * pose_VS

        d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)}
        d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)}

        d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum))

        d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS)

        return d2_dict

    def getitem_by_datumtoken(self, datum_token):
        # idx = self.datumtoken2idx[datum_token]
        # ret = self.__getitem__(idx)

        datum = self.nusc.get('sample_data', datum_token)
        sample_token = datum['sample_token']
        filename, _annotations, K = self.nusc.get_sample_data(datum_token)
        height, width = datum['height'], datum['width']
        d2_dict = OrderedDict(
            file_name=filename,
            height=height,
            width=width,
            image_id=0,
            sample_id=0,
            sample_token=sample_token
        )
        # Intrinsics
        d2_dict['intrinsics'] = list(K.flatten())
        # Get pose of the sensor (S) from vehicle (V) frame
        _pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token'])
        pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation'])) 
        # Get ego-pose of the vehicle (V) from global/world (W) frame
        _pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token'])
        pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation']))
        pose_WS = pose_WV * pose_VS

        d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)}
        d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)}

        d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum))

        d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS)
        return d2_dict