kitti_dataset.py 7.12 KB
Newer Older
dingchang's avatar
dingchang committed
1
# Copyright (c) OpenMMLab. All rights reserved.
2
from typing import Callable, List, Union
3
4

import numpy as np
zhangwenwei's avatar
zhangwenwei committed
5

jshilong's avatar
jshilong committed
6
from mmdet3d.datasets import DATASETS
zhangshilong's avatar
zhangshilong committed
7
from mmdet3d.structures import CameraInstance3DBoxes
jshilong's avatar
jshilong committed
8
from .det3d_dataset import Det3DDataset
zhangwenwei's avatar
zhangwenwei committed
9
10


11
@DATASETS.register_module()
class KittiDataset(Det3DDataset):
    r"""KITTI Dataset.

    This class serves as the API for experiments on the `KITTI Dataset
    <http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d>`_.

    Args:
        data_root (str): Path of dataset root.
        ann_file (str): Path of annotation file.
        pipeline (List[dict]): Pipeline used for data processing.
            Defaults to [].
        modality (dict): Modality to specify the sensor data used as input.
            Defaults to dict(use_lidar=True).
        default_cam_key (str): The default camera name adopted.
            Defaults to 'CAM2'.
        load_type (str): Type of loading mode. Defaults to 'frame_based'.

            - 'frame_based': Load all of the instances in the frame.
            - 'mv_image_based': Load all of the instances in the frame and need
              to convert to the FOV-based data type to support image-based
              detector.
            - 'fov_image_based': Only load the instances inside the default
              cam, and need to convert to the FOV-based data type to support
              image-based detector.
        box_type_3d (str): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            in its original format and then convert it to `box_type_3d`.
            Defaults to 'LiDAR' in this dataset. Available options include:

            - 'LiDAR': Box in LiDAR coordinates.
            - 'Depth': Box in depth coordinates, usually for indoor dataset.
            - 'Camera': Box in camera coordinates.

            Note that this dataset only accepts 'LiDAR' and 'Camera'
            (case-insensitive); any other value fails an assertion.
        filter_empty_gt (bool): Whether to filter the data with empty GT.
            If it's set to be True, the example with empty annotations after
            data pipeline will be dropped and a random example will be chosen
            in `__getitem__`. Defaults to True.
        test_mode (bool): Whether the dataset is in test mode.
            Defaults to False.
        pcd_limit_range (List[float]): The range of point cloud used to filter
            invalid predicted boxes.
            Defaults to [0, -40, -3, 70.4, 40, 0.0].
    """
    # TODO: use full classes of kitti
    METAINFO = {
        'classes': ('Pedestrian', 'Cyclist', 'Car', 'Van', 'Truck',
                    'Person_sitting', 'Tram', 'Misc')
    }

    def __init__(self,
                 data_root: str,
                 ann_file: str,
                 pipeline: List[Union[dict, Callable]] = [],
                 modality: dict = dict(use_lidar=True),
                 default_cam_key: str = 'CAM2',
                 load_type: str = 'frame_based',
                 box_type_3d: str = 'LiDAR',
                 filter_empty_gt: bool = True,
                 test_mode: bool = False,
                 pcd_limit_range: List[float] = [0, -40, -3, 70.4, 40, 0.0],
                 **kwargs) -> None:
        """Validate the KITTI-specific options and build the base dataset.

        See the class docstring for the meaning of each argument.

        Raises:
            AssertionError: If ``load_type`` or ``box_type_3d`` is not one
                of the supported values, or if the resulting modality is
                None.
        """
        # These attributes are assigned before the base constructor runs
        # because `parse_data_info` reads `self.load_type`.
        # NOTE(review): presumably the base constructor triggers annotation
        # parsing -- confirm against `Det3DDataset`.
        self.pcd_limit_range = pcd_limit_range
        assert load_type in ('frame_based', 'mv_image_based',
                             'fov_image_based'), \
            f'Unsupported load_type: {load_type!r}'
        self.load_type = load_type
        super().__init__(
            data_root=data_root,
            ann_file=ann_file,
            pipeline=pipeline,
            modality=modality,
            default_cam_key=default_cam_key,
            box_type_3d=box_type_3d,
            filter_empty_gt=filter_empty_gt,
            test_mode=test_mode,
            **kwargs)
        assert self.modality is not None, 'modality must not be None'
        assert box_type_3d.lower() in ('lidar', 'camera'), \
            f'Unsupported box_type_3d for KITTI: {box_type_3d!r}'

    def parse_data_info(self, info: dict) -> dict:
        """Process the raw data info.

        The only difference with it in `Det3DDataset`
        is the specific process for `plane`.

        Args:
            info (dict): Raw info dict.

        Returns:
            dict: Has `ann_info` in training stage. And
            all path has been converted to absolute path.
        """
        if self.modality['use_lidar']:
            if 'plane' in info:
                # convert ground plane to velodyne coordinates
                plane = np.array(info['plane'])
                lidar2cam = np.array(
                    info['images']['CAM2']['lidar2cam'], dtype=np.float32)
                reverse = np.linalg.inv(lidar2cam)

                # Split the plane (a, b, c, d) into its normal and one
                # point lying on it, both still in camera coordinates.
                (plane_norm_cam, plane_off_cam) = (plane[:3],
                                                   -plane[:3] * plane[3])
                # The normal is a direction: rotate only.
                plane_norm_lidar = \
                    (reverse[:3, :3] @ plane_norm_cam[:, None])[:, 0]
                # The point is a position: rotate and translate.
                plane_off_lidar = (
                    reverse[:3, :3] @ plane_off_cam[:, None][:, 0] +
                    reverse[:3, 3])
                # Reassemble (a, b, c, d) in the LiDAR frame.
                plane_lidar = np.zeros_like(plane_norm_lidar, shape=(4, ))
                plane_lidar[:3] = plane_norm_lidar
                plane_lidar[3] = -plane_norm_lidar.T @ plane_off_lidar
            else:
                plane_lidar = None

            # Written even when no plane is available, so the key is always
            # present for LiDAR-based loading.
            info['plane'] = plane_lidar

        if self.load_type == 'fov_image_based' and self.load_eval_anns:
            # Only keep the instances annotated for the default camera.
            info['instances'] = info['cam_instances'][self.default_cam_key]

        info = super().parse_data_info(info)

        return info

    def parse_ann_info(self, info: dict) -> dict:
        """Process the `instances` in data info to `ann_info`.

        Args:
            info (dict): Data information of single data sample.

        Returns:
            dict: Annotation information produced by the parent class, with
            `gt_bboxes_3d` converted to the 3D box mode of this dataset.
            When the parent class returns no annotations, empty
            placeholders are created for the following keys:

                - gt_bboxes_3d: 3D ground truth bboxes, converted from
                  camera coordinates to `self.box_mode_3d`.
                - gt_labels_3d (np.ndarray): Labels of 3D ground truths.

            and, for the 'fov_image_based' and 'mv_image_based' load types:

                - gt_bboxes (np.ndarray): 2D ground truth bboxes.
                - gt_bboxes_labels (np.ndarray): Labels of 2D ground truths.
                - centers_2d (np.ndarray): Placeholder of shape (0, 2).
                - depths (np.ndarray): Placeholder of shape (0, ).

            Other keys (e.g. `difficulty`, the difficulty level defined by
            KITTI) depend on the parent class and the annotation file.
        """
        ann_info = super().parse_ann_info(info)
        if ann_info is None:
            ann_info = dict()
            # empty instance
            ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
            ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)

            if self.load_type in ['fov_image_based', 'mv_image_based']:
                ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
                # An empty 1-D label array; `np.array(0)` would instead
                # create a 0-d array holding the single label 0.
                ann_info['gt_bboxes_labels'] = np.zeros(0, dtype=np.int64)
                ann_info['centers_2d'] = np.zeros((0, 2), dtype=np.float32)
                ann_info['depths'] = np.zeros((0), dtype=np.float32)

        ann_info = self._remove_dontcare(ann_info)
        # in kitti, lidar2cam = R0_rect @ Tr_velo_to_cam
        lidar2cam = np.array(info['images']['CAM2']['lidar2cam'])
        # convert gt_bboxes_3d to velodyne coordinates with `lidar2cam`
        gt_bboxes_3d = CameraInstance3DBoxes(
            ann_info['gt_bboxes_3d']).convert_to(self.box_mode_3d,
                                                 np.linalg.inv(lidar2cam))
        ann_info['gt_bboxes_3d'] = gt_bboxes_3d
        return ann_info