Commit 7aa442d5 by raojy (parent 9c03eaa8): raw_mmdetection

# Copyright (c) OpenMMLab. All rights reserved.
import mmengine
import numpy as np
from mmdet3d.datasets import Det3DDataset
from mmdet3d.registry import DATASETS
@DATASETS.register_module()
class Kitti2DDataset(Det3DDataset):
r"""KITTI 2D Dataset.
This class serves as the API for experiments on the `KITTI Dataset
<http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d>`_.
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
classes (tuple[str], optional): Classes used in the dataset.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
        box_type_3d (str, optional): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            in its original format and then convert it to `box_type_3d`.
            Defaults to 'LiDAR'. Available options include
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
"""
classes = ('car', 'pedestrian', 'cyclist')
"""
Annotation format:
[
{
'image': {
'image_idx': 0,
'image_path': 'training/image_2/000000.png',
'image_shape': array([ 370, 1224], dtype=int32)
},
'point_cloud': {
'num_features': 4,
'velodyne_path': 'training/velodyne/000000.bin'
},
'calib': {
'P0': <np.ndarray> (4, 4),
'P1': <np.ndarray> (4, 4),
'P2': <np.ndarray> (4, 4),
'P3': <np.ndarray> (4, 4),
            'R0_rect': <np.ndarray> (4, 4),
            'Tr_velo_to_cam': <np.ndarray> (4, 4),
            'Tr_imu_to_velo': <np.ndarray> (4, 4)
},
'annos': {
'name': <np.ndarray> (n),
'truncated': <np.ndarray> (n),
'occluded': <np.ndarray> (n),
'alpha': <np.ndarray> (n),
'bbox': <np.ndarray> (n, 4),
'dimensions': <np.ndarray> (n, 3),
'location': <np.ndarray> (n, 3),
'rotation_y': <np.ndarray> (n),
'score': <np.ndarray> (n),
'index': array([0], dtype=int32),
'group_ids': array([0], dtype=int32),
'difficulty': array([0], dtype=int32),
'num_points_in_gt': <np.ndarray> (n),
}
}
]
"""
def load_annotations(self, ann_file):
"""Load annotations from ann_file.
Args:
ann_file (str): Path of the annotation file.
Returns:
list[dict]: List of annotations.
"""
self.data_infos = mmengine.load(ann_file)
self.cat2label = {
cat_name: i
for i, cat_name in enumerate(self.classes)
}
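        # e.g. with the default classes this yields
        # {'car': 0, 'pedestrian': 1, 'cyclist': 2}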
return self.data_infos
def _filter_imgs(self, min_size=32):
"""Filter images without ground truths."""
valid_inds = []
for i, img_info in enumerate(self.data_infos):
if len(img_info['annos']['name']) > 0:
valid_inds.append(i)
return valid_inds
def get_ann_info(self, index):
"""Get annotation info according to the given index.
Args:
index (int): Index of the annotation data to get.
Returns:
dict: Annotation information consists of the following keys:
- bboxes (np.ndarray): Ground truth bboxes.
- labels (np.ndarray): Labels of ground truths.
"""
        # use the index to fetch the annos so that the eval hook
        # can also use this API
info = self.data_infos[index]
annos = info['annos']
gt_names = annos['name']
gt_bboxes = annos['bbox']
difficulty = annos['difficulty']
        # remove classes that are not needed
selected = self.keep_arrays_by_name(gt_names, self.classes)
gt_bboxes = gt_bboxes[selected]
gt_names = gt_names[selected]
difficulty = difficulty[selected]
gt_labels = np.array([self.cat2label[n] for n in gt_names])
anns_results = dict(
bboxes=gt_bboxes.astype(np.float32),
labels=gt_labels,
)
return anns_results
def prepare_train_img(self, idx):
"""Training image preparation.
Args:
            idx (int): Index for accessing the target image data.
Returns:
dict: Training image data dict after preprocessing
corresponding to the index.
"""
img_raw_info = self.data_infos[idx]['image']
img_info = dict(filename=img_raw_info['image_path'])
ann_info = self.get_ann_info(idx)
if len(ann_info['bboxes']) == 0:
return None
results = dict(img_info=img_info, ann_info=ann_info)
if self.proposals is not None:
results['proposals'] = self.proposals[idx]
self.pre_pipeline(results)
return self.pipeline(results)
def prepare_test_img(self, idx):
"""Prepare data for testing.
Args:
            idx (int): Index for accessing the target image data.
Returns:
dict: Testing image data dict after preprocessing
corresponding to the index.
"""
img_raw_info = self.data_infos[idx]['image']
img_info = dict(filename=img_raw_info['image_path'])
results = dict(img_info=img_info)
if self.proposals is not None:
results['proposals'] = self.proposals[idx]
self.pre_pipeline(results)
return self.pipeline(results)
def drop_arrays_by_name(self, gt_names, used_classes):
"""Drop irrelevant ground truths by name.
Args:
gt_names (list[str]): Names of ground truths.
used_classes (list[str]): Classes of interest.
Returns:
np.ndarray: Indices of ground truths that will be dropped.
"""
inds = [i for i, x in enumerate(gt_names) if x not in used_classes]
inds = np.array(inds, dtype=np.int64)
return inds
def keep_arrays_by_name(self, gt_names, used_classes):
"""Keep useful ground truths by name.
Args:
gt_names (list[str]): Names of ground truths.
used_classes (list[str]): Classes of interest.
Returns:
            np.ndarray: Indices of ground truths that will be kept.
"""
inds = [i for i, x in enumerate(gt_names) if x in used_classes]
inds = np.array(inds, dtype=np.int64)
return inds
def reformat_bbox(self, outputs, out=None):
"""Reformat bounding boxes to KITTI 2D styles.
Args:
            outputs (list[np.ndarray]): List of arrays storing the predicted
                bounding boxes and scores.
            out (str, optional): The prefix of the output file.
                Defaults to None.
Returns:
            list[dict]: A list of dictionaries in the KITTI 2D format.
"""
from mmdet3d.structures.ops.transforms import bbox2result_kitti2d
sample_idx = [info['image']['image_idx'] for info in self.data_infos]
result_files = bbox2result_kitti2d(outputs, self.classes, sample_idx,
out)
return result_files
def evaluate(self, result_files, eval_types=None):
"""Evaluation in KITTI protocol.
Args:
result_files (str): Path of result files.
            eval_types (str, optional): Types of evaluation. Defaults to None.
                KITTI dataset only supports the 'bbox' evaluation type.
Returns:
tuple (str, dict): Average precision results in str format
and average precision results in dict format.
"""
from mmdet3d.evaluation import kitti_eval
eval_types = ['bbox'] if not eval_types else eval_types
        assert eval_types in ('bbox', ['bbox']), \
            "KITTI dataset only supports 'bbox' evaluation"
gt_annos = [info['annos'] for info in self.data_infos]
ap_result_str, ap_dict = kitti_eval(
gt_annos, result_files, self.classes, eval_types=['bbox'])
return ap_result_str, ap_dict
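

# --- Illustrative sketch (not part of the upstream file) ---
# What `get_ann_info` does to one frame's raw annos, on made-up values:
# keep only the configured classes, then map class names to integer labels.
if __name__ == '__main__':
    _classes = ('car', 'pedestrian', 'cyclist')
    _cat2label = {name: i for i, name in enumerate(_classes)}
    _gt_names = np.array(['car', 'van', 'cyclist'])
    _gt_bboxes = np.array([[10, 20, 50, 60], [0, 0, 5, 5], [30, 30, 60, 80]],
                          dtype=np.float32)
    _selected = np.array(
        [i for i, n in enumerate(_gt_names) if n in _classes], dtype=np.int64)
    print(_gt_bboxes[_selected])  # the 'van' row is dropped
    print([_cat2label[n] for n in _gt_names[_selected]])  # [0, 2]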
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Callable, List, Union
import numpy as np
from mmdet3d.registry import DATASETS
from mmdet3d.structures import CameraInstance3DBoxes
from .det3d_dataset import Det3DDataset
@DATASETS.register_module()
class KittiDataset(Det3DDataset):
r"""KITTI Dataset.
This class serves as the API for experiments on the `KITTI Dataset
<http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d>`_.
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
pipeline (List[dict]): Pipeline used for data processing.
Defaults to [].
modality (dict): Modality to specify the sensor data used as input.
Defaults to dict(use_lidar=True).
default_cam_key (str): The default camera name adopted.
Defaults to 'CAM2'.
load_type (str): Type of loading mode. Defaults to 'frame_based'.
- 'frame_based': Load all of the instances in the frame.
- 'mv_image_based': Load all of the instances in the frame and need
to convert to the FOV-based data type to support image-based
detector.
- 'fov_image_based': Only load the instances inside the default
cam, and need to convert to the FOV-based data type to support
image-based detector.
box_type_3d (str): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            in its original format and then convert it to `box_type_3d`.
            Defaults to 'LiDAR' in this dataset. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool): Whether to filter the data with empty GT.
            If it is set to True, examples with empty annotations after the
            data pipeline will be dropped and a random example will be chosen
            in `__getitem__` instead. Defaults to True.
test_mode (bool): Whether the dataset is in test mode.
Defaults to False.
pcd_limit_range (List[float]): The range of point cloud used to filter
invalid predicted boxes.
Defaults to [0, -40, -3, 70.4, 40, 0.0].
"""
# TODO: use full classes of kitti
METAINFO = {
'classes': ('Pedestrian', 'Cyclist', 'Car', 'Van', 'Truck',
'Person_sitting', 'Tram', 'Misc'),
'palette': [(106, 0, 228), (119, 11, 32), (165, 42, 42), (0, 0, 192),
(197, 226, 255), (0, 60, 100), (0, 0, 142), (255, 77, 255)]
}
def __init__(self,
data_root: str,
ann_file: str,
pipeline: List[Union[dict, Callable]] = [],
modality: dict = dict(use_lidar=True),
default_cam_key: str = 'CAM2',
load_type: str = 'frame_based',
box_type_3d: str = 'LiDAR',
filter_empty_gt: bool = True,
test_mode: bool = False,
pcd_limit_range: List[float] = [0, -40, -3, 70.4, 40, 0.0],
**kwargs) -> None:
self.pcd_limit_range = pcd_limit_range
assert load_type in ('frame_based', 'mv_image_based',
'fov_image_based')
self.load_type = load_type
super().__init__(
data_root=data_root,
ann_file=ann_file,
pipeline=pipeline,
modality=modality,
default_cam_key=default_cam_key,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
test_mode=test_mode,
**kwargs)
assert self.modality is not None
assert box_type_3d.lower() in ('lidar', 'camera')
def parse_data_info(self, info: dict) -> dict:
"""Process the raw data info.
        The only difference from `Det3DDataset` is the specific
        processing of the ground `plane` information.
Args:
info (dict): Raw info dict.
Returns:
            dict: Data info with `ann_info` in the training stage, where all
            paths have been converted to absolute paths.
"""
if self.modality['use_lidar']:
if 'plane' in info:
# convert ground plane to velodyne coordinates
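                # a plane n^T x + d = 0 in camera coordinates maps to LiDAR
                # coordinates via the inverse extrinsics R, t = lidar2cam^-1:
                # the normal only rotates (n' = R @ n), while a point on the
                # plane (p = -d * n) transforms affinely (p' = R @ p + t);
                # the new offset is then d' = -n'^T @ p'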
plane = np.array(info['plane'])
lidar2cam = np.array(
info['images']['CAM2']['lidar2cam'], dtype=np.float32)
reverse = np.linalg.inv(lidar2cam)
                plane_norm_cam = plane[:3]
                plane_off_cam = -plane[:3] * plane[3]
                plane_norm_lidar = reverse[:3, :3] @ plane_norm_cam
                plane_off_lidar = (
                    reverse[:3, :3] @ plane_off_cam + reverse[:3, 3])
plane_lidar = np.zeros_like(plane_norm_lidar, shape=(4, ))
plane_lidar[:3] = plane_norm_lidar
plane_lidar[3] = -plane_norm_lidar.T @ plane_off_lidar
else:
plane_lidar = None
info['plane'] = plane_lidar
if self.load_type == 'fov_image_based' and self.load_eval_anns:
info['instances'] = info['cam_instances'][self.default_cam_key]
info = super().parse_data_info(info)
return info
def parse_ann_info(self, info: dict) -> dict:
"""Process the `instances` in data info to `ann_info`.
Args:
info (dict): Data information of single data sample.
Returns:
dict: Annotation information consists of the following keys:
- gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`):
3D ground truth bboxes.
                - gt_labels_3d (np.ndarray): Labels of 3D ground truths.
                - gt_bboxes (np.ndarray): 2D ground truth bboxes.
                - gt_labels (np.ndarray): Labels of ground truths.
                - difficulty (np.ndarray): Difficulty defined by KITTI.
                  0, 1 and 2 represent easy, moderate and hard respectively.
"""
ann_info = super().parse_ann_info(info)
if ann_info is None:
ann_info = dict()
# empty instance
ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
if self.load_type in ['fov_image_based', 'mv_image_based']:
ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
                ann_info['gt_bboxes_labels'] = np.zeros(0, dtype=np.int64)
                ann_info['centers_2d'] = np.zeros((0, 2), dtype=np.float32)
                ann_info['depths'] = np.zeros(0, dtype=np.float32)
ann_info = self._remove_dontcare(ann_info)
# in kitti, lidar2cam = R0_rect @ Tr_velo_to_cam
lidar2cam = np.array(info['images']['CAM2']['lidar2cam'])
        # convert gt_bboxes_3d from camera to velodyne coordinates
        # with the inverse of `lidar2cam`
gt_bboxes_3d = CameraInstance3DBoxes(
ann_info['gt_bboxes_3d']).convert_to(self.box_mode_3d,
np.linalg.inv(lidar2cam))
ann_info['gt_bboxes_3d'] = gt_bboxes_3d
return ann_info
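

# --- Illustrative sketch (not part of the upstream file) ---
# How `parse_ann_info` moves boxes from the camera frame to the LiDAR frame:
# wrap them as CameraInstance3DBoxes, then convert with the inverse of the
# `lidar2cam` extrinsics. The box values and extrinsics here are made up.
if __name__ == '__main__':
    from mmdet3d.structures import Box3DMode
    _boxes_cam = np.array([[1.0, 1.5, 10.0, 3.9, 1.6, 1.6, 0.0]],
                          dtype=np.float32)  # one made-up box in CAM2 frame
    _lidar2cam = np.eye(4, dtype=np.float32)  # hypothetical extrinsics
    _boxes_lidar = CameraInstance3DBoxes(_boxes_cam).convert_to(
        Box3DMode.LIDAR, np.linalg.inv(_lidar2cam))
    print(_boxes_lidar.tensor.shape)  # torch.Size([1, 7])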
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Callable, List, Union
import numpy as np
from mmdet3d.registry import DATASETS
from mmdet3d.structures import LiDARInstance3DBoxes
from .det3d_dataset import Det3DDataset
@DATASETS.register_module()
class LyftDataset(Det3DDataset):
r"""Lyft Dataset.
This class serves as the API for experiments on the Lyft Dataset.
Please refer to
`<https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data>`_
for data downloading.
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
pipeline (List[dict]): Pipeline used for data processing.
Defaults to [].
modality (dict): Modality to specify the sensor data used as input.
Defaults to dict(use_camera=False, use_lidar=True).
box_type_3d (str): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            in its original format and then convert it to `box_type_3d`.
            Defaults to 'LiDAR' in this dataset. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool): Whether to filter the data with empty GT.
            If it is set to True, examples with empty annotations after the
            data pipeline will be dropped and a random example will be chosen
            in `__getitem__` instead. Defaults to True.
test_mode (bool): Whether the dataset is in test mode.
Defaults to False.
"""
METAINFO = {
'classes':
('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle',
'motorcycle', 'bicycle', 'pedestrian', 'animal'),
'palette': [(106, 0, 228), (119, 11, 32), (165, 42, 42), (0, 0, 192),
(197, 226, 255), (0, 60, 100), (0, 0, 142), (255, 77, 255),
(153, 69, 1)]
}
def __init__(self,
data_root: str,
ann_file: str,
pipeline: List[Union[dict, Callable]] = [],
modality: dict = dict(use_camera=False, use_lidar=True),
box_type_3d: str = 'LiDAR',
filter_empty_gt: bool = True,
test_mode: bool = False,
**kwargs):
assert box_type_3d.lower() in ['lidar']
super().__init__(
data_root=data_root,
ann_file=ann_file,
pipeline=pipeline,
modality=modality,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
test_mode=test_mode,
**kwargs)
def parse_ann_info(self, info: dict) -> dict:
"""Process the `instances` in data info to `ann_info`.
Args:
info (dict): Data information of single data sample.
Returns:
dict: Annotation information consists of the following keys:
- gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`):
3D ground truth bboxes.
- gt_labels_3d (np.ndarray): Labels of 3D ground truths.
"""
ann_info = super().parse_ann_info(info)
if ann_info is None:
# empty instance
anns_results = dict()
anns_results['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
anns_results['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
return anns_results
gt_bboxes_3d = ann_info['gt_bboxes_3d']
gt_labels_3d = ann_info['gt_labels_3d']
        # the Lyft box center is [0.5, 0.5, 0.5], we change it to be
        # the same as KITTI (0.5, 0.5, 0)
gt_bboxes_3d = LiDARInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
anns_results = dict(
gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d)
return anns_results
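

# --- Illustrative sketch (not part of the upstream file) ---
# Why origin=(0.5, 0.5, 0.5) is passed above: Lyft boxes are annotated at the
# geometric center, while mmdet3d LiDAR boxes keep a bottom-center origin
# (0.5, 0.5, 0), so the constructor shifts z down by half the box height.
# The values below are made up.
if __name__ == '__main__':
    _raw = np.array([[0.0, 0.0, 1.0, 4.0, 2.0, 2.0, 0.0]], dtype=np.float32)
    _boxes = LiDARInstance3DBoxes(_raw, box_dim=7, origin=(0.5, 0.5, 0.5))
    print(_boxes.tensor[0, 2].item())  # 0.0: bottom center of a 2 m tall box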
# Copyright (c) OpenMMLab. All rights reserved.
from os import path as osp
from typing import Callable, List, Union
import numpy as np
from mmdet3d.registry import DATASETS
from mmdet3d.structures import LiDARInstance3DBoxes
from mmdet3d.structures.bbox_3d.cam_box3d import CameraInstance3DBoxes
from .det3d_dataset import Det3DDataset
@DATASETS.register_module()
class NuScenesDataset(Det3DDataset):
r"""NuScenes Dataset.
This class serves as the API for experiments on the NuScenes Dataset.
Please refer to `NuScenes Dataset <https://www.nuscenes.org/download>`_
for data downloading.
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
pipeline (list[dict]): Pipeline used for data processing.
Defaults to [].
box_type_3d (str): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            in its original format and then convert it to `box_type_3d`.
            Defaults to 'LiDAR' in this dataset. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
load_type (str): Type of loading mode. Defaults to 'frame_based'.
- 'frame_based': Load all of the instances in the frame.
- 'mv_image_based': Load all of the instances in the frame and need
to convert to the FOV-based data type to support image-based
detector.
- 'fov_image_based': Only load the instances inside the default
cam, and need to convert to the FOV-based data type to support
image-based detector.
modality (dict): Modality to specify the sensor data used as input.
Defaults to dict(use_camera=False, use_lidar=True).
filter_empty_gt (bool): Whether to filter the data with empty GT.
            If it is set to True, examples with empty annotations after the
            data pipeline will be dropped and a random example will be chosen
            in `__getitem__` instead. Defaults to True.
test_mode (bool): Whether the dataset is in test mode.
Defaults to False.
with_velocity (bool): Whether to include velocity prediction
into the experiments. Defaults to True.
        use_valid_flag (bool): Whether to use the `use_valid_flag` key
            in the info file as a mask to filter `gt_boxes` and `gt_names`.
            Defaults to False.
"""
METAINFO = {
'classes':
('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'),
'version':
'v1.0-trainval',
'palette': [
(255, 158, 0), # Orange
(255, 99, 71), # Tomato
(255, 140, 0), # Darkorange
(255, 127, 80), # Coral
(233, 150, 70), # Darksalmon
(220, 20, 60), # Crimson
(255, 61, 99), # Red
(0, 0, 230), # Blue
(47, 79, 79), # Darkslategrey
(112, 128, 144), # Slategrey
]
}
def __init__(self,
data_root: str,
ann_file: str,
pipeline: List[Union[dict, Callable]] = [],
box_type_3d: str = 'LiDAR',
load_type: str = 'frame_based',
modality: dict = dict(
use_camera=False,
use_lidar=True,
),
filter_empty_gt: bool = True,
test_mode: bool = False,
with_velocity: bool = True,
use_valid_flag: bool = False,
**kwargs) -> None:
self.use_valid_flag = use_valid_flag
self.with_velocity = with_velocity
# TODO: Redesign multi-view data process in the future
assert load_type in ('frame_based', 'mv_image_based',
'fov_image_based')
self.load_type = load_type
assert box_type_3d.lower() in ('lidar', 'camera')
super().__init__(
data_root=data_root,
ann_file=ann_file,
modality=modality,
pipeline=pipeline,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
test_mode=test_mode,
**kwargs)
def _filter_with_mask(self, ann_info: dict) -> dict:
"""Remove annotations that do not need to be cared.
Args:
ann_info (dict): Dict of annotation infos.
Returns:
dict: Annotations after filtering.
"""
filtered_annotations = {}
if self.use_valid_flag:
filter_mask = ann_info['bbox_3d_isvalid']
else:
filter_mask = ann_info['num_lidar_pts'] > 0
        for key in ann_info.keys():
            if key != 'instances':
                filtered_annotations[key] = ann_info[key][filter_mask]
            else:
                filtered_annotations[key] = ann_info[key]
return filtered_annotations
def parse_ann_info(self, info: dict) -> dict:
"""Process the `instances` in data info to `ann_info`.
Args:
info (dict): Data information of single data sample.
Returns:
dict: Annotation information consists of the following keys:
- gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`):
3D ground truth bboxes.
- gt_labels_3d (np.ndarray): Labels of ground truths.
"""
ann_info = super().parse_ann_info(info)
if ann_info is not None:
ann_info = self._filter_with_mask(ann_info)
if self.with_velocity:
gt_bboxes_3d = ann_info['gt_bboxes_3d']
gt_velocities = ann_info['velocities']
nan_mask = np.isnan(gt_velocities[:, 0])
gt_velocities[nan_mask] = [0.0, 0.0]
gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocities],
axis=-1)
ann_info['gt_bboxes_3d'] = gt_bboxes_3d
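                # boxes are now 9-dimensional:
                # (x, y, z, dx, dy, dz, yaw, vx, vy)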
else:
# empty instance
ann_info = dict()
if self.with_velocity:
ann_info['gt_bboxes_3d'] = np.zeros((0, 9), dtype=np.float32)
else:
ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
if self.load_type in ['fov_image_based', 'mv_image_based']:
ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
                ann_info['gt_bboxes_labels'] = np.zeros(0, dtype=np.int64)
                ann_info['attr_labels'] = np.zeros(0, dtype=np.int64)
                ann_info['centers_2d'] = np.zeros((0, 2), dtype=np.float32)
                ann_info['depths'] = np.zeros(0, dtype=np.float32)
# the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
# the same as KITTI (0.5, 0.5, 0)
# TODO: Unify the coordinates
if self.load_type in ['fov_image_based', 'mv_image_based']:
gt_bboxes_3d = CameraInstance3DBoxes(
ann_info['gt_bboxes_3d'],
box_dim=ann_info['gt_bboxes_3d'].shape[-1],
origin=(0.5, 0.5, 0.5))
else:
gt_bboxes_3d = LiDARInstance3DBoxes(
ann_info['gt_bboxes_3d'],
box_dim=ann_info['gt_bboxes_3d'].shape[-1],
origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
ann_info['gt_bboxes_3d'] = gt_bboxes_3d
return ann_info
def parse_data_info(self, info: dict) -> Union[List[dict], dict]:
"""Process the raw data info.
        The only difference from `Det3DDataset` is that, when `load_type`
        is 'mv_image_based', the multi-view image info of one frame is
        split into a separate data sample per camera.
Args:
info (dict): Raw info dict.
Returns:
            List[dict] or dict: Data info with `ann_info` in the training
            stage, where all paths have been converted to absolute paths.
"""
if self.load_type == 'mv_image_based':
data_list = []
if self.modality['use_lidar']:
info['lidar_points']['lidar_path'] = \
osp.join(
self.data_prefix.get('pts', ''),
info['lidar_points']['lidar_path'])
if self.modality['use_camera']:
for cam_id, img_info in info['images'].items():
if 'img_path' in img_info:
if cam_id in self.data_prefix:
cam_prefix = self.data_prefix[cam_id]
else:
cam_prefix = self.data_prefix.get('img', '')
img_info['img_path'] = osp.join(
cam_prefix, img_info['img_path'])
for idx, (cam_id, img_info) in enumerate(info['images'].items()):
camera_info = dict()
camera_info['images'] = dict()
camera_info['images'][cam_id] = img_info
if 'cam_instances' in info and cam_id in info['cam_instances']:
camera_info['instances'] = info['cam_instances'][cam_id]
else:
camera_info['instances'] = []
# TODO: check whether to change sample_idx for 6 cameras
# in one frame
camera_info['sample_idx'] = info['sample_idx'] * 6 + idx
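                # e.g. with 6 cameras, frame 10 yields sample_idx 60-65,
                # one per camera view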
camera_info['token'] = info['token']
camera_info['ego2global'] = info['ego2global']
if not self.test_mode:
                    # used in training
camera_info['ann_info'] = self.parse_ann_info(camera_info)
if self.test_mode and self.load_eval_anns:
camera_info['eval_ann_info'] = \
self.parse_ann_info(camera_info)
data_list.append(camera_info)
return data_list
else:
data_info = super().parse_data_info(info)
return data_info
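

# --- Illustrative sketch (not part of the upstream file) ---
# The velocity handling in `parse_ann_info` above: NaN velocities are zeroed,
# then (vx, vy) are appended so each box becomes 9-dimensional. Made-up values.
if __name__ == '__main__':
    _boxes = np.array([[0.0, 0.0, 0.0, 4.0, 2.0, 1.5, 0.0]], dtype=np.float32)
    _vel = np.array([[np.nan, np.nan]], dtype=np.float32)
    _vel[np.isnan(_vel[:, 0])] = [0.0, 0.0]
    _boxes_9d = np.concatenate([_boxes, _vel], axis=-1)
    print(_boxes_9d.shape)  # (1, 9)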
# Copyright (c) OpenMMLab. All rights reserved.
from os import path as osp
from typing import Any, Callable, List, Optional, Tuple, Union
import numpy as np
from mmdet3d.registry import DATASETS
from mmdet3d.structures import DepthInstance3DBoxes
from .det3d_dataset import Det3DDataset
from .seg3d_dataset import Seg3DDataset
@DATASETS.register_module()
class S3DISDataset(Det3DDataset):
r"""S3DIS Dataset for Detection Task.
This class is the inner dataset for S3DIS. Since S3DIS has 6 areas, we
often train on 5 of them and test on the remaining one. The one for
test is Area_5 as suggested in `GSDN <https://arxiv.org/abs/2006.12356>`_.
    To concatenate the 5 training areas,
    `mmengine.dataset.ConcatDataset` should be used.
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
data_prefix (dict): Prefix for data. Defaults to
dict(pts='points',
pts_instance_mask='instance_mask',
pts_semantic_mask='semantic_mask').
pipeline (List[dict]): Pipeline used for data processing.
Defaults to [].
modality (dict): Modality to specify the sensor data used as input.
Defaults to dict(use_camera=False, use_lidar=True).
box_type_3d (str): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            in its original format and then convert it to `box_type_3d`.
            Defaults to 'Depth' in this dataset. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool): Whether to filter the data with empty GT.
            If it is set to True, examples with empty annotations after the
            data pipeline will be dropped and a random example will be chosen
            in `__getitem__` instead. Defaults to True.
test_mode (bool): Whether the dataset is in test mode.
Defaults to False.
"""
METAINFO = {
'classes': ('table', 'chair', 'sofa', 'bookcase', 'board'),
# the valid ids of segmentation annotations
'seg_valid_class_ids': (7, 8, 9, 10, 11),
'seg_all_class_ids':
tuple(range(1, 14)), # possibly with 'stair' class
'palette': [(170, 120, 200), (255, 0, 0), (200, 100, 100),
(10, 200, 100), (200, 200, 200)]
}
def __init__(self,
data_root: str,
ann_file: str,
metainfo: Optional[dict] = None,
data_prefix: dict = dict(
pts='points',
pts_instance_mask='instance_mask',
pts_semantic_mask='semantic_mask'),
pipeline: List[Union[dict, Callable]] = [],
modality: dict = dict(use_camera=False, use_lidar=True),
box_type_3d: str = 'Depth',
filter_empty_gt: bool = True,
test_mode: bool = False,
**kwargs) -> None:
# construct seg_label_mapping for semantic mask
seg_max_cat_id = len(self.METAINFO['seg_all_class_ids'])
seg_valid_cat_ids = self.METAINFO['seg_valid_class_ids']
neg_label = len(seg_valid_cat_ids)
seg_label_mapping = np.ones(
seg_max_cat_id + 1, dtype=np.int64) * neg_label
for cls_idx, cat_id in enumerate(seg_valid_cat_ids):
seg_label_mapping[cat_id] = cls_idx
self.seg_label_mapping = seg_label_mapping
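        # e.g. with seg_valid_class_ids = (7, 8, 9, 10, 11), raw ids 7-11 map
        # to train labels 0-4, and every other raw id (0-6, 12, 13) maps to
        # the ignore label neg_label = 5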
super().__init__(
data_root=data_root,
ann_file=ann_file,
metainfo=metainfo,
data_prefix=data_prefix,
pipeline=pipeline,
modality=modality,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
test_mode=test_mode,
**kwargs)
self.metainfo['seg_label_mapping'] = self.seg_label_mapping
assert 'use_camera' in self.modality and \
'use_lidar' in self.modality
assert self.modality['use_camera'] or self.modality['use_lidar']
def parse_data_info(self, info: dict) -> dict:
"""Process the raw data info.
Args:
info (dict): Raw info dict.
Returns:
            dict: Data info with `ann_info` in the training stage, where all
            paths have been converted to absolute paths.
"""
info['pts_instance_mask_path'] = osp.join(
self.data_prefix.get('pts_instance_mask', ''),
info['pts_instance_mask_path'])
info['pts_semantic_mask_path'] = osp.join(
self.data_prefix.get('pts_semantic_mask', ''),
info['pts_semantic_mask_path'])
info = super().parse_data_info(info)
        # only used by `PointSegClassMapping` in the pipeline to map the
        # original semantic classes to valid category ids
info['seg_label_mapping'] = self.seg_label_mapping
return info
def parse_ann_info(self, info: dict) -> dict:
"""Process the `instances` in data info to `ann_info`.
Args:
info (dict): Info dict.
Returns:
dict: Processed `ann_info`.
"""
ann_info = super().parse_ann_info(info)
# empty gt
if ann_info is None:
ann_info = dict()
ann_info['gt_bboxes_3d'] = np.zeros((0, 6), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64)
# to target box structure
ann_info['gt_bboxes_3d'] = DepthInstance3DBoxes(
ann_info['gt_bboxes_3d'],
box_dim=ann_info['gt_bboxes_3d'].shape[-1],
with_yaw=False,
origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
return ann_info
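

# --- Usage sketch (not part of the upstream file) ---
# Training on five areas as suggested in the class docstring, by wrapping
# per-area datasets in mmengine's ConcatDataset. The data_root and ann_file
# names below are hypothetical placeholders.
if __name__ == '__main__':
    from mmengine.dataset import ConcatDataset
    _train_set = ConcatDataset([
        S3DISDataset(
            data_root='data/s3dis/',  # hypothetical path
            ann_file=f's3dis_infos_Area_{i}.pkl',  # hypothetical name
            pipeline=[]) for i in (1, 2, 3, 4, 6)
    ])
    print(len(_train_set))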
class _S3DISSegDataset(Seg3DDataset):
r"""S3DIS Dataset for Semantic Segmentation Task.
This class is the inner dataset for S3DIS. Since S3DIS has 6 areas, we
often train on 5 of them and test on the remaining one.
However, there is not a fixed train-test split of S3DIS. People often test
on Area_5 as suggested by `SEGCloud <https://arxiv.org/abs/1710.07563>`_.
But many papers also report the average results of 6-fold cross validation
over the 6 areas (e.g. `DGCNN <https://arxiv.org/abs/1801.07829>`_).
Therefore, we use an inner dataset for one area, and further use a dataset
wrapper to concat all the provided data in different areas.
Args:
        data_root (str, optional): Path of dataset root. Defaults to None.
ann_file (str): Path of annotation file. Defaults to ''.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
data_prefix (dict): Prefix for training data. Defaults to
dict(pts='points', pts_instance_mask='', pts_semantic_mask='').
pipeline (List[dict]): Pipeline used for data processing.
Defaults to [].
modality (dict): Modality to specify the sensor data used as input.
Defaults to dict(use_lidar=True, use_camera=False).
        ignore_index (int, optional): The label index to be ignored, e.g.
            unannotated points. If None is given, it is set to
            len(self.classes) to be consistent with the `PointSegClassMapping`
            function in the pipeline. Defaults to None.
scene_idxs (np.ndarray or str, optional): Precomputed index to load
data. For scenes with many points, we may sample it several times.
Defaults to None.
test_mode (bool): Whether the dataset is in test mode.
Defaults to False.
"""
METAINFO = {
'classes':
('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter'),
'palette': [[0, 255, 0], [0, 0, 255], [0, 255, 255], [255, 255, 0],
[255, 0, 255], [100, 100, 255], [200, 200, 100],
[170, 120, 200], [255, 0, 0], [200, 100, 100],
[10, 200, 100], [200, 200, 200], [50, 50, 50]],
'seg_valid_class_ids':
tuple(range(13)),
'seg_all_class_ids':
tuple(range(14)) # possibly with 'stair' class
}
def __init__(self,
data_root: Optional[str] = None,
ann_file: str = '',
metainfo: Optional[dict] = None,
data_prefix: dict = dict(
pts='points', pts_instance_mask='', pts_semantic_mask=''),
pipeline: List[Union[dict, Callable]] = [],
modality: dict = dict(use_lidar=True, use_camera=False),
ignore_index: Optional[int] = None,
scene_idxs: Optional[Union[np.ndarray, str]] = None,
test_mode: bool = False,
**kwargs) -> None:
super().__init__(
data_root=data_root,
ann_file=ann_file,
metainfo=metainfo,
data_prefix=data_prefix,
pipeline=pipeline,
modality=modality,
ignore_index=ignore_index,
scene_idxs=scene_idxs,
test_mode=test_mode,
**kwargs)
def get_scene_idxs(self, scene_idxs: Union[np.ndarray, str,
None]) -> np.ndarray:
"""Compute scene_idxs for data sampling.
We sample more times for scenes with more points.
"""
# when testing, we load one whole scene every time
if not self.test_mode and scene_idxs is None:
raise NotImplementedError(
'please provide re-sampled scene indexes for training')
return super().get_scene_idxs(scene_idxs)
@DATASETS.register_module()
class S3DISSegDataset(_S3DISSegDataset):
r"""S3DIS Dataset for Semantic Segmentation Task.
This class serves as the API for experiments on the S3DIS Dataset.
It wraps the provided datasets of different areas.
We don't use `mmdet.datasets.dataset_wrappers.ConcatDataset` because we
need to concat the `scene_idxs` of different areas.
Please refer to the `google form <https://docs.google.com/forms/d/e/1FAIpQL
ScDimvNMCGhy_rmBA2gHfDu3naktRm6A8BPwAWWDv-Uhm6Shw/viewform?c=0&w=1>`_ for
data downloading.
Args:
data_root (str, optional): Path of dataset root. Defaults to None.
        ann_files (List[str]): Paths of several annotation files.
            Defaults to ''.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
data_prefix (dict): Prefix for training data. Defaults to
dict(pts='points', pts_instance_mask='', pts_semantic_mask='').
pipeline (List[dict]): Pipeline used for data processing.
Defaults to [].
modality (dict): Modality to specify the sensor data used as input.
Defaults to dict(use_lidar=True, use_camera=False).
        ignore_index (int, optional): The label index to be ignored, e.g.
            unannotated points. If None is given, it is set to
            len(self.classes) to be consistent with the `PointSegClassMapping`
            function in the pipeline. Defaults to None.
scene_idxs (List[np.ndarray] | List[str], optional): Precomputed index
to load data. For scenes with many points, we may sample it
several times. Defaults to None.
test_mode (bool): Whether the dataset is in test mode.
Defaults to False.
"""
def __init__(self,
data_root: Optional[str] = None,
ann_files: List[str] = '',
metainfo: Optional[dict] = None,
data_prefix: dict = dict(
pts='points', pts_instance_mask='', pts_semantic_mask=''),
pipeline: List[Union[dict, Callable]] = [],
modality: dict = dict(use_lidar=True, use_camera=False),
ignore_index: Optional[int] = None,
scene_idxs: Optional[Union[List[np.ndarray],
List[str]]] = None,
test_mode: bool = False,
**kwargs) -> None:
        # make sure that ann_files and scene_idxs have the same length
ann_files = self._check_ann_files(ann_files)
scene_idxs = self._check_scene_idxs(scene_idxs, len(ann_files))
        # initialize some attributes as in datasets[0]
super().__init__(
data_root=data_root,
ann_file=ann_files[0],
metainfo=metainfo,
data_prefix=data_prefix,
pipeline=pipeline,
modality=modality,
ignore_index=ignore_index,
scene_idxs=scene_idxs[0],
test_mode=test_mode,
**kwargs)
datasets = [
_S3DISSegDataset(
data_root=data_root,
ann_file=ann_files[i],
metainfo=metainfo,
data_prefix=data_prefix,
pipeline=pipeline,
modality=modality,
ignore_index=ignore_index,
scene_idxs=scene_idxs[i],
test_mode=test_mode,
**kwargs) for i in range(len(ann_files))
]
        # data_list and scene_idxs need to be concatenated
self.concat_data_list([dst.data_list for dst in datasets])
# set group flag for the sampler
if not self.test_mode:
self._set_group_flag()
def concat_data_list(self, data_lists: List[List[dict]]) -> None:
"""Concat data_list from several datasets to form self.data_list.
Args:
            data_lists (List[List[dict]]): List of lists of dicts containing
                annotation information.
"""
self.data_list = [
data for data_list in data_lists for data in data_list
]
@staticmethod
def _duplicate_to_list(x: Any, num: int) -> list:
"""Repeat x `num` times to form a list."""
return [x for _ in range(num)]
def _check_ann_files(
self, ann_file: Union[List[str], Tuple[str], str]) -> List[str]:
"""Make ann_files as list/tuple."""
# ann_file could be str
if not isinstance(ann_file, (list, tuple)):
ann_file = self._duplicate_to_list(ann_file, 1)
return ann_file
def _check_scene_idxs(self, scene_idx: Union[str, List[Union[list, tuple,
np.ndarray]],
List[str], None],
num: int) -> List[np.ndarray]:
"""Make scene_idxs as list/tuple."""
if scene_idx is None:
return self._duplicate_to_list(scene_idx, num)
# scene_idx could be str, np.ndarray, list or tuple
if isinstance(scene_idx, str): # str
return self._duplicate_to_list(scene_idx, num)
if isinstance(scene_idx[0], str): # list of str
return scene_idx
if isinstance(scene_idx[0], (list, tuple, np.ndarray)): # list of idx
return scene_idx
# single idx
return self._duplicate_to_list(scene_idx, num)
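

# --- Usage sketch (not part of the upstream file) ---
# Building the multi-area segmentation wrapper; the ann_files and scene_idxs
# below are hypothetical placeholders and must have matching lengths
# (scene_idxs may also be None when test_mode=True).
if __name__ == '__main__':
    _areas = (1, 2, 3, 4, 6)
    _seg_set = S3DISSegDataset(
        data_root='data/s3dis/',  # hypothetical path
        ann_files=[f's3dis_infos_Area_{i}.pkl' for i in _areas],
        scene_idxs=[
            f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in _areas
        ],
        pipeline=[])
    print(len(_seg_set))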