OpenDAS / mmdetection3d / Commits / d7067e44

Unverified commit d7067e44, authored Dec 03, 2022 by Wenwei Zhang, committed by GitHub on Dec 03, 2022.

Bump version to v1.1.0rc2

Parents: 28fe73d2, fb0e57e5
Showing 20 changed files with 971 additions and 769 deletions (+971, -769).
mmdet3d/datasets/det3d_dataset.py                                  +118  -37
mmdet3d/datasets/kitti2d_dataset.py                                  +5   -5
mmdet3d/datasets/kitti_dataset.py                                   +33  -21
mmdet3d/datasets/lyft_dataset.py                                    +13  -11
mmdet3d/datasets/nuscenes_dataset.py                                +33  -22
mmdet3d/datasets/s3dis_dataset.py                                  +150 -165
mmdet3d/datasets/scannet_dataset.py                                 +56  -43
mmdet3d/datasets/seg3d_dataset.py                                   +53  -45
mmdet3d/datasets/semantickitti_dataset.py                           +33  -23
mmdet3d/datasets/sunrgbd_dataset.py                                 +12  -12
mmdet3d/datasets/transforms/__init__.py                              +8  -11
mmdet3d/datasets/transforms/compose.py                               +0  -53
mmdet3d/datasets/transforms/dbsampler.py                            +10  -12
mmdet3d/datasets/transforms/formating.py                             +1   -1
mmdet3d/datasets/transforms/loading.py                              +63  -81
mmdet3d/datasets/transforms/test_time_aug.py                         +9  -10
mmdet3d/datasets/transforms/transforms_3d.py                       +137 -134
mmdet3d/datasets/waymo_dataset.py                                   +54  -39
mmdet3d/evaluation/functional/waymo_utils/__init__.py                +2   -2
mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py   +181  -42
mmdet3d/datasets/det3d_dataset.py  View file @ d7067e44

# Copyright (c) OpenMMLab. All rights reserved.
import copy
import os
from os import path as osp
-from typing import Callable, List, Optional, Union
+from typing import Callable, List, Optional, Set, Union

import mmengine
import numpy as np
import torch
from mmengine.dataset import BaseDataset
from mmengine.logging import print_log
from terminaltables import AsciiTable

from mmdet3d.registry import DATASETS
from mmdet3d.structures import get_box_type
...
...
@@ -25,22 +29,22 @@ class Det3DDataset(BaseDataset):
        ann_file (str): Annotation file path. Defaults to ''.
        metainfo (dict, optional): Meta information for dataset, such as class
            information. Defaults to None.
-        data_prefix (dict, optional): Prefix for training data. Defaults to
+        data_prefix (dict): Prefix for training data. Defaults to
            dict(pts='velodyne', img='').
-        pipeline (list[dict], optional): Pipeline used for data processing.
-            Defaults to None.
-        modality (dict, optional): Modality to specify the sensor data used
-            as input, it usually has following keys:
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used
+            as input, it usually has following keys:

            - use_camera: bool
            - use_lidar: bool

-            Defaults to `dict(use_lidar=True, use_camera=False)`
+            Defaults to dict(use_lidar=True, use_camera=False).
        default_cam_key (str, optional): The default camera name adopted.
            Defaults to None.
-        box_type_3d (str, optional): Type of 3D box of this dataset.
+        box_type_3d (str): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            to its original format then converted them to `box_type_3d`.
-            Defaults to 'LiDAR'. Available options includes:
+            Defaults to 'LiDAR' in this dataset. Available options includes:

            - 'LiDAR': Box in LiDAR coordinates, usually for
              outdoor point cloud 3d detection.
...
...
@@ -48,16 +52,20 @@ class Det3DDataset(BaseDataset):
              indoor point cloud 3d detection.
            - 'Camera': Box in camera coordinates, usually
              for vision-based 3d detection.

-        filter_empty_gt (bool, optional): Whether to filter the data with
-            empty GT. Defaults to True.
-        test_mode (bool, optional): Whether the dataset is in test mode.
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
            Defaults to False.
-        load_eval_anns (bool, optional): Whether to load annotations
-            in test_mode, the annotation will be save in `eval_ann_infos`,
-            which can be used in Evaluator. Defaults to True.
-        file_client_args (dict, optional): Configuration of file client.
+        load_eval_anns (bool): Whether to load annotations
+            in test_mode, the annotation will be save in `eval_ann_infos`,
+            which can be used in Evaluator. Defaults to True.
+        file_client_args (dict): Configuration of file client.
            Defaults to dict(backend='disk').
+        show_ins_var (bool): For debug purpose. Whether to show variation
+            of the number of instances before and after through pipeline.
+            Defaults to False.
    """

    def __init__(self,
...
...
@@ -71,8 +79,9 @@ class Det3DDataset(BaseDataset):
                 box_type_3d: dict = 'LiDAR',
                 filter_empty_gt: bool = True,
                 test_mode: bool = False,
-                 load_eval_anns=True,
+                 load_eval_anns: bool = True,
                 file_client_args: dict = dict(backend='disk'),
+                 show_ins_var: bool = False,
                 **kwargs) -> None:
        # init file client
        self.file_client = mmengine.FileClient(**file_client_args)
...
...
@@ -94,24 +103,31 @@ class Det3DDataset(BaseDataset):
        self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)

-        if metainfo is not None and 'CLASSES' in metainfo:
-            # we allow to train on subset of self.METAINFO['CLASSES']
+        if metainfo is not None and 'classes' in metainfo:
+            # we allow to train on subset of self.METAINFO['classes']
            # map unselected labels to -1
            self.label_mapping = {
                i: -1
-                for i in range(len(self.METAINFO['CLASSES']))
+                for i in range(len(self.METAINFO['classes']))
            }
            self.label_mapping[-1] = -1
-            for label_idx, name in enumerate(metainfo['CLASSES']):
-                ori_label = self.METAINFO['CLASSES'].index(name)
+            for label_idx, name in enumerate(metainfo['classes']):
+                ori_label = self.METAINFO['classes'].index(name)
                self.label_mapping[ori_label] = label_idx
+
+            self.num_ins_per_cat = {name: 0 for name in metainfo['classes']}
        else:
            self.label_mapping = {
                i: i
-                for i in range(len(self.METAINFO['CLASSES']))
+                for i in range(len(self.METAINFO['classes']))
            }
            self.label_mapping[-1] = -1

+            self.num_ins_per_cat = {
+                name: 0
+                for name in self.METAINFO['classes']
+            }
+
        super().__init__(
            ann_file=ann_file,
            metainfo=metainfo,
...
...
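The hunk above renames the metainfo key 'CLASSES' to 'classes' while keeping the subset-training rule intact: classes absent from the user-supplied subset are mapped to -1 and ignored. A standalone sketch of that rule (the class tuples are invented for the example, not taken from the commit):

# Minimal sketch of the label-mapping rule above.
METAINFO_CLASSES = ('Pedestrian', 'Cyclist', 'Car')   # full dataset classes
metainfo_classes = ('Car',)                           # user-selected subset

label_mapping = {i: -1 for i in range(len(METAINFO_CLASSES))}
label_mapping[-1] = -1
for label_idx, name in enumerate(metainfo_classes):
    ori_label = METAINFO_CLASSES.index(name)
    label_mapping[ori_label] = label_idx

print(label_mapping)  # {0: -1, 1: -1, 2: 0, -1: -1}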
@@ -125,10 +141,25 @@ class Det3DDataset(BaseDataset):
        self.metainfo['box_type_3d'] = box_type_3d
        self.metainfo['label_mapping'] = self.label_mapping

+        # used for showing variation of the number of instances before and
+        # after through the pipeline
+        self.show_ins_var = show_ins_var
+
+        # show statistics of this dataset
+        print_log('-' * 30, 'current')
+        print_log(f'The length of the dataset: {len(self)}', 'current')
+        content_show = [['category', 'number']]
+        for cat_name, num in self.num_ins_per_cat.items():
+            content_show.append([cat_name, num])
+        table = AsciiTable(content_show)
+        print_log(
+            f'The number of instances per category in the dataset:\n{table.table}',  # noqa: E501
+            'current')
+
    def _remove_dontcare(self, ann_info: dict) -> dict:
        """Remove annotations that do not need to be cared.

-        -1 indicate dontcare in MMDet3d.
+        -1 indicates dontcare in MMDet3d.

        Args:
            ann_info (dict): Dict of annotation infos. The
...
...
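The statistics block added above builds its report with terminaltables.AsciiTable. A minimal sketch of the output format, with invented counts:

from terminaltables import AsciiTable

content_show = [['category', 'number'], ['Pedestrian', 1253], ['Car', 8734]]
print(AsciiTable(content_show).table)
# +------------+--------+
# | category   | number |
# +------------+--------+
# | Pedestrian | 1253   |
# | Car        | 8734   |
# +------------+--------+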
@@ -156,7 +187,7 @@ class Det3DDataset(BaseDataset):
            index (int): Index of the annotation data to get.

        Returns:
-            dict: annotation information.
+            dict: Annotation information.
        """
        data_info = self.get_data_info(index)
        # test model
...
...
@@ -167,8 +198,8 @@ class Det3DDataset(BaseDataset):
        return ann_info

-    def parse_ann_info(self, info: dict) -> Optional[dict]:
-        """Process the `instances` in data info to `ann_info`
+    def parse_ann_info(self, info: dict) -> Union[dict, None]:
+        """Process the `instances` in data info to `ann_info`.

        In `Custom3DDataset`, we simply concatenate all the field
        in `instances` to `np.ndarray`, you can do the specific
...
...
@@ -179,7 +210,7 @@ class Det3DDataset(BaseDataset):
            info (dict): Info dict.

        Returns:
-            dict | None: Processed `ann_info`
+            dict or None: Processed `ann_info`.
        """
        # add s or gt prefix for most keys after concat
        # we only process 3d annotations here, the corresponding
...
...
@@ -223,14 +254,20 @@ class Det3DDataset(BaseDataset):
            ann_info[mapped_ann_name] = temp_anns
        ann_info['instances'] = info['instances']

+        for label in ann_info['gt_labels_3d']:
+            if label != -1:
+                cat_name = self.metainfo['classes'][label]
+                self.num_ins_per_cat[cat_name] += 1
+
        return ann_info

    def parse_data_info(self, info: dict) -> dict:
        """Process the raw data info.

-        Convert all relative path of needed modality data file to
-        the absolute path. And process
-        the `instances` field to `ann_info` in training stage.
+        Convert all relative path of needed modality data file to
+        the absolute path. And process the `instances` field to
+        `ann_info` in training stage.

        Args:
            info (dict): Raw info dict.
...
...
@@ -251,7 +288,7 @@ class Det3DDataset(BaseDataset):
        if 'lidar_sweeps' in info:
            for sweep in info['lidar_sweeps']:
                file_suffix = sweep['lidar_points']['lidar_path'].split(
-                    '/')[-1]
+                    os.sep)[-1]
                if 'samples' in sweep['lidar_points']['lidar_path']:
                    sweep['lidar_points']['lidar_path'] = osp.join(
                        self.data_prefix['pts'], file_suffix)
...
...
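The '/' to os.sep change above takes the basename with the host's path separator instead of a hard-coded POSIX one. A small illustration (the file name is invented):

import os
from os import path as osp

lidar_path = osp.join('sweeps', 'LIDAR_TOP', 'frame_0001.bin')
file_suffix = lidar_path.split(os.sep)[-1]
print(file_suffix)  # 'frame_0001.bin' on both POSIX and Windows hosts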
@@ -291,7 +328,37 @@ class Det3DDataset(BaseDataset):
        return info

-    def prepare_data(self, index: int) -> Optional[dict]:
+    def _show_ins_var(self, old_labels: np.ndarray,
+                      new_labels: torch.Tensor) -> None:
+        """Show variation of the number of instances before and after through
+        the pipeline.
+
+        Args:
+            old_labels (np.ndarray): The labels before through the pipeline.
+            new_labels (torch.Tensor): The labels after through the pipeline.
+        """
+        ori_num_per_cat = dict()
+        for label in old_labels:
+            if label != -1:
+                cat_name = self.metainfo['classes'][label]
+                ori_num_per_cat[cat_name] = ori_num_per_cat.get(cat_name,
+                                                                0) + 1
+        new_num_per_cat = dict()
+        for label in new_labels:
+            if label != -1:
+                cat_name = self.metainfo['classes'][label]
+                new_num_per_cat[cat_name] = new_num_per_cat.get(cat_name,
+                                                                0) + 1
+        content_show = [['category', 'new number', 'ori number']]
+        for cat_name, num in ori_num_per_cat.items():
+            new_num = new_num_per_cat.get(cat_name, 0)
+            content_show.append([cat_name, new_num, num])
+        table = AsciiTable(content_show)
+        print_log(
+            'The number of instances per category after and before '
+            f'through pipeline:\n{table.table}', 'current')
+
+    def prepare_data(self, index: int) -> Union[dict, None]:
        """Data preparation for both training and testing stage.

        Called by `__getitem__` of dataset.
...
...
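From the user side, the new debug switch is just one more dataset argument. A hypothetical config fragment (dataset type, paths and pipeline are placeholders; only show_ins_var comes from this commit):

train_dataset = dict(
    type='KittiDataset',
    data_root='data/kitti/',
    ann_file='kitti_infos_train.pkl',
    pipeline=[...],          # your training pipeline
    show_ins_var=True)       # print per-category counts before/after pipeline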
@@ -300,12 +367,12 @@ class Det3DDataset(BaseDataset):
            index (int): Index for accessing the target data.

        Returns:
-            dict | None: Data dict of the corresponding index.
+            dict or None: Data dict of the corresponding index.
        """
-        input_dict = self.get_data_info(index)
+        ori_input_dict = self.get_data_info(index)

        # deepcopy here to avoid inplace modification in pipeline.
-        input_dict = copy.deepcopy(input_dict)
+        input_dict = copy.deepcopy(ori_input_dict)

        # box_type_3d (str): 3D box type.
        input_dict['box_type_3d'] = self.box_type_3d
...
@@ -318,15 +385,29 @@ class Det3DDataset(BaseDataset):
            return None

        example = self.pipeline(input_dict)

        if not self.test_mode and self.filter_empty_gt:
            # after pipeline drop the example with empty annotations
            # return None to random another in `__getitem__`
            if example is None or len(
                    example['data_samples'].gt_instances_3d.labels_3d) == 0:
                return None

+        if self.show_ins_var:
+            if 'ann_info' in ori_input_dict:
+                self._show_ins_var(
+                    ori_input_dict['ann_info']['gt_labels_3d'],
+                    example['data_samples'].gt_instances_3d.labels_3d)
+            else:
+                print_log(
+                    "'ann_info' is not in the input dict. It's probably that "
+                    'the data is not in training mode',
+                    'current',
+                    level=30)
+
        return example

-    def get_cat_ids(self, idx: int) -> List[int]:
+    def get_cat_ids(self, idx: int) -> Set[int]:
        """Get category ids by index. Dataset wrapped by ClassBalancedDataset
        must implement this method.
...
...
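The filter_empty_gt contract documented above — prepare_data returns None and __getitem__ re-samples — follows the mmengine BaseDataset retry pattern. A self-contained sketch of that interaction, independent of this commit:

import random

class RetryingDataset:
    """Sketch of the `prepare_data` -> `__getitem__` contract: when a
    sample comes out empty, pick another index at random and retry."""

    def __init__(self, samples, max_refetch=50):
        self.samples = samples          # list of dicts, None marks empty GT
        self.max_refetch = max_refetch

    def prepare_data(self, index):
        return self.samples[index]      # None means "dropped, resample"

    def __getitem__(self, index):
        for _ in range(self.max_refetch):
            data = self.prepare_data(index)
            if data is not None:
                return data
            index = random.randint(0, len(self.samples) - 1)
        raise RuntimeError('Cannot find valid data after several retries.')

    def __len__(self):
        return len(self.samples)

ds = RetryingDataset([None, {'gt_labels_3d': [0, 2]}])
print(ds[0])  # re-samples until it hits the non-empty example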
mmdet3d/datasets/kitti2d_dataset.py  View file @ d7067e44
...
@@ -36,7 +36,7 @@ class Kitti2DDataset(Det3DDataset):
        Defaults to False.
    """

-    CLASSES = ('car', 'pedestrian', 'cyclist')
+    classes = ('car', 'pedestrian', 'cyclist')
    """
    Annotation format:
    [
...
@@ -90,7 +90,7 @@ class Kitti2DDataset(Det3DDataset):
        self.data_infos = mmengine.load(ann_file)
        self.cat2label = {
            cat_name: i
-            for i, cat_name in enumerate(self.CLASSES)
+            for i, cat_name in enumerate(self.classes)
        }
        return self.data_infos
...
@@ -122,7 +122,7 @@ class Kitti2DDataset(Det3DDataset):
        difficulty = annos['difficulty']

        # remove classes that is not needed
-        selected = self.keep_arrays_by_name(gt_names, self.CLASSES)
+        selected = self.keep_arrays_by_name(gt_names, self.classes)
        gt_bboxes = gt_bboxes[selected]
        gt_names = gt_names[selected]
        difficulty = difficulty[selected]
...
@@ -215,7 +215,7 @@ class Kitti2DDataset(Det3DDataset):
        """
        from mmdet3d.structures.ops.transforms import bbox2result_kitti2d
        sample_idx = [info['image']['image_idx'] for info in self.data_infos]
-        result_files = bbox2result_kitti2d(outputs, self.CLASSES, sample_idx,
+        result_files = bbox2result_kitti2d(outputs, self.classes, sample_idx,
                                           out)
        return result_files
...
@@ -237,5 +237,5 @@ class Kitti2DDataset(Det3DDataset):
        ]), 'KITTI data set only evaluate bbox'
        gt_annos = [info['annos'] for info in self.data_infos]
        ap_result_str, ap_dict = kitti_eval(
-            gt_annos, result_files, self.CLASSES, eval_types=['bbox'])
+            gt_annos, result_files, self.classes, eval_types=['bbox'])
        return ap_result_str, ap_dict
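The Kitti2DDataset changes are a mechanical CLASSES to classes rename. For reference, the cat2label mapping the renamed attribute feeds:

classes = ('car', 'pedestrian', 'cyclist')
cat2label = {cat_name: i for i, cat_name in enumerate(classes)}
print(cat2label)  # {'car': 0, 'pedestrian': 1, 'cyclist': 2}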
mmdet3d/datasets/kitti_dataset.py  View file @ d7067e44
...
@@ -18,13 +18,13 @@ class KittiDataset(Det3DDataset):
    Args:
        data_root (str): Path of dataset root.
        ann_file (str): Path of annotation file.
-        pipeline (list[dict], optional): Pipeline used for data processing.
-            Defaults to None.
-        modality (dict, optional): Modality to specify the sensor data used
-            as input. Defaults to `dict(use_lidar=True)`.
-        default_cam_key (str, optional): The default camera name adopted.
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used
+            as input. Defaults to dict(use_lidar=True).
+        default_cam_key (str): The default camera name adopted.
            Defaults to 'CAM2'.
-        box_type_3d (str, optional): Type of 3D box of this dataset.
+        box_type_3d (str): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            to its original format then converted them to `box_type_3d`.
            Defaults to 'LiDAR' in this dataset. Available options includes:
...
@@ -32,17 +32,28 @@ class KittiDataset(Det3DDataset):
        - 'LiDAR': Box in LiDAR coordinates.
        - 'Depth': Box in depth coordinates, usually for indoor dataset.
        - 'Camera': Box in camera coordinates.

-        filter_empty_gt (bool, optional): Whether to filter empty GT.
-            Defaults to True.
-        test_mode (bool, optional): Whether the dataset is in test mode.
+        load_type (str): Type of loading mode. Defaults to 'frame_based'.
+
+            - 'frame_based': Load all of the instances in the frame.
+            - 'mv_image_based': Load all of the instances in the frame and
+              need to convert to the FOV-based data type to support
+              image-based detector.
+            - 'fov_image_based': Only load the instances inside the default
+              cam, and need to convert to the FOV-based data type to support
+              image-based detector.
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
            Defaults to False.
-        pcd_limit_range (list[float], optional): The range of point cloud
-            used to filter invalid predicted boxes.
+        pcd_limit_range (List[float]): The range of point cloud
+            used to filter invalid predicted boxes.
            Defaults to [0, -40, -3, 70.4, 40, 0.0].
    """
    # TODO: use full classes of kitti
    METAINFO = {
-        'CLASSES': ('Pedestrian', 'Cyclist', 'Car', 'Van', 'Truck',
+        'classes': ('Pedestrian', 'Cyclist', 'Car', 'Van', 'Truck',
                    'Person_sitting', 'Tram', 'Misc')
    }
...
@@ -52,7 +63,7 @@ class KittiDataset(Det3DDataset):
                 pipeline: List[Union[dict, Callable]] = [],
                 modality: dict = dict(use_lidar=True),
                 default_cam_key: str = 'CAM2',
-                 task: str = 'lidar_det',
+                 load_type: str = 'frame_based',
                 box_type_3d: str = 'LiDAR',
                 filter_empty_gt: bool = True,
                 test_mode: bool = False,
...
@@ -60,8 +71,9 @@ class KittiDataset(Det3DDataset):
                 **kwargs) -> None:
        self.pcd_limit_range = pcd_limit_range
-        assert task in ('lidar_det', 'mono_det')
-        self.task = task
+        assert load_type in ('frame_based', 'mv_image_based',
+                             'fov_image_based')
+        self.load_type = load_type
        super().__init__(
            data_root=data_root,
            ann_file=ann_file,
...
@@ -111,7 +123,7 @@ class KittiDataset(Det3DDataset):
            info['plane'] = plane_lidar

-        if self.task == 'mono_det':
+        if self.load_type == 'fov_image_based' and self.load_eval_anns:
            info['instances'] = info['cam_instances'][self.default_cam_key]
        info = super().parse_data_info(info)
...
@@ -119,13 +131,13 @@ class KittiDataset(Det3DDataset):
        return info

    def parse_ann_info(self, info: dict) -> dict:
-        """Get annotation info according to the given index.
+        """Process the `instances` in data info to `ann_info`.

        Args:
            info (dict): Data information of single data sample.

        Returns:
-            dict: annotation information consists of the following keys:
+            dict: Annotation information consists of the following keys:

                - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`):
                  3D ground truth bboxes.
...
@@ -142,7 +154,7 @@ class KittiDataset(Det3DDataset):
            ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
            ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
-            if self.task == 'mono_det':
+            if self.load_type in ['fov_image_based', 'mv_image_based']:
                ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
                ann_info['gt_bboxes_labels'] = np.array(0, dtype=np.int64)
                ann_info['centers_2d'] = np.zeros((0, 2), dtype=np.float32)
...
...
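In configs, the former task switch becomes a load_type choice. A hedged sketch of two of the modes (paths are placeholders):

lidar_dataset = dict(
    type='KittiDataset',
    data_root='data/kitti/',
    ann_file='kitti_infos_train.pkl',
    load_type='frame_based')       # all instances of the frame

mono_dataset = dict(
    type='KittiDataset',
    data_root='data/kitti/',
    ann_file='kitti_infos_train.pkl',
    load_type='fov_image_based',   # only instances inside the default cam
    default_cam_key='CAM2')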
mmdet3d/datasets/lyft_dataset.py  View file @ d7067e44
...
@@ -21,10 +21,10 @@ class LyftDataset(Det3DDataset):
    Args:
        data_root (str): Path of dataset root.
        ann_file (str): Path of annotation file.
-        pipeline (list[dict], optional): Pipeline used for data processing.
-            Defaults to None.
-        modality (dict, optional): Modality to specify the sensor data used
-            as input. Defaults to dict(use_camera=False, use_lidar=True).
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used
+            as input. Defaults to dict(use_camera=False, use_lidar=True).
        box_type_3d (str): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            to its original format then converted them to `box_type_3d`.
...
@@ -33,14 +33,16 @@ class LyftDataset(Det3DDataset):
        - 'LiDAR': Box in LiDAR coordinates.
        - 'Depth': Box in depth coordinates, usually for indoor dataset.
        - 'Camera': Box in camera coordinates.

-        filter_empty_gt (bool, optional): Whether to filter empty GT.
-            Defaults to True.
-        test_mode (bool, optional): Whether the dataset is in test mode.
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
            Defaults to False.
    """
    METAINFO = {
-        'CLASSES':
+        'classes':
        ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle',
         'motorcycle', 'bicycle', 'pedestrian', 'animal')
    }
...
@@ -66,13 +68,13 @@ class LyftDataset(Det3DDataset):
            **kwargs)

    def parse_ann_info(self, info: dict) -> dict:
-        """Get annotation info according to the given index.
+        """Process the `instances` in data info to `ann_info`.

        Args:
            info (dict): Data information of single data sample.

        Returns:
-            dict: annotation information consists of the following keys:
+            dict: Annotation information consists of the following keys:

                - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`):
                  3D ground truth bboxes.
...
...
mmdet3d/datasets/nuscenes_dataset.py  View file @ d7067e44
...
@@ -22,9 +22,8 @@ class NuScenesDataset(Det3DDataset):
    Args:
        data_root (str): Path of dataset root.
        ann_file (str): Path of annotation file.
-        task (str, optional): Detection task. Defaults to 'lidar_det'.
-        pipeline (list[dict], optional): Pipeline used for data processing.
-            Defaults to None.
+        pipeline (list[dict]): Pipeline used for data processing.
+            Defaults to [].
        box_type_3d (str): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            to its original format then converted them to `box_type_3d`.
...
@@ -33,20 +32,31 @@ class NuScenesDataset(Det3DDataset):
        - 'LiDAR': Box in LiDAR coordinates.
        - 'Depth': Box in depth coordinates, usually for indoor dataset.
        - 'Camera': Box in camera coordinates.

-        modality (dict, optional): Modality to specify the sensor data used
-            as input. Defaults to dict(use_camera=False, use_lidar=True).
-        filter_empty_gt (bool, optional): Whether to filter empty GT.
-            Defaults to True.
-        test_mode (bool, optional): Whether the dataset is in test mode.
+        load_type (str): Type of loading mode. Defaults to 'frame_based'.
+
+            - 'frame_based': Load all of the instances in the frame.
+            - 'mv_image_based': Load all of the instances in the frame and
+              need to convert to the FOV-based data type to support
+              image-based detector.
+            - 'fov_image_based': Only load the instances inside the default
+              cam, and need to convert to the FOV-based data type to support
+              image-based detector.
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_camera=False, use_lidar=True).
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
            Defaults to False.
-        with_velocity (bool, optional): Whether to include velocity prediction
+        with_velocity (bool): Whether to include velocity prediction
            into the experiments. Defaults to True.
-        use_valid_flag (bool, optional): Whether to use `use_valid_flag` key
+        use_valid_flag (bool): Whether to use `use_valid_flag` key
            in the info file as mask to filter gt_boxes and gt_names.
            Defaults to False.
    """
    METAINFO = {
-        'CLASSES':
+        'classes':
        ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
         'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'),
        'version':
...
@@ -56,9 +66,9 @@ class NuScenesDataset(Det3DDataset):
    def __init__(self,
                 data_root: str,
                 ann_file: str,
-                 task: str = 'lidar_det',
                 pipeline: List[Union[dict, Callable]] = [],
                 box_type_3d: str = 'LiDAR',
+                 load_type: str = 'frame_based',
                 modality: dict = dict(
                     use_camera=False,
                     use_lidar=True,
...
@@ -72,8 +82,9 @@ class NuScenesDataset(Det3DDataset):
        self.with_velocity = with_velocity

        # TODO: Redesign multi-view data process in the future
-        assert task in ('lidar_det', 'mono_det', 'multi-view_det')
-        self.task = task
+        assert load_type in ('frame_based', 'mv_image_based',
+                             'fov_image_based')
+        self.load_type = load_type

        assert box_type_3d.lower() in ('lidar', 'camera')
        super().__init__(
...
@@ -108,13 +119,13 @@ class NuScenesDataset(Det3DDataset):
        return filtered_annotations

    def parse_ann_info(self, info: dict) -> dict:
-        """Get annotation info according to the given index.
+        """Process the `instances` in data info to `ann_info`.

        Args:
            info (dict): Data information of single data sample.

        Returns:
-            dict: annotation information consists of the following keys:
+            dict: Annotation information consists of the following keys:

                - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`):
                  3D ground truth bboxes.
...
@@ -142,7 +153,7 @@ class NuScenesDataset(Det3DDataset):
            ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
            ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
-            if self.task == 'mono3d':
+            if self.load_type in ['fov_image_based', 'mv_image_based']:
                ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
                ann_info['gt_bboxes_labels'] = np.array(0, dtype=np.int64)
                ann_info['attr_labels'] = np.array(0, dtype=np.int64)
...
@@ -152,7 +163,7 @@ class NuScenesDataset(Det3DDataset):
        # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
        # the same as KITTI (0.5, 0.5, 0)
        # TODO: Unify the coordinates
-        if self.task == 'mono_det':
+        if self.load_type in ['fov_image_based', 'mv_image_based']:
            gt_bboxes_3d = CameraInstance3DBoxes(
                ann_info['gt_bboxes_3d'],
                box_dim=ann_info['gt_bboxes_3d'].shape[-1],
...
@@ -167,7 +178,7 @@ class NuScenesDataset(Det3DDataset):
        return ann_info

-    def parse_data_info(self, info: dict) -> dict:
+    def parse_data_info(self, info: dict) -> Union[List[dict], dict]:
        """Process the raw data info.

        The only difference with it in `Det3DDataset`
...
@@ -177,10 +188,10 @@ class NuScenesDataset(Det3DDataset):
            info (dict): Raw info dict.

        Returns:
-            dict: Has `ann_info` in training stage. And
+            List[dict] or dict: Has `ann_info` in training stage. And
            all path has been converted to absolute path.
        """
-        if self.task == 'mono_det':
+        if self.load_type == 'mv_image_based':
            data_list = []
            if self.modality['use_lidar']:
                info['lidar_points']['lidar_path'] = \
...
...
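With load_type='mv_image_based', parse_data_info fans one raw info out into one item per camera view, which is why its return type widens to Union[List[dict], dict]. A simplified sketch of that fan-out (field names invented):

def fan_out_per_camera(info):
    """Sketch: one multi-view record -> one data item per camera."""
    data_list = []
    for cam_name, cam_info in info['images'].items():
        data_list.append({'cam_name': cam_name,
                          'img_path': cam_info['img_path']})
    return data_list

info = {'images': {'CAM_FRONT': {'img_path': 'front.jpg'},
                   'CAM_BACK': {'img_path': 'back.jpg'}}}
print(fan_out_per_camera(info))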
mmdet3d/datasets/s3dis_dataset.py  View file @ d7067e44

# Copyright (c) OpenMMLab. All rights reserved.
from os import path as osp
-from typing import Callable, List, Optional, Union
+from typing import Any, Callable, List, Optional, Tuple, Union

import numpy as np
...
@@ -8,7 +8,6 @@ from mmdet3d.registry import DATASETS
from mmdet3d.structures import DepthInstance3DBoxes
from .det3d_dataset import Det3DDataset
from .seg3d_dataset import Seg3DDataset
-from .transforms import Compose


@DATASETS.register_module()
...
...
@@ -19,138 +18,132 @@ class S3DISDataset(Det3DDataset):
    often train on 5 of them and test on the remaining one. The one for
    test is Area_5 as suggested in `GSDN <https://arxiv.org/abs/2006.12356>`_.
    To concatenate 5 areas during training
-    `mmdet.datasets.dataset_wrappers.ConcatDataset` should be used.
+    `mmengine.datasets.dataset_wrappers.ConcatDataset` should be used.

    Args:
        data_root (str): Path of dataset root.
        ann_file (str): Path of annotation file.
-        pipeline (list[dict], optional): Pipeline used for data processing.
-            Defaults to None.
-        classes (tuple[str], optional): Classes used in the dataset.
-            Defaults to None.
-        modality (dict, optional): Modality to specify the sensor data used
-            as input. Defaults to None.
-        box_type_3d (str, optional): Type of 3D box of this dataset.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_prefix (dict): Prefix for data. Defaults to
+            dict(pts='points',
+                 pts_instance_mask='instance_mask',
+                 pts_semantic_mask='semantic_mask').
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_camera=False, use_lidar=True).
+        box_type_3d (str): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            to its original format then converted them to `box_type_3d`.
-            Defaults to 'Depth' in this dataset. Available options includes
+            Defaults to 'Depth' in this dataset. Available options includes:

            - 'LiDAR': Box in LiDAR coordinates.
            - 'Depth': Box in depth coordinates, usually for indoor dataset.
            - 'Camera': Box in camera coordinates.

-        filter_empty_gt (bool, optional): Whether to filter empty GT.
-            Defaults to True.
-        test_mode (bool, optional): Whether the dataset is in test mode.
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
            Defaults to False.
    """
-    CLASSES = ('table', 'chair', 'sofa', 'bookcase', 'board')
+    METAINFO = {
+        'classes': ('table', 'chair', 'sofa', 'bookcase', 'board'),
+        # the valid ids of segmentation annotations
+        'seg_valid_class_ids': (7, 8, 9, 10, 11),
+        'seg_all_class_ids': tuple(range(1, 14))  # possibly with 'stair' class
+    }

    def __init__(self,
-                 data_root,
-                 ann_file,
-                 pipeline=None,
-                 classes=None,
-                 modality=None,
-                 box_type_3d='Depth',
-                 filter_empty_gt=True,
-                 test_mode=False,
-                 *kwargs):
+                 data_root: str,
+                 ann_file: str,
+                 metainfo: Optional[dict] = None,
+                 data_prefix: dict = dict(
+                     pts='points',
+                     pts_instance_mask='instance_mask',
+                     pts_semantic_mask='semantic_mask'),
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_camera=False, use_lidar=True),
+                 box_type_3d: str = 'Depth',
+                 filter_empty_gt: bool = True,
+                 test_mode: bool = False,
+                 **kwargs) -> None:
+        # construct seg_label_mapping for semantic mask
+        seg_max_cat_id = len(self.METAINFO['seg_all_class_ids'])
+        seg_valid_cat_ids = self.METAINFO['seg_valid_class_ids']
+        neg_label = len(seg_valid_cat_ids)
+        seg_label_mapping = np.ones(
+            seg_max_cat_id + 1, dtype=np.int) * neg_label
+        for cls_idx, cat_id in enumerate(seg_valid_cat_ids):
+            seg_label_mapping[cat_id] = cls_idx
+        self.seg_label_mapping = seg_label_mapping
+
        super().__init__(
            data_root=data_root,
            ann_file=ann_file,
+            metainfo=metainfo,
+            data_prefix=data_prefix,
            pipeline=pipeline,
-            classes=classes,
            modality=modality,
            box_type_3d=box_type_3d,
            filter_empty_gt=filter_empty_gt,
            test_mode=test_mode,
-            *kwargs)
+            **kwargs)

+        self.metainfo['seg_label_mapping'] = self.seg_label_mapping
        assert 'use_camera' in self.modality and \
            'use_lidar' in self.modality
        assert self.modality['use_camera'] or self.modality['use_lidar']

-    def get_ann_info(self, index):
-        """Get annotation info according to the given index.
+    def parse_data_info(self, info: dict) -> dict:
+        """Process the raw data info.

        Args:
-            index (int): Index of the annotation data to get.
+            info (dict): Raw info dict.

        Returns:
-            dict: annotation information consists of the following keys:
-
-                - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`):
-                  3D ground truth bboxes
-                - gt_labels_3d (np.ndarray): Labels of ground truths.
-                - pts_instance_mask_path (str): Path of instance masks.
-                - pts_semantic_mask_path (str): Path of semantic masks.
+            dict: Has `ann_info` in training stage. And
+            all path has been converted to absolute path.
        """
-        # Use index to get the annos, thus the evalhook could also use this api
-        info = self.data_infos[index]
-        if info['annos']['gt_num'] != 0:
-            gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype(
-                np.float32)  # k, 6
-            gt_labels_3d = info['annos']['class'].astype(np.int64)
-        else:
-            gt_bboxes_3d = np.zeros((0, 6), dtype=np.float32)
-            gt_labels_3d = np.zeros((0, ), dtype=np.int64)
-
-        # to target box structure
-        gt_bboxes_3d = DepthInstance3DBoxes(
-            gt_bboxes_3d,
-            box_dim=gt_bboxes_3d.shape[-1],
-            with_yaw=False,
-            origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
-
-        pts_instance_mask_path = osp.join(self.data_root,
-                                          info['pts_instance_mask_path'])
-        pts_semantic_mask_path = osp.join(self.data_root,
-                                          info['pts_semantic_mask_path'])
-
-        anns_results = dict(
-            gt_bboxes_3d=gt_bboxes_3d,
-            gt_labels_3d=gt_labels_3d,
-            pts_instance_mask_path=pts_instance_mask_path,
-            pts_semantic_mask_path=pts_semantic_mask_path)
-        return anns_results
+        info['pts_instance_mask_path'] = osp.join(
+            self.data_prefix.get('pts_instance_mask', ''),
+            info['pts_instance_mask_path'])
+        info['pts_semantic_mask_path'] = osp.join(
+            self.data_prefix.get('pts_semantic_mask', ''),
+            info['pts_semantic_mask_path'])
+
+        info = super().parse_data_info(info)
+        # only be used in `PointSegClassMapping` in pipeline
+        # to map original semantic class to valid category ids.
+        info['seg_label_mapping'] = self.seg_label_mapping
+        return info

-    def get_data_info(self, index):
-        """Get data info according to the given index.
+    def parse_ann_info(self, info: dict) -> dict:
+        """Process the `instances` in data info to `ann_info`.

        Args:
-            index (int): Index of the sample data to get.
+            info (dict): Info dict.

        Returns:
-            dict: Data information that will be passed to the data
-                preprocessing transforms. It includes the following keys:
-
-                - pts_filename (str): Filename of point clouds.
-                - file_name (str): Filename of point clouds.
-                - ann_info (dict): Annotation info.
+            dict: Processed `ann_info`.
        """
-        info = self.data_infos[index]
-        pts_filename = osp.join(self.data_root, info['pts_path'])
-        input_dict = dict(pts_filename=pts_filename)
-
-        if not self.test_mode:
-            annos = self.get_ann_info(index)
-            input_dict['ann_info'] = annos
-            if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any():
-                return None
-        return input_dict
-
-    def _build_default_pipeline(self):
-        """Build the default pipeline for this dataset."""
-        pipeline = [
-            dict(
-                type='LoadPointsFromFile',
-                coord_type='DEPTH',
-                shift_height=False,
-                load_dim=6,
-                use_dim=[0, 1, 2, 3, 4, 5]),
-            dict(
-                type='DefaultFormatBundle3D',
-                class_names=self.CLASSES,
-                with_label=False),
-            dict(type='Collect3D', keys=['points'])
-        ]
-        return Compose(pipeline)
+        ann_info = super().parse_ann_info(info)
+        # empty gt
+        if ann_info is None:
+            ann_info = dict()
+            ann_info['gt_bboxes_3d'] = np.zeros((0, 6), dtype=np.float32)
+            ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64)
+        # to target box structure
+        ann_info['gt_bboxes_3d'] = DepthInstance3DBoxes(
+            ann_info['gt_bboxes_3d'],
+            box_dim=ann_info['gt_bboxes_3d'].shape[-1],
+            with_yaw=False,
+            origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+        return ann_info


class _S3DISSegDataset(Seg3DDataset):
...
...
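The seg_label_mapping built in the rewritten S3DISDataset.__init__ above maps raw category ids onto contiguous training labels and sends every other id to a trailing ignore bucket. The same logic as a standalone sketch (np.int64 is used here in place of the deprecated np.int that appears in the diff):

import numpy as np

seg_all_class_ids = tuple(range(1, 14))   # all ids in the raw masks
seg_valid_class_ids = (7, 8, 9, 10, 11)   # ids actually trained on
neg_label = len(seg_valid_class_ids)      # every other id -> 5

seg_label_mapping = np.ones(
    len(seg_all_class_ids) + 1, dtype=np.int64) * neg_label
for cls_idx, cat_id in enumerate(seg_valid_class_ids):
    seg_label_mapping[cat_id] = cls_idx

print(seg_label_mapping)  # [5 5 5 5 5 5 5 0 1 2 3 4 5 5]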
@@ -166,30 +159,31 @@ class _S3DISSegDataset(Seg3DDataset):
    wrapper to concat all the provided data in different areas.

    Args:
-        data_root (str): Path of dataset root.
-        ann_file (str): Path of annotation file.
-        pipeline (list[dict], optional): Pipeline used for data processing.
-            Defaults to None.
-        classes (tuple[str], optional): Classes used in the dataset.
-            Defaults to None.
-        palette (list[list[int]], optional): The palette of segmentation map.
-            Defaults to None.
-        modality (dict, optional): Modality to specify the sensor data used
-            as input. Defaults to None.
-        test_mode (bool, optional): Whether the dataset is in test mode.
-            Defaults to False.
+        data_root (str, optional): Path of dataset root, Defaults to None.
+        ann_file (str): Path of annotation file. Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_prefix (dict): Prefix for training data. Defaults to
+            dict(pts='points', pts_instance_mask='', pts_semantic_mask='').
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_lidar=True, use_camera=False).
        ignore_index (int, optional): The label index to be ignored, e.g.
-            unannotated points. If None is given, set to len(self.CLASSES).
+            unannotated points. If None is given, set to len(self.classes) to
+            be consistent with PointSegClassMapping function in pipeline.
            Defaults to None.
-        scene_idxs (np.ndarray | str, optional): Precomputed index to load
+        scene_idxs (np.ndarray or str, optional): Precomputed index to load
            data. For scenes with many points, we may sample it several times.
            Defaults to None.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
    """
    METAINFO = {
-        'CLASSES': ('ceiling', 'floor', 'wall', 'beam', 'column', 'window',
-                    'door', 'table', 'chair', 'sofa', 'bookcase', 'board',
-                    'clutter'),
-        'PALETTE': [[0, 255, 0], [0, 0, 255], [0, 255, 255], [255, 255, 0],
+        'classes': ('ceiling', 'floor', 'wall', 'beam', 'column', 'window',
+                    'door', 'table', 'chair', 'sofa', 'bookcase', 'board',
+                    'clutter'),
+        'palette': [[0, 255, 0], [0, 0, 255], [0, 255, 255], [255, 255, 0],
                    [255, 0, 255], [100, 100, 255], [200, 200, 100],
                    [170, 120, 200], [255, 0, 0], [200, 100, 100],
                    [10, 200, 100], [200, 200, 200], [50, 50, 50]],
...
...
@@ -204,12 +198,12 @@ class _S3DISSegDataset(Seg3DDataset):
                 ann_file: str = '',
                 metainfo: Optional[dict] = None,
                 data_prefix: dict = dict(
-                     pts='points',
-                     img='',
-                     instance_mask='',
-                     semantic_mask=''),
+                     pts='points',
+                     pts_instance_mask='',
+                     pts_semantic_mask=''),
                 pipeline: List[Union[dict, Callable]] = [],
                 modality: dict = dict(use_lidar=True, use_camera=False),
-                 ignore_index=None,
-                 scene_idxs=None,
-                 test_mode=False,
+                 ignore_index: Optional[int] = None,
+                 scene_idxs: Optional[Union[np.ndarray, str]] = None,
+                 test_mode: bool = False,
                 **kwargs) -> None:
        super().__init__(
            data_root=data_root,
...
...
@@ -223,7 +217,8 @@ class _S3DISSegDataset(Seg3DDataset):
            test_mode=test_mode,
            **kwargs)

-    def get_scene_idxs(self, scene_idxs):
+    def get_scene_idxs(self, scene_idxs: Union[np.ndarray, str,
+                                               None]) -> np.ndarray:
        """Compute scene_idxs for data sampling.

        We sample more times for scenes with more points.
...
...
@@ -250,37 +245,40 @@ class S3DISSegDataset(_S3DISSegDataset):
    data downloading.

    Args:
-        data_root (str): Path of dataset root.
-        ann_files (list[str]): Path of several annotation files.
-        pipeline (list[dict], optional): Pipeline used for data processing.
-            Defaults to None.
-        classes (tuple[str], optional): Classes used in the dataset.
-            Defaults to None.
-        palette (list[list[int]], optional): The palette of segmentation map.
-            Defaults to None.
-        modality (dict, optional): Modality to specify the sensor data used
-            as input. Defaults to None.
-        test_mode (bool, optional): Whether the dataset is in test mode.
-            Defaults to False.
+        data_root (str, optional): Path of dataset root. Defaults to None.
+        ann_files (List[str]): Path of several annotation files.
+            Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_prefix (dict): Prefix for training data. Defaults to
+            dict(pts='points', pts_instance_mask='', pts_semantic_mask='').
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_lidar=True, use_camera=False).
        ignore_index (int, optional): The label index to be ignored, e.g.
-            unannotated points. If None is given, set to len(self.CLASSES).
+            unannotated points. If None is given, set to len(self.classes) to
+            be consistent with PointSegClassMapping function in pipeline.
            Defaults to None.
-        scene_idxs (list[np.ndarray] | list[str], optional): Precomputed index
-            to load data. For scenes with many points, we may sample it several
-            times. Defaults to None.
+        scene_idxs (List[np.ndarray] | List[str], optional): Precomputed index
+            to load data. For scenes with many points, we may sample it
+            several times. Defaults to None.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
    """

    def __init__(self,
                 data_root: Optional[str] = None,
-                 ann_files: str = '',
+                 ann_files: List[str] = '',
                 metainfo: Optional[dict] = None,
                 data_prefix: dict = dict(
-                     pts='points',
-                     img='',
-                     instance_mask='',
-                     semantic_mask=''),
+                     pts='points',
+                     pts_instance_mask='',
+                     pts_semantic_mask=''),
                 pipeline: List[Union[dict, Callable]] = [],
                 modality: dict = dict(use_lidar=True, use_camera=False),
-                 ignore_index=None,
-                 scene_idxs=None,
-                 test_mode=False,
+                 ignore_index: Optional[int] = None,
+                 scene_idxs: Optional[Union[List[np.ndarray],
+                                            List[str]]] = None,
+                 test_mode: bool = False,
                 **kwargs) -> None:

        # make sure that ann_files and scene_idxs have same length
...
...
@@ -298,7 +296,6 @@ class S3DISSegDataset(_S3DISSegDataset):
            ignore_index=ignore_index,
            scene_idxs=scene_idxs[0],
            test_mode=test_mode,
-            serialize_data=False,
            **kwargs)

        datasets = [
...
...
@@ -312,56 +309,44 @@ class S3DISSegDataset(_S3DISSegDataset):
                ignore_index=ignore_index,
                scene_idxs=scene_idxs[i],
                test_mode=test_mode,
-                serialize_data=False,
                **kwargs) for i in range(len(ann_files))
        ]

        # data_list and scene_idxs need to be concat
        self.concat_data_list([dst.data_list for dst in datasets])
-        self.concat_scene_idxs([dst.scene_idxs for dst in datasets])

        # set group flag for the sampler
        if not self.test_mode:
            self._set_group_flag()

-    def concat_data_list(self, data_lists):
+    def concat_data_list(self, data_lists: List[List[dict]]) -> None:
        """Concat data_list from several datasets to form self.data_list.

        Args:
-            data_lists (list[list[dict]])
+            data_lists (List[List[dict]]): List of dict containing
+                annotation information.
        """
        self.data_list = [
            data for data_list in data_lists for data in data_list
        ]

-    def concat_scene_idxs(self, scene_idxs):
-        """Concat scene_idxs from several datasets to form self.scene_idxs.
-
-        Needs to manually add offset to scene_idxs[1, 2, ...].
-
-        Args:
-            scene_idxs (list[np.ndarray])
-        """
-        self.scene_idxs = np.array([], dtype=np.int32)
-        offset = 0
-        for one_scene_idxs in scene_idxs:
-            self.scene_idxs = np.concatenate(
-                [self.scene_idxs, one_scene_idxs + offset]).astype(np.int32)
-            offset = np.unique(self.scene_idxs).max() + 1
-
    @staticmethod
-    def _duplicate_to_list(x, num):
+    def _duplicate_to_list(x: Any, num: int) -> list:
        """Repeat x `num` times to form a list."""
        return [x for _ in range(num)]

-    def _check_ann_files(self, ann_file):
+    def _check_ann_files(
+            self, ann_file: Union[List[str], Tuple[str], str]) -> List[str]:
        """Make ann_files as list/tuple."""
        # ann_file could be str
        if not isinstance(ann_file, (list, tuple)):
            ann_file = self._duplicate_to_list(ann_file, 1)
        return ann_file

-    def _check_scene_idxs(self, scene_idx, num):
+    def _check_scene_idxs(self, scene_idx: Union[str, List[Union[
+            list, tuple, np.ndarray]], List[str], None],
+                          num: int) -> List[np.ndarray]:
        """Make scene_idxs as list/tuple."""
        if scene_idx is None:
            return self._duplicate_to_list(scene_idx, num)
...
...
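Usage-wise, S3DISSegDataset still takes one annotation file per area and concatenates the resulting data lists. A hypothetical config fragment modelled on the usual S3DIS setup; the exact file names depend on your data preparation:

train_areas = [1, 2, 3, 4, 6]  # hold out Area_5 for testing
s3dis_train = dict(
    type='S3DISSegDataset',
    data_root='data/s3dis/',
    ann_files=[f's3dis_infos_Area_{i}.pkl' for i in train_areas],
    scene_idxs=[f'seg_info/Area_{i}_resampled_scene_idxs.npy'
                for i in train_areas])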
mmdet3d/datasets/scannet_dataset.py  View file @ d7067e44
...
@@ -26,13 +26,13 @@ class ScanNetDataset(Det3DDataset):
        metainfo (dict, optional): Meta information for dataset, such as class
            information. Defaults to None.
        data_prefix (dict): Prefix for data. Defaults to
-            `dict(pts='points',
-            pts_isntance_mask='instance_mask',
-            pts_semantic_mask='semantic_mask')`.
-        pipeline (list[dict]): Pipeline used for data processing.
-            Defaults to None.
-        modality (dict): Modality to specify the sensor data used as input.
-            Defaults to None.
+            dict(pts='points',
+                 pts_instance_mask='instance_mask',
+                 pts_semantic_mask='semantic_mask').
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_camera=False, use_lidar=True).
        box_type_3d (str): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            to its original format then converted them to `box_type_3d`.
...
@@ -41,13 +41,15 @@ class ScanNetDataset(Det3DDataset):
        - 'LiDAR': Box in LiDAR coordinates.
        - 'Depth': Box in depth coordinates, usually for indoor dataset.
        - 'Camera': Box in camera coordinates.

-        filter_empty_gt (bool): Whether to filter empty GT.
-            Defaults to True.
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
        test_mode (bool): Whether the dataset is in test mode.
            Defaults to False.
    """
    METAINFO = {
-        'CLASSES':
+        'classes':
        ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
         'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator',
         'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin'),
...
@@ -71,7 +73,7 @@ class ScanNetDataset(Det3DDataset):
                 box_type_3d: str = 'Depth',
                 filter_empty_gt: bool = True,
                 test_mode: bool = False,
-                 **kwargs):
+                 **kwargs) -> None:
        # construct seg_label_mapping for semantic mask
        seg_max_cat_id = len(self.METAINFO['seg_all_class_ids'])
...
@@ -128,8 +130,8 @@ class ScanNetDataset(Det3DDataset):
            info (dict): Raw info dict.

        Returns:
-            dict: Data information that will be passed to the data
-                preprocessing transforms. It includes the following keys:
+            dict: Has `ann_info` in training stage. And
+            all path has been converted to absolute path.
        """
        info['axis_align_matrix'] = self._get_axis_align_matrix(info)
        info['pts_instance_mask_path'] = osp.join(
...
@@ -146,13 +148,13 @@ class ScanNetDataset(Det3DDataset):
        return info

    def parse_ann_info(self, info: dict) -> dict:
-        """Process the `instances` in data info to `ann_info`
+        """Process the `instances` in data info to `ann_info`.

        Args:
            info (dict): Info dict.

        Returns:
-            dict: Processed `ann_info`
+            dict: Processed `ann_info`.
        """
        ann_info = super().parse_ann_info(info)
        # empty gt
...
@@ -181,32 +183,36 @@ class ScanNetSegDataset(Seg3DDataset):
    for data downloading.

    Args:
-        data_root (str): Path of dataset root.
-        ann_file (str): Path of annotation file.
-        pipeline (list[dict], optional): Pipeline used for data processing.
-            Defaults to None.
-        classes (tuple[str], optional): Classes used in the dataset.
-            Defaults to None.
-        palette (list[list[int]], optional): The palette of segmentation map.
-            Defaults to None.
-        modality (dict, optional): Modality to specify the sensor data used
-            as input. Defaults to None.
-        test_mode (bool, optional): Whether the dataset is in test mode.
-            Defaults to False.
+        data_root (str, optional): Path of dataset root. Defaults to None.
+        ann_file (str): Path of annotation file. Defaults to ''.
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_prefix (dict): Prefix for training data. Defaults to
+            dict(pts='points',
+                 img='',
+                 pts_instance_mask='',
+                 pts_semantic_mask='').
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_lidar=True, use_camera=False).
        ignore_index (int, optional): The label index to be ignored, e.g.
-            unannotated points. If None is given, set to len(self.CLASSES).
+            unannotated points. If None is given, set to len(self.classes) to
+            be consistent with PointSegClassMapping function in pipeline.
            Defaults to None.
-        scene_idxs (np.ndarray | str, optional): Precomputed index to load
+        scene_idxs (np.ndarray or str, optional): Precomputed index to load
            data. For scenes with many points, we may sample it several times.
            Defaults to None.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
    """
    METAINFO = {
-        'CLASSES':
+        'classes':
        ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door',
         'window', 'bookshelf', 'picture', 'counter', 'desk', 'curtain',
         'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
         'otherfurniture'),
-        'PALETTE': [
+        'palette': [
            [174, 199, 232],
            [152, 223, 138],
            [31, 119, 180],
...
@@ -239,12 +245,15 @@ class ScanNetSegDataset(Seg3DDataset):
                 ann_file: str = '',
                 metainfo: Optional[dict] = None,
                 data_prefix: dict = dict(
-                     pts='points',
-                     img='',
-                     instance_mask='',
-                     semantic_mask=''),
+                     pts='points',
+                     img='',
+                     pts_instance_mask='',
+                     pts_semantic_mask=''),
                 pipeline: List[Union[dict, Callable]] = [],
                 modality: dict = dict(use_lidar=True, use_camera=False),
-                 ignore_index=None,
-                 scene_idxs=None,
-                 test_mode=False,
+                 ignore_index: Optional[int] = None,
+                 scene_idxs: Optional[Union[np.ndarray, str]] = None,
+                 test_mode: bool = False,
                 **kwargs) -> None:
        super().__init__(
            data_root=data_root,
...
@@ -258,7 +267,8 @@ class ScanNetSegDataset(Seg3DDataset):
            test_mode=test_mode,
            **kwargs)

-    def get_scene_idxs(self, scene_idxs):
+    def get_scene_idxs(self, scene_idxs: Union[np.ndarray, str,
+                                               None]) -> np.ndarray:
        """Compute scene_idxs for data sampling.

        We sample more times for scenes with more points.
...
@@ -275,11 +285,11 @@ class ScanNetSegDataset(Seg3DDataset):
class ScanNetInstanceSegDataset(Seg3DDataset):
    METAINFO = {
-        'CLASSES':
+        'classes':
        ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
         'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator',
         'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin'),
-        'PLATTE': [
+        'palette': [
            [174, 199, 232],
            [152, 223, 138],
            [31, 119, 180],
...
@@ -312,13 +322,16 @@ class ScanNetInstanceSegDataset(Seg3DDataset):
                 ann_file: str = '',
                 metainfo: Optional[dict] = None,
                 data_prefix: dict = dict(
-                     pts='points',
-                     img='',
-                     instance_mask='',
-                     semantic_mask=''),
+                     pts='points',
+                     img='',
+                     pts_instance_mask='',
+                     pts_semantic_mask=''),
                 pipeline: List[Union[dict, Callable]] = [],
                 modality: dict = dict(use_lidar=True, use_camera=False),
-                 test_mode=False,
-                 ignore_index=None,
-                 scene_idxs=None,
-                 file_client_args=dict(backend='disk'),
+                 test_mode: bool = False,
+                 ignore_index: Optional[int] = None,
+                 scene_idxs: Optional[Union[np.ndarray, str]] = None,
+                 file_client_args: dict = dict(backend='disk'),
                 **kwargs) -> None:
        super().__init__(
            data_root=data_root,
...
...
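Because the metainfo keys are now lowercase, selecting a class subset from a config changes accordingly. A hypothetical fragment (paths are placeholders; with this commit the dataset looks up the lowercase 'classes' key in metainfo):

scannet_subset = dict(
    type='ScanNetDataset',
    data_root='data/scannet/',
    ann_file='scannet_infos_train.pkl',
    metainfo=dict(classes=('chair', 'table', 'sofa')))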
mmdet3d/datasets/seg3d_dataset.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
from
os
import
path
as
osp
from
typing
import
Callable
,
Dict
,
List
,
Optional
,
Sequence
,
Union
from
typing
import
Callable
,
List
,
Optional
,
Sequence
,
Union
import
mmengine
import
numpy
as
np
...
...
@@ -16,40 +16,45 @@ class Seg3DDataset(BaseDataset):
This is the base dataset of ScanNet, S3DIS and SemanticKITTI dataset.
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
data_root (str, optional): Path of dataset root. Defaults to None.
ann_file (str): Path of annotation file. Defaults to ''.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
data_prefix (dict, optional): Prefix for training data. Defaults to
dict(pts='velodyne', img='', instance_mask='', semantic_mask='').
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input, it usually has following keys.
data_prefix (dict): Prefix for training data. Defaults to
dict(pts='points',
img='',
pts_instance_mask='',
pts_semantic_mask='').
pipeline (List[dict]): Pipeline used for data processing.
Defaults to [].
modality (dict): Modality to specify the sensor data used
as input, it usually has following keys:
- use_camera: bool
- use_lidar: bool
Defaults to `dict(use_lidar=True, use_camera=False)`
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
Defaults to dict(use_lidar=True, use_camera=False).
ignore_index (int, optional): The label index to be ignored, e.g.
unannotated points. If None is given, set to len(self.
CLASSES
) to
unannotated points. If None is given, set to len(self.
classes
) to
be consistent with PointSegClassMapping function in pipeline.
Defaults to None.
scene_idxs (np.ndarray
|
str, optional): Precomputed index to load
scene_idxs (np.ndarray
or
str, optional): Precomputed index to load
data. For scenes with many points, we may sample it several times.
Defaults to None.
load_eval_anns (bool): Whether to load annotations
in test_mode, the annotation will be save in
`eval_ann_infos`, which can be use in Evaluator.
test_mode (bool): Whether the dataset is in test mode.
Defaults to False.
serialize_data (bool): Whether to hold memory using serialized objects,
when enabled, data loader workers can use shared RAM from master
process instead of making a copy.
Defaults to False for 3D Segmentation datasets.
load_eval_anns (bool): Whether to load annotations in test_mode,
the annotation will be save in `eval_ann_infos`, which can be used
in Evaluator. Defaults to True.
file_client_args (dict): Configuration of file client.
Defaults to
`
dict(backend='disk')
`
.
Defaults to dict(backend='disk').
"""
METAINFO
=
{
'
CLASSES
'
:
None
,
# names of all classes data used for the task
'
PALETTE
'
:
None
,
# official color for visualization
'
classes
'
:
None
,
# names of all classes data used for the task
'
palette
'
:
None
,
# official color for visualization
'seg_valid_class_ids'
:
None
,
# class_ids used for training
'seg_all_class_ids'
:
None
,
# all possible class_ids in loaded seg mask
}
...
...
@@ -62,12 +67,13 @@ class Seg3DDataset(BaseDataset):
pts
=
'points'
,
img
=
''
,
pts_instance_mask
=
''
,
pts_emantic_mask
=
''
),
pts_
s
emantic_mask
=
''
),
pipeline
:
List
[
Union
[
dict
,
Callable
]]
=
[],
modality
:
dict
=
dict
(
use_lidar
=
True
,
use_camera
=
False
),
ignore_index
:
Optional
[
int
]
=
None
,
scene_idxs
:
Optional
[
str
]
=
None
,
scene_idxs
:
Optional
[
Union
[
str
,
np
.
ndarray
]
]
=
None
,
test_mode
:
bool
=
False
,
serialize_data
:
bool
=
False
,
load_eval_anns
:
bool
=
True
,
file_client_args
:
dict
=
dict
(
backend
=
'disk'
),
**
kwargs
)
->
None
:
...
...
@@ -78,11 +84,11 @@ class Seg3DDataset(BaseDataset):
# TODO: We maintain the ignore_index attributes,
# but we may consider to remove it in the future.
self
.
ignore_index
=
len
(
self
.
METAINFO
[
'
CLASSES
'
])
if
\
self
.
ignore_index
=
len
(
self
.
METAINFO
[
'
classes
'
])
if
\
ignore_index
is
None
else
ignore_index
# Get label mapping for custom classes
new_classes
=
metainfo
.
get
(
'
CLASSES
'
,
None
)
new_classes
=
metainfo
.
get
(
'
classes
'
,
None
)
self
.
label_mapping
,
self
.
label2cat
,
seg_valid_class_ids
=
\
self
.
get_label_mapping
(
new_classes
)
...
...
@@ -95,10 +101,10 @@ class Seg3DDataset(BaseDataset):
# generate palette if it is not defined based on
# label mapping, otherwise directly use palette
# defined in dataset config.
palette
=
metainfo
.
get
(
'
PALETTE
'
,
None
)
palette
=
metainfo
.
get
(
'
palette
'
,
None
)
updated_palette
=
self
.
_update_palette
(
new_classes
,
palette
)
metainfo
[
'
PALETTE
'
]
=
updated_palette
metainfo
[
'
palette
'
]
=
updated_palette
# construct seg_label_mapping for semantic mask
seg_max_cat_id
=
len
(
self
.
METAINFO
[
'seg_all_class_ids'
])
...
...
@@ -117,18 +123,19 @@ class Seg3DDataset(BaseDataset):
data_prefix
=
data_prefix
,
pipeline
=
pipeline
,
test_mode
=
test_mode
,
serialize_data
=
serialize_data
,
**
kwargs
)
self
.
metainfo
[
'seg_label_mapping'
]
=
self
.
seg_label_mapping
self
.
scene_idxs
=
self
.
get_scene_idxs
(
scene_idxs
)
self
.
data_list
=
[
self
.
data_list
[
i
]
for
i
in
self
.
scene_idxs
]
# set group flag for the sampler
if
not
self
.
test_mode
:
self
.
_set_group_flag
()
def
get_label_mapping
(
self
,
new_classes
:
Optional
[
Sequence
]
=
None
)
->
Union
[
Dict
,
None
]:
new_classes
:
Optional
[
Sequence
]
=
None
)
->
tuple
:
"""Get label mapping.
The ``label_mapping`` is a dictionary, its keys are the old label ids
...
...
@@ -138,21 +145,20 @@ class Seg3DDataset(BaseDataset):
None, `label_mapping` is not None.
Args:
new_classes (list, tuple, optional): The new classes name from
metainfo. Default to None.
new_classes (list or tuple, optional): The new classes name from
metainfo. Defaults to None.
Returns:
tuple: The mapping from old classes in cls.METAINFO to
new classes in metainfo
"""
-        old_classes = self.METAINFO.get('CLASSES', None)
+        old_classes = self.METAINFO.get('classes', None)
        if (new_classes is not None and old_classes is not None
                and list(new_classes) != list(old_classes)):
            if not set(new_classes).issubset(old_classes):
                raise ValueError(f'new classes {new_classes} is not a '
-                                 f'subset of CLASSES {old_classes} in METAINFO.')
+                                 f'subset of classes {old_classes} in METAINFO.')
            # obtain true id from valid_class_ids
            valid_class_ids = [
...
...
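To make the mapping concrete, here is a small sketch of the old-id to new-id logic the surrounding hunk implements; the toy class tuples are assumptions, and the real method also routes through seg_valid_class_ids:

# Sketch: old label ids map to consecutive new ids; dropped classes
# map to ignore_index (toy inputs, not the actual METAINFO).
old_classes = ('wall', 'floor', 'chair', 'table')
new_classes = ('floor', 'chair')
ignore_index = len(old_classes)

label_mapping = {
    old_id: (new_classes.index(name) if name in new_classes
             else ignore_index)
    for old_id, name in enumerate(old_classes)
}
label2cat = {i: name for i, name in enumerate(new_classes)}
print(label_mapping)  # {0: 4, 1: 0, 2: 1, 3: 4}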
@@ -180,13 +186,14 @@ class Seg3DDataset(BaseDataset):
# map label to category name
        label2cat = {
            i: cat_name
-            for i, cat_name in enumerate(self.METAINFO['CLASSES'])
+            for i, cat_name in enumerate(self.METAINFO['classes'])
        }
        valid_class_ids = self.METAINFO['seg_valid_class_ids']
        return label_mapping, label2cat, valid_class_ids

-    def _update_palette(self, new_classes, palette) -> list:
+    def _update_palette(self, new_classes: list,
+                        palette: Union[None, list]) -> list:
"""Update palette according to metainfo.
If length of palette is equal to classes, just return the palette.
...
...
@@ -199,10 +206,10 @@ class Seg3DDataset(BaseDataset):
"""
        if palette is None:
-            # If palette is not defined, it generate a palette according
-            # to the original PALETTE and classes.
-            old_classes = self.METAINFO.get('CLASSES', None)
+            # If palette is not defined, it generate a palette according
+            # to the original palette and classes.
+            old_classes = self.METAINFO.get('classes', None)
            palette = [
-                self.METAINFO['PALETTE'][old_classes.index(cls_name)]
+                self.METAINFO['palette'][old_classes.index(cls_name)]
                for cls_name in new_classes
            ]
            return palette
...
...
@@ -211,8 +218,8 @@ class Seg3DDataset(BaseDataset):
        if len(palette) == len(new_classes):
            return palette
        else:
-            raise ValueError('Once PLATTE in set in metainfo, it should'
-                             'match CLASSES in metainfo')
+            raise ValueError('Once palette in set in metainfo, it should'
+                             'match classes in metainfo')
    def parse_data_info(self, info: dict) -> dict:
"""Process the raw data info.
...
...
@@ -260,7 +267,8 @@ class Seg3DDataset(BaseDataset):
        return info

-    def get_scene_idxs(self, scene_idxs):
+    def get_scene_idxs(self, scene_idxs: Union[None, str,
+                                               np.ndarray]) -> np.ndarray:
"""Compute scene_idxs for data sampling.
We sample more times for scenes with more points.
...
...
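The idea behind get_scene_idxs can be sketched as sampling scene indices in proportion to point counts; the numbers below are made up:

import numpy as np

# Sketch: scenes with more points are drawn more often per epoch.
num_points_per_scene = np.array([10000, 40000, 50000])
sample_prob = num_points_per_scene / num_points_per_scene.sum()
num_iters = 10  # assumed epoch length
scene_idxs = np.random.choice(
    len(num_points_per_scene), size=num_iters, p=sample_prob)
print(scene_idxs.astype(np.int32))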
@@ -282,7 +290,7 @@ class Seg3DDataset(BaseDataset):
        return scene_idxs.astype(np.int32)

-    def _set_group_flag(self):
+    def _set_group_flag(self) -> None:
"""Set flag according to image aspect ratio.
Images with aspect ratio greater than 1 will be set as group 1,
...
...
mmdet3d/datasets/semantickitti_dataset.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+
from mmdet3d.registry import DATASETS
from .seg3d_dataset import Seg3DDataset
...
...
@@ -14,30 +16,35 @@ class SemanticKITTIDataset(Seg3DDataset):
for data downloading
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
pipeline (list[dict], optional): Pipeline used for data processing.
data_root (str, optional): Path of dataset root. Defaults to None.
ann_file (str): Path of annotation file. Defaults to ''.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
data_prefix (dict): Prefix for training data. Defaults to
dict(pts='points',
img='',
pts_instance_mask='',
pts_semantic_mask='').
pipeline (List[dict]): Pipeline used for data processing.
Defaults to [].
modality (dict): Modality to specify the sensor data used as input,
it usually has following keys:
- use_camera: bool
- use_lidar: bool
Defaults to dict(use_lidar=True, use_camera=False).
ignore_index (int, optional): The label index to be ignored, e.g.
unannotated points. If None is given, set to len(self.classes) to
be consistent with PointSegClassMapping function in pipeline.
Defaults to None.
classes (tuple[str], optional): Classes used in the dataset.
scene_idxs (np.ndarray or str, optional): Precomputed index to load
data. For scenes with many points, we may sample it several times.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
box_type_3d (str, optional): NO 3D box for this dataset.
You can choose any type
Based on the `box_type_3d`, the dataset will encapsulate the box
to its original format then converted them to `box_type_3d`.
Defaults to 'LiDAR' in this dataset. Available options includes
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
test_mode (bool): Whether the dataset is in test mode.
Defaults to False.
"""
    METAINFO = {
-        'CLASSES': ('unlabeled', 'car', 'bicycle', 'motorcycle', 'truck',
+        'classes': ('unlabeled', 'car', 'bicycle', 'motorcycle', 'truck',
                    'bus', 'person', 'bicyclist', 'motorcyclist', 'road',
                    'parking', 'sidewalk', 'other-ground', 'building', 'fence',
                    'vegetation', 'trunck', 'terrian', 'pole', 'traffic-sign'),
...
...
@@ -52,12 +59,15 @@ class SemanticKITTIDataset(Seg3DDataset):
                 ann_file: str = '',
                 metainfo: Optional[dict] = None,
                 data_prefix: dict = dict(
-                     pts='points',
-                     img='',
-                     instance_mask='',
-                     semantic_mask=''),
+                     pts='points',
+                     img='',
+                     pts_instance_mask='',
+                     pts_semantic_mask=''),
                 pipeline: List[Union[dict, Callable]] = [],
                 modality: dict = dict(use_lidar=True, use_camera=False),
-                 ignore_index=None,
-                 scene_idxs=None,
-                 test_mode=False,
+                 ignore_index: Optional[int] = None,
+                 scene_idxs: Optional[Union[str, np.ndarray]] = None,
+                 test_mode: bool = False,
                 **kwargs) -> None:
        super().__init__(
...
...
mmdet3d/datasets/sunrgbd_dataset.py
View file @
d7067e44
...
...
@@ -24,13 +24,13 @@ class SUNRGBDDataset(Det3DDataset):
ann_file (str): Path of annotation file.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
-        data_prefix (dict, optiona;): Prefix for data. Defaults to
+        data_prefix (dict): Prefix for data. Defaults to
            dict(pts='points',img='sunrgbd_trainval').
-        pipeline (list[dict], optional): Pipeline used for data processing.
-            Defaults to None.
-        modality (dict, optional): Modality to specify the sensor data used
-            as input. Defaults to dict(use_camera=True, use_lidar=True).
-        default_cam_key (str, optional): The default camera name adopted.
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used
+            as input. Defaults to dict(use_camera=True, use_lidar=True).
+        default_cam_key (str): The default camera name adopted.
Defaults to 'CAM0'.
box_type_3d (str): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
...
...
@@ -40,13 +40,13 @@ class SUNRGBDDataset(Det3DDataset):
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
-        filter_empty_gt (bool, optional): Whether to filter empty GT.
+        filter_empty_gt (bool): Whether to filter empty GT.
            Defaults to True.
-        test_mode (bool, optional): Whether the dataset is in test mode.
+        test_mode (bool): Whether the dataset is in test mode.
Defaults to False.
"""
    METAINFO = {
-        'CLASSES': ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk',
+        'classes': ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk',
                    'dresser', 'night_stand', 'bookshelf', 'bathtub')
    }
...
...
@@ -58,11 +58,11 @@ class SUNRGBDDataset(Det3DDataset):
                 pts='points', img='sunrgbd_trainval/image'),
                 pipeline: List[Union[dict, Callable]] = [],
                 default_cam_key: str = 'CAM0',
-                modality=dict(use_camera=True, use_lidar=True),
+                modality: dict = dict(use_camera=True, use_lidar=True),
                 box_type_3d: str = 'Depth',
                 filter_empty_gt: bool = True,
                 test_mode: bool = False,
-                **kwargs):
+                **kwargs) -> None:
        super().__init__(
            data_root=data_root,
            ann_file=ann_file,
...
...
@@ -121,7 +121,7 @@ class SUNRGBDDataset(Det3DDataset):
        return info

    def parse_ann_info(self, info: dict) -> dict:
-        """Process the `instances` in data info to `ann_info`
+        """Process the `instances` in data info to `ann_info`.
Args:
info (dict): Info dict.
...
...
mmdet3d/datasets/transforms/__init__.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
-from .compose import Compose
from .dbsampler import DataBaseSampler
from .formating import Pack3DDetInputs
from .loading import (LoadAnnotations3D, LoadImageFromFileMono3D,
...
...
@@ -21,15 +20,13 @@ from .transforms_3d import (AffineResize, BackgroundPointsFilter,
__all__ = [
    'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
    'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter',
-    'Pack3DDetInputs', 'Compose', 'LoadMultiViewImageFromFiles',
-    'LoadPointsFromFile', 'DataBaseSampler', 'NormalizePointsColor',
-    'LoadAnnotations3D', 'IndoorPointSample', 'PointSample',
-    'PointSegClassMapping', 'MultiScaleFlipAug3D', 'LoadPointsFromMultiSweeps',
-    'BackgroundPointsFilter', 'VoxelBasedPointSampler', 'GlobalAlignment',
-    'IndoorPatchPointSample', 'LoadImageFromFileMono3D', 'ObjectNameFilter',
-    'RandomDropPointsColor', 'RandomJitterPoints', 'AffineResize',
-    'RandomShiftScale', 'LoadPointsFromDict', 'Resize3D', 'RandomResize3D',
+    'Pack3DDetInputs', 'LoadMultiViewImageFromFiles', 'LoadPointsFromFile',
+    'DataBaseSampler', 'NormalizePointsColor', 'LoadAnnotations3D',
+    'IndoorPointSample', 'PointSample', 'PointSegClassMapping',
+    'MultiScaleFlipAug3D', 'LoadPointsFromMultiSweeps',
+    'BackgroundPointsFilter', 'VoxelBasedPointSampler', 'GlobalAlignment',
+    'IndoorPatchPointSample', 'LoadImageFromFileMono3D', 'ObjectNameFilter',
+    'RandomDropPointsColor', 'RandomJitterPoints', 'AffineResize',
+    'RandomShiftScale', 'LoadPointsFromDict', 'Resize3D', 'RandomResize3D',
+    'MultiViewWrapper', 'PhotoMetricDistortion3D'
]
mmdet3d/datasets/transforms/compose.py deleted 100644 → 0
View file @ 28fe73d2
# Copyright (c) OpenMMLab. All rights reserved.
import collections

from mmdet3d.registry import TRANSFORMS


@TRANSFORMS.register_module()
class Compose:
    """Compose multiple transforms sequentially.

    Args:
        transforms (Sequence[dict | callable]): Sequence of transform object or
            config dict to be composed.
    """

    def __init__(self, transforms):
        assert isinstance(transforms, collections.abc.Sequence)
        self.transforms = []
        for transform in transforms:
            if isinstance(transform, dict):
                transform = TRANSFORMS.build(transform)
                self.transforms.append(transform)
            elif callable(transform):
                self.transforms.append(transform)
            else:
                raise TypeError('transform must be callable or a dict')

    def __call__(self, data):
        """Call function to apply transforms sequentially.

        Args:
            data (dict): A result dict contains the data to transform.

        Returns:
            dict: Transformed data.
        """
        for t in self.transforms:
            data = t(data)
            if data is None:
                return None
        return data

    def __repr__(self):
        format_string = self.__class__.__name__ + '('
        for t in self.transforms:
            str_ = t.__repr__()
            if 'Compose(' in str_:
                str_ = str_.replace('\n', '\n    ')
            format_string += '\n'
            format_string += f'    {str_}'
        format_string += '\n)'
        return format_string
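With this local class removed, the equivalent Compose now comes from mmcv (see the transforms_3d.py import hunk below). A usage sketch, assuming the transform configs are resolvable in the active registry scope:

from mmcv.transforms import Compose

# Sketch: mmcv's Compose builds dict configs via the registry and
# applies them in order, like the deleted class above.
pipeline = Compose([
    dict(type='LoadPointsFromFile', coord_type='LIDAR',
         load_dim=4, use_dim=4),
    dict(type='PointShuffle'),
])
# results = pipeline(dict(lidar_points=dict(lidar_path='demo.bin')))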
mmdet3d/datasets/transforms/dbsampler.py
View file @
d7067e44
...
...
@@ -18,9 +18,8 @@ class BatchSampler:
sample_list (list[dict]): List of samples.
name (str, optional): The category of samples. Defaults to None.
epoch (int, optional): Sampling epoch. Defaults to None.
shuffle (bool, optional): Whether to shuffle indices.
Defaults to False.
drop_reminder (bool, optional): Drop reminder. Defaults to False.
shuffle (bool): Whether to shuffle indices. Defaults to False.
drop_reminder (bool): Drop reminder. Defaults to False.
"""
    def __init__(self,
...
...
@@ -90,12 +89,11 @@ class DataBaseSampler(object):
prepare (dict): Name of preparation functions and the input value.
sample_groups (dict): Sampled classes and numbers.
classes (list[str], optional): List of classes. Defaults to None.
-        points_loader(dict, optional): Config of points loader. Defaults to
+        points_loader(dict): Config of points loader. Defaults to
            dict(type='LoadPointsFromFile', load_dim=4, use_dim=[0, 1, 2, 3]).
-        file_client_args (dict, optional): Config dict of file clients,
-            refer to
-            https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py
-            for more details. Defaults to dict(backend='disk').
+        file_client_args (dict): Arguments to instantiate a FileClient.
+            See :class:`mmengine.fileio.FileClient` for details.
+            Defaults to dict(backend='disk').
"""
    def __init__(
...
...
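For orientation, a DataBaseSampler config consistent with the documented defaults could look like this; the info path and numbers are placeholders rather than values from the diff:

# Hypothetical ground-truth-sampling config for KITTI-style data.
db_sampler = dict(
    info_path='data/kitti/kitti_dbinfos_train.pkl',  # placeholder
    rate=1.0,
    prepare=dict(filter_by_min_points=dict(Car=5)),
    sample_groups=dict(Car=15),
    classes=['Car'],
    points_loader=dict(
        type='LoadPointsFromFile', coord_type='LIDAR',
        load_dim=4, use_dim=[0, 1, 2, 3]),
    file_client_args=dict(backend='disk'))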
@@ -133,12 +131,12 @@ class DataBaseSampler(object):
        from mmengine.logging import MMLogger
        logger: MMLogger = MMLogger.get_current_instance()
        for k, v in db_infos.items():
-            logger.info(f'load {len(v)} {k} database infos')
+            logger.info(f'load {len(v)} {k} database infos in DataBaseSampler')

        for prep_func, val in prepare.items():
            db_infos = getattr(self, prep_func)(db_infos, val)
        logger.info('After filter database:')
        for k, v in db_infos.items():
-            logger.info(f'load {len(v)} {k} database infos')
+            logger.info(f'load {len(v)} {k} database infos in DataBaseSampler')

        self.db_infos = db_infos
...
...
mmdet3d/datasets/transforms/formating.py
View file @
d7067e44
...
...
@@ -102,7 +102,7 @@ class Pack3DDetInputs(BaseTransform):
- points
- img
-        - 'data_samples' (obj:`Det3DDataSample`): The annotation info of
+        - 'data_samples' (:obj:`Det3DDataSample`): The annotation info of
the sample.
"""
# augtest
...
...
mmdet3d/datasets/transforms/loading.py
View file @
d7067e44
...
...
@@ -7,10 +7,10 @@ import mmengine
import numpy as np
from mmcv.transforms import LoadImageFromFile
from mmcv.transforms.base import BaseTransform
+from mmdet.datasets.transforms import LoadAnnotations

from mmdet3d.registry import TRANSFORMS
from mmdet3d.structures.points import BasePoints, get_points_type
-from mmdet.datasets.transforms import LoadAnnotations
@TRANSFORMS.register_module()
...
...
@@ -20,19 +20,17 @@ class LoadMultiViewImageFromFiles(BaseTransform):
Expects results['img_filename'] to be a list of filenames.
Args:
-        to_float32 (bool, optional): Whether to convert the img to float32.
+        to_float32 (bool): Whether to convert the img to float32.
            Defaults to False.
-        color_type (str, optional): Color type of the file.
-            Defaults to 'unchanged'.
-        file_client_args (dict): Config dict of file clients,
-            refer to
-            https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
-            for more details. Defaults to dict(backend='disk').
-        num_views (int): num of view in a frame. Default to 5.
-        num_ref_frames (int): num of frame in loading. Default to -1.
-        test_mode (bool): Whether is test mode in loading. Default to False.
-        set_default_scale (bool): Whether to set default scale. Default to
-            True.
+        color_type (str): Color type of the file. Defaults to 'unchanged'.
+        file_client_args (dict): Arguments to instantiate a FileClient.
+            See :class:`mmengine.fileio.FileClient` for details.
+            Defaults to dict(backend='disk').
+        num_views (int): Number of view in a frame. Defaults to 5.
+        num_ref_frames (int): Number of frame in loading. Defaults to -1.
+        test_mode (bool): Whether is test mode in loading. Defaults to False.
+        set_default_scale (bool): Whether to set default scale.
+            Defaults to True.
"""
    def __init__(self,
...
...
@@ -210,7 +208,7 @@ class LoadMultiViewImageFromFiles(BaseTransform):
        results['num_ref_frames'] = self.num_ref_frames
        return results

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(to_float32={self.to_float32}, '
...
...
@@ -276,22 +274,17 @@ class LoadPointsFromMultiSweeps(BaseTransform):
This is usually used for nuScenes dataset to utilize previous sweeps.
Args:
sweeps_num (int, optional): Number of sweeps. Defaults to 10.
load_dim (int, optional): Dimension number of the loaded points.
Defaults to 5.
use_dim (list[int], optional): Which dimension to use.
Defaults to [0, 1, 2, 4].
file_client_args (dict, optional): Config dict of file clients,
refer to
https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py
for more details. Defaults to dict(backend='disk').
pad_empty_sweeps (bool, optional): Whether to repeat keyframe when
sweeps_num (int): Number of sweeps. Defaults to 10.
load_dim (int): Dimension number of the loaded points. Defaults to 5.
use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4].
file_client_args (dict): Arguments to instantiate a FileClient.
See :class:`mmengine.fileio.FileClient` for details.
Defaults to dict(backend='disk').
pad_empty_sweeps (bool): Whether to repeat keyframe when
sweeps is empty. Defaults to False.
remove_close (bool, optional): Whether to remove close points.
Defaults to False.
test_mode (bool, optional): If `test_mode=True`, it will not
randomly sample sweeps but select the nearest N frames.
Defaults to False.
remove_close (bool): Whether to remove close points. Defaults to False.
test_mode (bool): If `test_mode=True`, it will not randomly sample
sweeps but select the nearest N frames. Defaults to False.
"""
    def __init__(self,
...
...
@@ -336,11 +329,11 @@ class LoadPointsFromMultiSweeps(BaseTransform):
    def _remove_close(self,
                      points: Union[np.ndarray, BasePoints],
                      radius: float = 1.0) -> Union[np.ndarray, BasePoints]:
-        """Removes point too close within a certain radius from origin.
+        """Remove point too close within a certain radius from origin.

        Args:
            points (np.ndarray | :obj:`BasePoints`): Sweep points.
-            radius (float, optional): Radius below which points are removed.
+            radius (float): Radius below which points are removed.
Defaults to 1.0.
Returns:
...
...
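The behaviour documented here fits in a few lines; a numpy-only sketch assuming x and y sit in the first two columns:

import numpy as np

def remove_close(points: np.ndarray, radius: float = 1.0) -> np.ndarray:
    """Sketch: drop sweep points within `radius` of the sensor origin."""
    x_filt = np.abs(points[:, 0]) < radius
    y_filt = np.abs(points[:, 1]) < radius
    not_close = np.logical_not(np.logical_and(x_filt, y_filt))
    return points[not_close]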
@@ -414,7 +407,7 @@ class LoadPointsFromMultiSweeps(BaseTransform):
        results['points'] = points
        return results

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        return f'{self.__class__.__name__}(sweeps_num={self.sweeps_num})'
...
...
@@ -465,7 +458,7 @@ class PointSegClassMapping(BaseTransform):
        return results

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        return repr_str
...
...
@@ -505,7 +498,7 @@ class NormalizePointsColor(BaseTransform):
        input_dict['points'] = points
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(color_mean={self.color_mean})'
...
...
@@ -533,19 +526,15 @@ class LoadPointsFromFile(BaseTransform):
- 'LIDAR': Points in LiDAR coordinates.
- 'DEPTH': Points in depth coordinates, usually for indoor dataset.
- 'CAMERA': Points in camera coordinates.
load_dim (int, optional): The dimension of the loaded points.
Defaults to 6.
use_dim (list[int] | int, optional): Which dimensions of the points
to use. Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4
load_dim (int): The dimension of the loaded points. Defaults to 6.
use_dim (list[int] | int): Which dimensions of the points to use.
Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4
or use_dim=[0, 1, 2, 3] to use the intensity dimension.
shift_height (bool, optional): Whether to use shifted height.
Defaults to False.
use_color (bool, optional): Whether to use color features.
Defaults to False.
file_client_args (dict, optional): Config dict of file clients,
refer to
https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py
for more details. Defaults to dict(backend='disk').
shift_height (bool): Whether to use shifted height. Defaults to False.
use_color (bool): Whether to use color features. Defaults to False.
file_client_args (dict): Arguments to instantiate a FileClient.
See :class:`mmengine.fileio.FileClient` for details.
Defaults to dict(backend='disk').
"""
    def __init__(
...
...
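As the docstring notes for KITTI, the intensity channel is selected through use_dim; a hedged pipeline-entry sketch:

# Sketch: loading KITTI points with intensity (use_dim=4 is shorthand
# for [0, 1, 2, 3]), matching the documented defaults otherwise.
load_points = dict(
    type='LoadPointsFromFile',
    coord_type='LIDAR',
    load_dim=4,
    use_dim=4,
    file_client_args=dict(backend='disk'))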
@@ -638,7 +627,7 @@ class LoadPointsFromFile(BaseTransform):
        return results

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__ + '('
        repr_str += f'shift_height={self.shift_height}, '
...
...
@@ -688,7 +677,7 @@ class LoadAnnotations3D(LoadAnnotations):
- pts_instance_mask_path (str): Path of instance mask file.
Only when `with_mask_3d` is True.
- pts_semantic_mask_path (str): Path of semantic mask file.
            Only when `with_seg_3d` is True.
Added Keys:
...
...
@@ -713,33 +702,25 @@ class LoadAnnotations3D(LoadAnnotations):
Only when `with_seg_3d` is True.
Args:
-        with_bbox_3d (bool, optional): Whether to load 3D boxes.
-            Defaults to True.
-        with_label_3d (bool, optional): Whether to load 3D labels.
-            Defaults to True.
-        with_attr_label (bool, optional): Whether to load attribute label.
-            Defaults to False.
-        with_mask_3d (bool, optional): Whether to load 3D instance masks.
-            for points. Defaults to False.
-        with_seg_3d (bool, optional): Whether to load 3D semantic masks.
-            for points. Defaults to False.
-        with_bbox (bool, optional): Whether to load 2D boxes.
-            Defaults to False.
-        with_label (bool, optional): Whether to load 2D labels.
-            Defaults to False.
-        with_mask (bool, optional): Whether to load 2D instance masks.
-            Defaults to False.
-        with_seg (bool, optional): Whether to load 2D semantic masks.
-            Defaults to False.
-        with_bbox_depth (bool, optional): Whether to load 2.5D boxes.
-            Defaults to False.
-        poly2mask (bool, optional): Whether to convert polygon annotations
-            to bitmasks. Defaults to True.
-        seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks.
-            Defaults to int64.
-        file_client_args (dict): Config dict of file clients, refer to
-            https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py
-            for more details.
+        with_bbox_3d (bool): Whether to load 3D boxes. Defaults to True.
+        with_label_3d (bool): Whether to load 3D labels. Defaults to True.
+        with_attr_label (bool): Whether to load attribute label.
+            Defaults to False.
+        with_mask_3d (bool): Whether to load 3D instance masks for points.
+            Defaults to False.
+        with_seg_3d (bool): Whether to load 3D semantic masks for points.
+            Defaults to False.
+        with_bbox (bool): Whether to load 2D boxes. Defaults to False.
+        with_label (bool): Whether to load 2D labels. Defaults to False.
+        with_mask (bool): Whether to load 2D instance masks. Defaults to False.
+        with_seg (bool): Whether to load 2D semantic masks. Defaults to False.
+        with_bbox_depth (bool): Whether to load 2.5D boxes. Defaults to False.
+        poly2mask (bool): Whether to convert polygon annotations to bitmasks.
+            Defaults to True.
+        seg_3d_dtype (dtype): Dtype of 3D semantic masks. Defaults to int64.
+        file_client_args (dict): Arguments to instantiate a FileClient.
+            See :class:`mmengine.fileio.FileClient` for details.
+            Defaults to dict(backend='disk').
"""
    def __init__(
...
...
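A pipeline entry exercising these flags might read as follows; this is a sketch built from the documented arguments, not a config taken from the commit:

# Sketch: load 3D boxes/labels plus per-point semantic masks.
load_anns = dict(
    type='LoadAnnotations3D',
    with_bbox_3d=True,
    with_label_3d=True,
    with_mask_3d=False,
    with_seg_3d=True,
    file_client_args=dict(backend='disk'))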
@@ -889,7 +870,8 @@ class LoadAnnotations3D(LoadAnnotations):
`ignore_flag`
Args:
results (dict): Result dict from :obj:``mmcv.BaseDataset``.
results (dict): Result dict from :obj:`mmcv.BaseDataset`.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
...
...
@@ -900,7 +882,7 @@ class LoadAnnotations3D(LoadAnnotations):
"""Private function to load label annotations.
Args:
-            results (dict): Result dict from :obj :obj:``mmcv.BaseDataset``.
+            results (dict): Result dict from :obj :obj:`mmcv.BaseDataset`.
Returns:
dict: The dict contains loaded label annotations.
...
...
@@ -933,7 +915,7 @@ class LoadAnnotations3D(LoadAnnotations):
        return results

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        indent_str = '    '
        repr_str = self.__class__.__name__ + '(\n'
...
...
mmdet3d/datasets/transforms/test_time_aug.py
View file @
d7067e44
...
...
@@ -19,18 +19,17 @@ class MultiScaleFlipAug3D(BaseTransform):
img_scale (tuple | list[tuple]): Images scales for resizing.
pts_scale_ratio (float | list[float]): Points scale ratios for
resizing.
-        flip (bool, optional): Whether apply flip augmentation.
-            Defaults to False.
-        flip_direction (str | list[str], optional): Flip augmentation
-            directions for images, options are "horizontal" and "vertical".
+        flip (bool): Whether apply flip augmentation. Defaults to False.
+        flip_direction (str | list[str]): Flip augmentation directions
+            for images, options are "horizontal" and "vertical".
            If flip_direction is list, multiple flip augmentations will
            be applied. It has no effect when ``flip == False``.
            Defaults to 'horizontal'.
-        pcd_horizontal_flip (bool, optional): Whether to apply horizontal
-            flip augmentation to point cloud. Defaults to True.
+        pcd_horizontal_flip (bool): Whether to apply horizontal flip
+            augmentation to point cloud. Defaults to False.
            Note that it works only when 'flip' is turned on.
-        pcd_vertical_flip (bool, optional): Whether to apply vertical flip
-            augmentation to point cloud. Defaults to True.
+        pcd_vertical_flip (bool): Whether to apply vertical flip
+            augmentation to point cloud. Defaults to False.
            Note that it works only when 'flip' is turned on.
"""
...
...
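A test-time augmentation entry built from these arguments could look like the sketch below; the image scale and inner transforms are illustrative assumptions:

# Sketch: single-scale TTA with flips disabled, per the defaults above.
tta = dict(
    type='MultiScaleFlipAug3D',
    img_scale=(1333, 800),   # placeholder scale
    pts_scale_ratio=1.0,
    flip=False,
    transforms=[
        dict(type='GlobalRotScaleTrans', rot_range=[0, 0],
             scale_ratio_range=[1.0, 1.0]),
        dict(type='RandomFlip3D'),
    ])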
@@ -112,7 +111,7 @@ class MultiScaleFlipAug3D(BaseTransform):
        return aug_data_list

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(transforms={self.transforms}, '
...
...
mmdet3d/datasets/transforms/transforms_3d.py
View file @
d7067e44
...
...
@@ -6,7 +6,9 @@ from typing import List, Optional, Tuple, Union
import cv2
import mmcv
import numpy as np
-from mmcv.transforms import BaseTransform, RandomResize, Resize
+from mmcv.transforms import BaseTransform, Compose, RandomResize, Resize
+from mmdet.datasets.transforms import (PhotoMetricDistortion, RandomCrop,
+                                       RandomFlip)
from mmengine import is_tuple_of

from mmdet3d.models.task_modules import VoxelGenerator
...
...
@@ -15,9 +17,6 @@ from mmdet3d.structures import (CameraInstance3DBoxes, DepthInstance3DBoxes,
                                 LiDARInstance3DBoxes)
from mmdet3d.structures.ops import box_np_ops
from mmdet3d.structures.points import BasePoints
-from mmdet.datasets.transforms import (PhotoMetricDistortion, RandomCrop,
-                                       RandomFlip)
-from .compose import Compose
from .data_augment_utils import noise_per_object_v3_
...
...
@@ -30,7 +29,7 @@ class RandomDropPointsColor(BaseTransform):
util/transform.py#L223>`_ for more details.
Args:
-        drop_ratio (float, optional): The probability of dropping point colors.
+        drop_ratio (float): The probability of dropping point colors.
Defaults to 0.2.
"""
...
...
@@ -46,8 +45,8 @@ class RandomDropPointsColor(BaseTransform):
input_dict (dict): Result dict from loading pipeline.
Returns:
            dict: Results after color dropping, 'points' key is updated
                in the result dict.
        points = input_dict['points']
        assert points.attribute_dims is not None and \
...
...
@@ -64,7 +63,7 @@ class RandomDropPointsColor(BaseTransform):
            points.color = points.color * 0.0
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(drop_ratio={self.drop_ratio})'
...
...
@@ -108,8 +107,8 @@ class RandomFlip3D(RandomFlip):
in vertical direction. Defaults to 0.0.
flip_box3d (bool): Whether to flip bounding box. In most of the case,
the box should be fliped. In cam-based bev detection, this is set
-            to false, since the flip of 2D images does not influence the 3D
-            box. Default to True.
+            to False, since the flip of 2D images does not influence the 3D
+            box. Defaults to True.
"""
    def __init__(self,
...
...
@@ -150,8 +149,7 @@ class RandomFlip3D(RandomFlip):
Args:
input_dict (dict): Result dict from loading pipeline.
-            direction (str, optional): Flip direction.
-                Default: 'horizontal'.
+            direction (str): Flip direction. Defaults to 'horizontal'.
Returns:
dict: Flipped results, 'points', 'bbox3d_fields' keys are
...
...
@@ -241,7 +239,7 @@ class RandomFlip3D(RandomFlip):
            input_dict['transformation_3d_flow'].extend(['VF'])
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(sync_2d={self.sync_2d},'
...
...
@@ -309,7 +307,7 @@ class RandomJitterPoints(BaseTransform):
        points.translate(jitter_noise)
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(jitter_std={self.jitter_std},'
...
...
@@ -344,11 +342,11 @@ class ObjectSample(BaseTransform):
Args:
db_sampler (dict): Config dict of the database sampler.
        sample_2d (bool): Whether to also paste 2D image patch to the images.
            This should be true when applying multi-modality cut-and-paste.
            Defaults to False.
        use_ground_plane (bool): Whether to use ground plane to adjust the
            3D labels. Defaults to False.
"""
    def __init__(self,
...
...
@@ -445,12 +443,12 @@ class ObjectSample(BaseTransform):
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
-        repr_str += f'db_sampler={self.db_sampler},'
+        repr_str += f' (db_sampler={self.db_sampler},'
        repr_str += f' sample_2d={self.sample_2d},'
-        repr_str += f' use_ground_plane={self.use_ground_plane}'
+        repr_str += f' use_ground_plane={self.use_ground_plane})'
        return repr_str
...
...
@@ -469,15 +467,15 @@ class ObjectNoise(BaseTransform):
- gt_bboxes_3d
Args:
-        translation_std (list[float], optional): Standard deviation of the
+        translation_std (list[float]): Standard deviation of the
            distribution where translation noise are sampled from.
            Defaults to [0.25, 0.25, 0.25].
-        global_rot_range (list[float], optional): Global rotation to the scene.
+        global_rot_range (list[float]): Global rotation to the scene.
            Defaults to [0.0, 0.0].
-        rot_range (list[float], optional): Object rotation range.
+        rot_range (list[float]): Object rotation range.
            Defaults to [-0.15707963267, 0.15707963267].
-        num_try (int, optional): Number of times to try if the noise applied is
-            invalid. Defaults to 100.
+        num_try (int): Number of times to try if the noise applied is
+            invalid. Defaults to 100.
"""
    def __init__(self,
...
...
@@ -519,7 +517,7 @@ class ObjectNoise(BaseTransform):
        input_dict['points'] = points.new_point(numpy_points)
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(num_try={self.num_try},'
...
...
@@ -610,7 +608,7 @@ class GlobalAlignment(BaseTransform):
        return results

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(rotation_axis={self.rotation_axis})'
...
...
@@ -640,15 +638,15 @@ class GlobalRotScaleTrans(BaseTransform):
- pcd_scale_factor (np.float32)
Args:
-        rot_range (list[float], optional): Range of rotation angle.
+        rot_range (list[float]): Range of rotation angle.
            Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]).
-        scale_ratio_range (list[float], optional): Range of scale ratio.
+        scale_ratio_range (list[float]): Range of scale ratio.
            Defaults to [0.95, 1.05].
-        translation_std (list[float], optional): The standard deviation of
+        translation_std (list[float]): The standard deviation of
            translation noise applied to a scene, which
            is sampled from a gaussian distribution whose standard deviation
-            is set by ``translation_std``. Defaults to [0, 0, 0]
-        shift_height (bool, optional): Whether to shift height.
+            is set by ``translation_std``. Defaults to [0, 0, 0].
+        shift_height (bool): Whether to shift height.
            (the fourth dimension of indoor points) when scaling.
            Defaults to False.
"""
...
...
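The R, S, T flow the docstring describes reduces to one point update; a numpy sketch with the documented default ranges (yaw-only rotation assumed):

import numpy as np

rng = np.random.default_rng(0)
points = rng.random((100, 3)).astype(np.float32)

# Sketch of rotate -> scale -> translate with the documented defaults.
angle = rng.uniform(-0.78539816, 0.78539816)       # rot_range
scale = rng.uniform(0.95, 1.05)                    # scale_ratio_range
trans = rng.normal(scale=[0.0, 0.0, 0.0], size=3)  # translation_std
rot_z = np.array([[np.cos(angle), -np.sin(angle), 0.0],
                  [np.sin(angle), np.cos(angle), 0.0],
                  [0.0, 0.0, 1.0]])
points = points @ rot_z.T * scale + trans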
@@ -689,8 +687,7 @@ class GlobalRotScaleTrans(BaseTransform):
Returns:
dict: Results after translation, 'points', 'pcd_trans'
-                and `gt_bboxes_3d` is updated
-                in the result dict.
+                and `gt_bboxes_3d` is updated in the result dict.
        """
        translation_std = np.array(self.translation_std, dtype=np.float32)
        trans_factor = np.random.normal(scale=translation_std, size=3).T
...
...
@@ -708,8 +705,7 @@ class GlobalRotScaleTrans(BaseTransform):
Returns:
dict: Results after rotation, 'points', 'pcd_rotation'
-                and `gt_bboxes_3d` is updated
-                in the result dict.
+                and `gt_bboxes_3d` is updated in the result dict.
        """
        rotation = self.rot_range
        noise_rotation = np.random.uniform(rotation[0], rotation[1])
...
...
@@ -735,8 +731,7 @@ class GlobalRotScaleTrans(BaseTransform):
Returns:
dict: Results after scaling, 'points' and
-                `gt_bboxes_3d` is updated
-                in the result dict.
+                `gt_bboxes_3d` is updated in the result dict.
        """
        scale = input_dict['pcd_scale_factor']
        points = input_dict['points']
...
...
@@ -774,7 +769,7 @@ class GlobalRotScaleTrans(BaseTransform):
Returns:
dict: Results after scaling, 'points', 'pcd_rotation',
-                'pcd_scale_factor', 'pcd_trans' and `gt_bboxes_3d` is updated
+                'pcd_scale_factor', 'pcd_trans' and `gt_bboxes_3d` are updated
                in the result dict.
        """
        if 'transformation_3d_flow' not in input_dict:
...
...
@@ -791,7 +786,7 @@ class GlobalRotScaleTrans(BaseTransform):
        input_dict['transformation_3d_flow'].extend(['R', 'S', 'T'])
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(rot_range={self.rot_range},'
...
...
@@ -829,7 +824,7 @@ class PointShuffle(BaseTransform):
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        return self.__class__.__name__
...
...
@@ -850,7 +845,7 @@ class ObjectRangeFilter(BaseTransform):
point_cloud_range (list[float]): Point cloud range.
"""
-    def __init__(self, point_cloud_range: List[float]):
+    def __init__(self, point_cloud_range: List[float]) -> None:
        self.pcd_range = np.array(point_cloud_range, dtype=np.float32)

    def transform(self, input_dict: dict) -> dict:
...
...
@@ -887,7 +882,7 @@ class ObjectRangeFilter(BaseTransform):
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
...
...
@@ -942,7 +937,7 @@ class PointsRangeFilter(BaseTransform):
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
...
...
@@ -987,7 +982,7 @@ class ObjectNameFilter(BaseTransform):
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(classes={self.classes})'
...
...
@@ -1017,8 +1012,8 @@ class PointSample(BaseTransform):
sample_range (float, optional): The range where to sample points.
If not None, the points with depth larger than `sample_range` are
prior to be sampled. Defaults to None.
-        replace (bool, optional): Whether the sampling is with or without
-            replacement. Defaults to False.
+        replace (bool): Whether the sampling is with or without
+            replacement. Defaults to False.
"""
    def __init__(self,
...
...
@@ -1046,10 +1041,9 @@ class PointSample(BaseTransform):
num_samples (int): Number of samples to be sampled.
sample_range (float, optional): Indicating the range where the
points will be sampled. Defaults to None.
-            replace (bool, optional): Sampling with or without replacement.
-                Defaults to False.
-            return_choices (bool, optional): Whether return choice.
+            replace (bool): Sampling with or without replacement.
+                Defaults to False.
+            return_choices (bool): Whether return choice. Defaults to False.
Returns:
tuple[:obj:`BasePoints`, np.ndarray] | :obj:`BasePoints`:
...
...
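Setting the depth prior aside, the core of this helper is a guarded np.random.choice; a sketch:

import numpy as np

def points_random_sampling(points: np.ndarray,
                           num_samples: int,
                           replace: bool = False,
                           return_choices: bool = False):
    """Sketch: uniform sampling; falls back to sampling with
    replacement when the cloud has fewer points than requested."""
    if not replace:
        replace = points.shape[0] < num_samples
    choices = np.random.choice(points.shape[0], num_samples,
                               replace=replace)
    return (points[choices], choices) if return_choices else points[choices]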
@@ -1113,7 +1107,7 @@ class PointSample(BaseTransform):
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(num_points={self.num_points},'
...
...
@@ -1149,7 +1143,7 @@ class IndoorPatchPointSample(BaseTransform):
Args:
num_points (int): Number of points to be sampled.
-        block_size (float, optional): Size of a block to sample points from.
+        block_size (float): Size of a block to sample points from.
            Defaults to 1.5.
sample_rate (float, optional): Stride used in sliding patch generation.
This parameter is unused in `IndoorPatchPointSample` and thus has
...
...
@@ -1159,17 +1153,17 @@ class IndoorPatchPointSample(BaseTransform):
segmentation task. This is set in PointSegClassMapping as neg_cls.
If not None, will be used as a patch selection criterion.
Defaults to None.
-        use_normalized_coord (bool, optional): Whether to use normalized xyz as
+        use_normalized_coord (bool): Whether to use normalized xyz as
            additional features. Defaults to False.
-        num_try (int, optional): Number of times to try if the patch selected
-            is invalid. Defaults to 10.
-        enlarge_size (float, optional): Enlarge the sampled patch to
+        num_try (int): Number of times to try if the patch selected
+            is invalid. Defaults to 10.
+        enlarge_size (float): Enlarge the sampled patch to
            [-block_size / 2 - enlarge_size, block_size / 2 + enlarge_size] as
            an augmentation. If None, set it as 0. Defaults to 0.2.
        min_unique_num (int, optional): Minimum number of unique points
            the sampled patch should contain. If None, use PointNet++'s method
            to judge uniqueness. Defaults to None.
-        eps (float, optional): A value added to patch boundary to guarantee
+        eps (float): A value added to patch boundary to guarantee
            points coverage. Defaults to 1e-2.
Note:
...
...
@@ -1386,7 +1380,7 @@ class IndoorPatchPointSample(BaseTransform):
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(num_points={self.num_points},'
...
...
@@ -1405,7 +1399,7 @@ class BackgroundPointsFilter(BaseTransform):
"""Filter background points near the bounding box.
Args:
-        bbox_enlarge_range (tuple[float], float): Bbox enlarge range.
+        bbox_enlarge_range (tuple[float] | float): Bbox enlarge range.
    """

    def __init__(self, bbox_enlarge_range: Union[Tuple[float], float]) -> None:
...
...
@@ -1458,7 +1452,7 @@ class BackgroundPointsFilter(BaseTransform):
            input_dict['pts_semantic_mask'] = pts_semantic_mask[valid_masks]
        return input_dict

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(bbox_enlarge_range={self.bbox_enlarge_range.tolist()})'
...
...
@@ -1473,9 +1467,10 @@ class VoxelBasedPointSampler(BaseTransform):
Args:
cur_sweep_cfg (dict): Config for sampling current points.
-        prev_sweep_cfg (dict): Config for sampling previous points.
+        prev_sweep_cfg (dict, optional): Config for sampling previous points.
+            Defaults to None.
        time_dim (int): Index that indicate the time dimension
-            for input points.
+            for input points. Defaults to 3.
"""
    def __init__(self,
...
...
@@ -1502,7 +1497,7 @@ class VoxelBasedPointSampler(BaseTransform):
points (np.ndarray): Points subset to be sampled.
sampler (VoxelGenerator): Voxel based sampler for
each points subset.
-            point_dim (int): The dimension of each points
+            point_dim (int): The dimension of each points.
Returns:
np.ndarray: Sampled points.
...
...
@@ -1589,7 +1584,7 @@ class VoxelBasedPointSampler(BaseTransform):
        return results

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""

        def _auto_indent(repr_str, indent):
...
...
@@ -1625,7 +1620,7 @@ class AffineResize(BaseTransform):
img_scale (tuple): Images scales for resizing.
down_ratio (int): The down ratio of feature map.
Actually the arg should be >= 1.
-        bbox_clip_border (bool, optional): Whether clip the objects
+        bbox_clip_border (bool): Whether clip the objects
            outside the border of the image. Defaults to True.
"""
...
...
@@ -1803,7 +1798,7 @@ class AffineResize(BaseTransform):
        ref_point3 = ref_point2 + np.array([-d[1], d[0]])
        return ref_point3

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(img_scale={self.img_scale}, '
...
...
@@ -1863,7 +1858,7 @@ class RandomShiftScale(BaseTransform):
        return results

-    def __repr__(self):
+    def __repr__(self) -> str:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(shift_scale={self.shift_scale}, '
...
...
@@ -1874,7 +1869,7 @@ class RandomShiftScale(BaseTransform):
@TRANSFORMS.register_module()
class Resize3D(Resize):

-    def _resize_3d(self, results):
+    def _resize_3d(self, results: dict) -> None:
        """Resize centers_2d and modify camera intrinisc with
        ``results['scale']``."""
        if 'centers_2d' in results:
...
@@ -1888,6 +1883,7 @@ class Resize3D(Resize):
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map',
'gt_keypoints', 'scale', 'scale_factor', 'img_shape',
...
...
@@ -1909,7 +1905,7 @@ class RandomResize3D(RandomResize):
and cam2img with ``results['scale']``.
"""
def
_resize_3d
(
self
,
results
)
:
def
_resize_3d
(
self
,
results
:
dict
)
->
None
:
"""Resize centers_2d and modify camera intrinisc with
``results['scale']``."""
if
'centers_2d'
in
results
:
...
...
@@ -1917,7 +1913,7 @@ class RandomResize3D(RandomResize):
        results['cam2img'][0] *= np.array(results['scale_factor'][0])
        results['cam2img'][1] *= np.array(results['scale_factor'][1])

-    def transform(self, results):
+    def transform(self, results: dict) -> dict:
"""Transform function to resize images, bounding boxes, masks, semantic
segmentation map. Compared to RandomResize, this function would further
check if scale is already set in results.
...
...
@@ -1926,7 +1922,7 @@ class RandomResize3D(RandomResize):
results (dict): Result dict from loading pipeline.
Returns:
-            dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \
+            dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor',
                'keep_ratio' keys are added into result dict.
        """
        if 'scale' not in results:
...
...
@@ -1989,9 +1985,9 @@ class RandomCrop3D(RandomCrop):
on cropped instance masks. Defaults to False.
bbox_clip_border (bool): Whether clip the objects outside
the border of the image. Defaults to True.
-        rel_offset_h (tuple): The cropping interval of image height. Default
+        rel_offset_h (tuple): The cropping interval of image height. Defaults
            to (0., 1.).
-        rel_offset_w (tuple): The cropping interval of image width. Default
+        rel_offset_w (tuple): The cropping interval of image width. Defaults
            to (0., 1.).
Note:
...
...
@@ -2005,14 +2001,16 @@ class RandomCrop3D(RandomCrop):
``allow_negative_crop`` is set to False, skip this image.
"""
-    def __init__(self,
-                 crop_size,
-                 crop_type='absolute',
-                 allow_negative_crop=False,
-                 recompute_bbox=False,
-                 bbox_clip_border=True,
-                 rel_offset_h=(0., 1.),
-                 rel_offset_w=(0., 1.)):
+    def __init__(self,
+                 crop_size: tuple,
+                 crop_type: str = 'absolute',
+                 allow_negative_crop: bool = False,
+                 recompute_bbox: bool = False,
+                 bbox_clip_border: bool = True,
+                 rel_offset_h: tuple = (0., 1.),
+                 rel_offset_w: tuple = (0., 1.)) -> None:
        super().__init__(
            crop_size=crop_size,
            crop_type=crop_type,
...
...
@@ -2024,7 +2022,10 @@ class RandomCrop3D(RandomCrop):
        self.rel_offset_h = rel_offset_h
        self.rel_offset_w = rel_offset_w

-    def _crop_data(self, results, crop_size, allow_negative_crop):
+    def _crop_data(self,
+                   results: dict,
+                   crop_size: tuple,
+                   allow_negative_crop: bool = False) -> dict:
"""Function to randomly crop images, bounding boxes, masks, semantic
segmentation maps.
...
...
@@ -2032,7 +2033,7 @@ class RandomCrop3D(RandomCrop):
results (dict): Result dict from loading pipeline.
crop_size (tuple): Expected absolute size after cropping, (h, w).
            allow_negative_crop (bool): Whether to allow a crop that does not
-                contain any bbox area. Default to False.
+                contain any bbox area. Defaults to False.
Returns:
dict: Randomly cropped results, 'img_shape' key in result dict is
...
...
@@ -2119,7 +2120,7 @@ class RandomCrop3D(RandomCrop):
        return results

-    def transform(self, results):
+    def transform(self, results: dict) -> dict:
"""Transform function to randomly crop images, bounding boxes, masks,
semantic segmentation maps.
...
...
@@ -2139,7 +2140,8 @@ class RandomCrop3D(RandomCrop):
        results = self._crop_data(results, crop_size, self.allow_negative_crop)
        return results

-    def __repr__(self):
+    def __repr__(self) -> dict:
        """str: Return a string that describes the module."""
        repr_str = self.__class__.__name__
        repr_str += f'(crop_size={self.crop_size}, '
        repr_str += f'crop_type={self.crop_type}, '
...
...
@@ -2260,43 +2262,44 @@ class MultiViewWrapper(BaseTransform):
transforms (list[dict]): A list of dict specifying the transformations
for the monocular situation.
        override_aug_config (bool): flag of whether to use the same aug config
-            for multiview image. Default to True.
+            for multiview image. Defaults to True.
        process_fields (list): Desired keys that the transformations should
-            be conducted on. Default to ['img', 'cam2img', 'lidar2cam'],
+            be conducted on. Defaults to ['img', 'cam2img', 'lidar2cam'].
        collected_keys (list): Collect information in transformation
-            like rotate angles, crop roi, and flip state. Default to
+            like rotate angles, crop roi, and flip state. Defaults to
                ['scale', 'scale_factor', 'crop',
                 'crop_offset', 'ori_shape',
                 'pad_shape', 'img_shape',
                 'pad_fixed_size', 'pad_size_divisor',
-                'flip', 'flip_direction', 'rotate'],
+                'flip', 'flip_direction', 'rotate'].
        randomness_keys (list): The keys that related to the randomness
-            in transformation Default to
+            in transformation. Defaults to
                ['scale', 'scale_factor', 'crop_size', 'flip',
                 'flip_direction', 'photometric_param']
"""
    def __init__(self,
                 transforms: dict,
                 override_aug_config: bool = True,
                 process_fields: list = ['img', 'cam2img', 'lidar2cam'],
                 collected_keys: list = [
                     'scale', 'scale_factor', 'crop', 'img_crop_offset',
                     'ori_shape', 'pad_shape', 'img_shape',
                     'pad_fixed_size', 'pad_size_divisor', 'flip',
                     'flip_direction', 'rotate'
                 ],
                 randomness_keys: list = [
                     'scale', 'scale_factor', 'crop_size', 'img_crop_offset',
                     'flip', 'flip_direction', 'photometric_param'
-                 ]):
+                 ]) -> None:
        self.transforms = Compose(transforms)
        self.override_aug_config = override_aug_config
        self.collected_keys = collected_keys
        self.process_fields = process_fields
        self.randomness_keys = randomness_keys

-    def transform(self, input_dict):
+    def transform(self, input_dict: dict) -> dict:
"""Transform function to do the transform for multiview image.
Args:
...
...
mmdet3d/datasets/waymo_dataset.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
-from typing import Callable, List, Optional, Union
+from typing import Callable, List, Union

import numpy as np
...
...
@@ -24,20 +24,20 @@ class WaymoDataset(KittiDataset):
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
data_prefix (dict): data prefix for point cloud and
-            camera data dict. Default to dict(
+            camera data dict. Defaults to dict(
                pts='velodyne',
                CAM_FRONT='image_0',
                CAM_FRONT_RIGHT='image_1',
                CAM_FRONT_LEFT='image_2',
                CAM_SIDE_RIGHT='image_3',
                CAM_SIDE_LEFT='image_4')
-        pipeline (list[dict], optional): Pipeline used for data processing.
-            Defaults to None.
-        modality (dict, optional): Modality to specify the sensor data used
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used
            as input. Defaults to dict(use_lidar=True).
-        default_cam_key (str, optional): Default camera key for lidar2img
+        default_cam_key (str): Default camera key for lidar2img
            association. Defaults to 'CAM_FRONT'.
-        box_type_3d (str, optional): Type of 3D box of this dataset.
+        box_type_3d (str): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
to its original format then converted them to `box_type_3d`.
Defaults to 'LiDAR' in this dataset. Available options includes:
...
...
@@ -45,24 +45,30 @@ class WaymoDataset(KittiDataset):
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
-        filter_empty_gt (bool, optional): Whether to filter empty GT.
-            Defaults to True.
-        test_mode (bool, optional): Whether the dataset is in test mode.
+        load_type (str): Type of loading mode. Defaults to 'frame_based'.
+
+            - 'frame_based': Load all of the instances in the frame.
+            - 'mv_image_based': Load all of the instances in the frame and need
+              to convert to the FOV-based data type to support image-based
+              detector.
+            - 'fov_image_based': Only load the instances inside the default
+              cam, and need to convert to the FOV-based data type to support
+              image-based detector.
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
            Defaults to False.
-        pcd_limit_range (list[float], optional): The range of point cloud
+        pcd_limit_range (List[float]): The range of point cloud
            used to filter invalid predicted boxes.
            Defaults to [-85, -85, -5, 85, 85, 5].
-        cam_sync_instances (bool, optional): If use the camera sync label
+        cam_sync_instances (bool): If use the camera sync label
            supported from waymo version 1.3.1. Defaults to False.
-        load_interval (int, optional): load frame interval.
-            Defaults to 1.
-        task (str, optional): task for 3D detection (lidar, mono3d).
-            lidar: take all the ground trurh in the frame.
-            mono3d: take the groundtruth that can be seen in the cam.
-            Defaults to 'lidar'.
-        max_sweeps (int, optional): max sweep for each frame. Defaults to 0.
+        load_interval (int): load frame interval. Defaults to 1.
+        max_sweeps (int): max sweep for each frame. Defaults to 0.
"""
-    METAINFO = {'CLASSES': ('Car', 'Pedestrian', 'Cyclist')}
+    METAINFO = {'classes': ('Car', 'Pedestrian', 'Cyclist')}
    def __init__(self,
                 data_root: str,
...
...
@@ -75,28 +81,27 @@ class WaymoDataset(KittiDataset):
                     CAM_SIDE_RIGHT='image_3',
                     CAM_SIDE_LEFT='image_4'),
                 pipeline: List[Union[dict, Callable]] = [],
-                modality: Optional[dict] = dict(use_lidar=True),
+                modality: dict = dict(use_lidar=True),
                 default_cam_key: str = 'CAM_FRONT',
                 box_type_3d: str = 'LiDAR',
                 load_type: str = 'frame_based',
                 filter_empty_gt: bool = True,
                 test_mode: bool = False,
                 pcd_limit_range: List[float] = [0, -40, -3, 70.4, 40, 0.0],
-                cam_sync_instances=False,
-                load_interval=1,
-                task='lidar_det',
-                max_sweeps=0,
-                **kwargs):
+                cam_sync_instances: bool = False,
+                load_interval: int = 1,
+                max_sweeps: int = 0,
+                **kwargs) -> None:
        self.load_interval = load_interval
        # set loading mode for different task settings
        self.cam_sync_instances = cam_sync_instances
        # construct self.cat_ids for vision-only anns parsing
-        self.cat_ids = range(len(self.METAINFO['CLASSES']))
+        self.cat_ids = range(len(self.METAINFO['classes']))
        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
        self.max_sweeps = max_sweeps
-        self.task = task
        # we do not provide file_client_args to custom_3d init
        # because we want disk loading for info
-        # while ceph loading for KITTI2Waymo
+        # while ceph loading for Prediction2Waymo
        super().__init__(
            data_root=data_root,
            ann_file=ann_file,
...
...
@@ -108,16 +113,17 @@ class WaymoDataset(KittiDataset):
            default_cam_key=default_cam_key,
            data_prefix=data_prefix,
            test_mode=test_mode,
+            load_type=load_type,
            **kwargs)

    def parse_ann_info(self, info: dict) -> dict:
-        """Get annotation info according to the given index.
+        """Process the `instances` in data info to `ann_info`.

        Args:
            info (dict): Data information of single data sample.

        Returns:
-            dict: annotation information consists of the following keys:
+            dict: Annotation information consists of the following keys:

                - bboxes_3d (:obj:`LiDARInstance3DBoxes`):
                  3D ground truth bboxes.
...
...
@@ -150,7 +156,7 @@ class WaymoDataset(KittiDataset):
            centers_2d = np.zeros((0, 2), dtype=np.float32)
            depths = np.zeros((0), dtype=np.float32)

-        if self.task == 'mono_det':
+        if self.load_type in ['fov_image_based', 'mv_image_based']:
            gt_bboxes_3d = CameraInstance3DBoxes(
                ann_info['gt_bboxes_3d'],
                box_dim=ann_info['gt_bboxes_3d'].shape[-1],
...
@@ -182,13 +188,22 @@ class WaymoDataset(KittiDataset):
data_list
=
data_list
[::
self
.
load_interval
]
return
data_list
def
parse_data_info
(
self
,
info
:
dict
)
->
dict
:
def
parse_data_info
(
self
,
info
:
dict
)
->
Union
[
dict
,
List
[
dict
]]
:
"""if task is lidar or multiview det, use super() method elif task is
mono3d, split the info from frame-wise to img-wise."""
if
self
.
task
!=
'mono_det'
:
if
self
.
cam_sync_instances
:
# use the cam sync labels
info
[
'instances'
]
=
info
[
'cam_sync_instances'
]
if
self
.
load_type
==
'frame_based'
:
return
super
().
parse_data_info
(
info
)
elif
self
.
load_type
==
'fov_image_based'
:
# only loading the fov image and the fov instance
new_image_info
=
{}
new_image_info
[
self
.
default_cam_key
]
=
\
info
[
'images'
][
self
.
default_cam_key
]
info
[
'images'
]
=
new_image_info
info
[
'instances'
]
=
info
[
'cam_instances'
][
self
.
default_cam_key
]
return
super
().
parse_data_info
(
info
)
else
:
# in the mono3d, the instances is from cam sync.
...
...
mmdet3d/evaluation/functional/waymo_utils/__init__.py
View file @
d7067e44
# Copyright (c) OpenMMLab. All rights reserved.
-from .prediction_kitti_to_waymo import KITTI2Waymo
+from .prediction_to_waymo import Prediction2Waymo

-__all__ = ['KITTI2Waymo']
+__all__ = ['Prediction2Waymo']
mmdet3d/evaluation/functional/waymo_utils/prediction_kitti_to_waymo.py → mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py
View file @ d7067e44
...
...
@@ -5,29 +5,33 @@ r"""Adapted from `Waymo to KITTI converter
try:
    from waymo_open_dataset import dataset_pb2 as open_dataset
+    from waymo_open_dataset import label_pb2
+    from waymo_open_dataset.protos import metrics_pb2
+    from waymo_open_dataset.protos.metrics_pb2 import Objects
except ImportError:
+    Objects = None
    raise ImportError(
        'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" '
        'to install the official devkit first.')

from glob import glob
from os.path import join
+from typing import List, Optional

import mmengine
import numpy as np
import tensorflow as tf
-from waymo_open_dataset import label_pb2
-from waymo_open_dataset.protos import metrics_pb2


-class KITTI2Waymo(object):
-    """KITTI predictions to Waymo converter.
+class Prediction2Waymo(object):
+    """Predictions to Waymo converter. The format of prediction results could
+    be original format or kitti-format.
This class serves as the converter to change predictions from KITTI to
Waymo format.
Args:
-        kitti_result_files (list[dict]): Predictions in KITTI format.
+        results (list[dict]): Prediction results.
        waymo_tfrecords_dir (str): Directory to load waymo raw data.
        waymo_results_save_dir (str): Directory to save converted predictions
            in waymo format (.bin files).
...
...
@@ -35,33 +39,47 @@ class KITTI2Waymo(object):
             predictions in waymo format (.bin file), like 'a/b/c.bin'.
         prefix (str): Prefix of filename. In general, 0 for training, 1 for
             validation and 2 for testing.
-        workers (str): Number of parallel processes.
+        classes (dict): A list of class names.
+        workers (str): Number of parallel processes. Defaults to 2.
         file_client_args (str): File client for reading gt in waymo format.
             Defaults to ``dict(backend='disk')``.
+        from_kitti_format (bool, optional): Whether the results are in kitti
+            format. Defaults to False.
+        idx2metainfo (Optional[dict], optional): The mapping from sample_idx to
+            metainfo. The metainfo must contain the keys: 'idx2contextname' and
+            'idx2timestamp'. Defaults to None.
     """

     def __init__(self,
-                 kitti_result_files,
-                 waymo_tfrecords_dir,
-                 waymo_results_save_dir,
-                 waymo_results_final_path,
-                 prefix,
-                 workers=64,
-                 file_client_args=dict(backend='disk')):
-        self.kitti_result_files = kitti_result_files
+                 results: List[dict],
+                 waymo_tfrecords_dir: str,
+                 waymo_results_save_dir: str,
+                 waymo_results_final_path: str,
+                 prefix: str,
+                 classes: dict,
+                 workers: int = 2,
+                 file_client_args: dict = dict(backend='disk'),
+                 from_kitti_format: bool = False,
+                 idx2metainfo: Optional[dict] = None):
+        self.results = results
         self.waymo_tfrecords_dir = waymo_tfrecords_dir
         self.waymo_results_save_dir = waymo_results_save_dir
         self.waymo_results_final_path = waymo_results_final_path
         self.prefix = prefix
+        self.classes = classes
         self.workers = int(workers)
         self.file_client_args = file_client_args
-        self.name2idx = {}
-        for idx, result in enumerate(kitti_result_files):
-            if len(result['sample_id']) > 0:
-                self.name2idx[str(result['sample_id'][0])] = idx
+        self.from_kitti_format = from_kitti_format
+        if idx2metainfo is not None:
+            self.idx2metainfo = idx2metainfo
+            # If ``fast_eval``, the metainfo does not need to be read from
+            # original data online. It's preprocessed offline.
+            self.fast_eval = True
+        else:
+            self.fast_eval = False

-        # turn on eager execution for older tensorflow versions
-        if int(tf.__version__.split('.')[0]) < 2:
-            tf.enable_eager_execution()

+        self.name2idx = {}
         self.k2w_cls_map = {
             'Car': label_pb2.Label.TYPE_VEHICLE,
...
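Putting the new signature together, constructing the renamed converter now looks roughly like this. A sketch only: the paths are placeholders and `results` would normally be collected from a test loop rather than left empty.

from mmdet3d.evaluation.functional.waymo_utils import Prediction2Waymo

results = []  # normally one dict per sample, with 'sample_idx' and 'pred_instances_3d'
converter = Prediction2Waymo(
    results,
    waymo_tfrecords_dir='data/waymo/waymo_format/validation',  # placeholder path
    waymo_results_save_dir='work_dirs/waymo_results',          # placeholder path
    waymo_results_final_path='work_dirs/pred.bin',             # placeholder path
    prefix='1',                                 # '1' marks the validation split
    classes=['Car', 'Pedestrian', 'Cyclist'],   # indexed by predicted label id
    workers=2)
converter.convert()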
@@ -70,12 +88,28 @@ class KITTI2Waymo(object):
             'Cyclist': label_pb2.Label.TYPE_CYCLIST,
         }

+        if self.from_kitti_format:
+            self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0],
+                                                [-1.0, 0.0, 0.0, 0.0],
+                                                [0.0, -1.0, 0.0, 0.0],
+                                                [0.0, 0.0, 0.0, 1.0]])
+            # ``sample_idx`` of the sample in kitti-format is an array
+            for idx, result in enumerate(results):
+                if len(result['sample_idx']) > 0:
+                    self.name2idx[str(result['sample_idx'][0])] = idx
+        else:
+            # ``sample_idx`` of the sample in the original prediction
+            # is an int value.
+            for idx, result in enumerate(results):
+                self.name2idx[str(result['sample_idx'])] = idx
+
+        if not self.fast_eval:
+            # need to read original '.tfrecord' file
+            self.get_file_names()
+
+        # turn on eager execution for older tensorflow versions
+        if int(tf.__version__.split('.')[0]) < 2:
+            tf.enable_eager_execution()

         self.create_folder()

     def get_file_names(self):
...
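The two indexing regimes above differ only in how `sample_idx` is stored; a quick self-contained check mirroring the loops in the hunk:

# kitti-format results carry ``sample_idx`` as an array,
# original-format results carry a plain int.
results_kitti = [{'sample_idx': [1000000]}]
results_orig = [{'sample_idx': 1000000}]

name2idx = {str(r['sample_idx'][0]): i for i, r in enumerate(results_kitti)}
assert name2idx == {'1000000': 0}

name2idx = {str(r['sample_idx']): i for i, r in enumerate(results_orig)}
assert name2idx == {'1000000': 0}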
@@ -192,6 +226,13 @@ class KITTI2Waymo(object):
             file_idx (int): Index of the file to be converted.
         """
         file_pathname = self.waymo_tfrecord_pathnames[file_idx]
+        if 's3://' in file_pathname and tf.__version__ >= '2.6.0':
+            try:
+                import tensorflow_io as tfio  # noqa: F401
+            except ImportError:
+                raise ImportError(
+                    "Please run 'pip install tensorflow-io' to install tensorflow_io first."  # noqa: E501
+                )
         file_data = tf.data.TFRecordDataset(file_pathname, compression_type='')

         for frame_num, frame_data in enumerate(file_data):
...
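Each record yielded by the `TFRecordDataset` above is one serialized `open_dataset.Frame`; decoding follows the Waymo devkit's usual pattern. A sketch with a placeholder filename (requires tensorflow and the devkit; reading from `s3://` additionally needs tensorflow-io, per the branch above):

import tensorflow as tf
from waymo_open_dataset import dataset_pb2 as open_dataset

file_data = tf.data.TFRecordDataset('segment-xxxx.tfrecord', compression_type='')
for frame_num, frame_data in enumerate(file_data):
    frame = open_dataset.Frame()
    frame.ParseFromString(bytearray(frame_data.numpy()))
    print(frame_num, frame.context.name, frame.timestamp_micros)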
@@ -200,6 +241,11 @@ class KITTI2Waymo(object):
             filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}'

+            context_name = frame.context.name
+            frame_timestamp_micros = frame.timestamp_micros
+
+            if filename in self.name2idx:
+                if self.from_kitti_format:
                     for camera in frame.context.camera_calibrations:
                         # FRONT = 1, see dataset.proto for details
                         if camera.name == 1:
...
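The lookup key built above concatenates the split prefix with zero-padded file and frame indices; for instance:

# prefix '1' (validation), 8th tfrecord (index 7), frame 42
prefix, file_idx, frame_num = '1', 7, 42
filename = f'{prefix}{file_idx:03d}{frame_num:03d}'
assert filename == '1007042'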
@@ -208,14 +254,17 @@ class KITTI2Waymo(object):
                     T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam

-            context_name = frame.context.name
-            frame_timestamp_micros = frame.timestamp_micros
-
-            if filename in self.name2idx:
-                kitti_result = \
-                    self.kitti_result_files[self.name2idx[filename]]
-                objects = self.parse_objects(kitti_result, T_k2w, context_name,
-                                             frame_timestamp_micros)
+                    kitti_result = \
+                        self.results[self.name2idx[filename]]
+                    objects = self.parse_objects(kitti_result, T_k2w,
+                                                 context_name,
+                                                 frame_timestamp_micros)
+                else:
+                    index = self.name2idx[filename]
+                    objects = self.parse_objects_from_origin(
+                        self.results[index], context_name,
+                        frame_timestamp_micros)
             else:
                 print(filename, 'not found.')
                 objects = metrics_pb2.Objects()
...
@@ -225,11 +274,100 @@ class KITTI2Waymo(object):
                   'wb') as f:
             f.write(objects.SerializeToString())

+    def convert_one_fast(self, res_index: int):
+        """Convert action for a single file. It reads the metainfo from the
+        preprocessed file offline and is faster.
+
+        Args:
+            res_index (int): The index of the result.
+        """
+        sample_idx = self.results[res_index]['sample_idx']
+        if len(self.results[res_index]['pred_instances_3d']) > 0:
+            objects = self.parse_objects_from_origin(
+                self.results[res_index],
+                self.idx2metainfo[str(sample_idx)]['contextname'],
+                self.idx2metainfo[str(sample_idx)]['timestamp'])
+        else:
+            print(sample_idx, 'not found.')
+            objects = metrics_pb2.Objects()
+
+        with open(
+                join(self.waymo_results_save_dir, f'{sample_idx}.bin'),
+                'wb') as f:
+            f.write(objects.SerializeToString())
+
+    def parse_objects_from_origin(self, result: dict, contextname: str,
+                                  timestamp: str) -> Objects:
+        """Parse objects from the original prediction results.
+
+        Args:
+            result (dict): The original prediction results.
+            contextname (str): The ``contextname`` of sample in waymo.
+            timestamp (str): The ``timestamp`` of sample in waymo.
+
+        Returns:
+            metrics_pb2.Objects: The parsed object.
+        """
+        lidar_boxes = result['pred_instances_3d']['bboxes_3d'].tensor
+        scores = result['pred_instances_3d']['scores_3d']
+        labels = result['pred_instances_3d']['labels_3d']
+
+        def parse_one_object(index):
+            class_name = self.classes[labels[index].item()]
+
+            box = label_pb2.Label.Box()
+            height = lidar_boxes[index][5].item()
+            heading = lidar_boxes[index][6].item()
+
+            while heading < -np.pi:
+                heading += 2 * np.pi
+            while heading > np.pi:
+                heading -= 2 * np.pi
+
+            box.center_x = lidar_boxes[index][0].item()
+            box.center_y = lidar_boxes[index][1].item()
+            box.center_z = lidar_boxes[index][2].item() + height / 2
+            box.length = lidar_boxes[index][3].item()
+            box.width = lidar_boxes[index][4].item()
+            box.height = height
+            box.heading = heading
+
+            o = metrics_pb2.Object()
+            o.object.box.CopyFrom(box)
+            o.object.type = self.k2w_cls_map[class_name]
+            o.score = scores[index].item()
+            o.context_name = contextname
+            o.frame_timestamp_micros = timestamp
+
+            return o
+
+        objects = metrics_pb2.Objects()
+        for i in range(len(lidar_boxes)):
+            objects.objects.append(parse_one_object(i))
+
+        return objects
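Two details in `parse_one_object` deserve a note: `center_z` is shifted by `height / 2` because mmdet3d's LiDAR boxes are bottom-centered while Waymo boxes use the geometric center, and the two `while` loops simply wrap the yaw into [-pi, pi]. An equivalent closed-form wrap (a sketch, not part of the diff):

import numpy as np

def wrap_heading(heading: float) -> float:
    """Wrap an angle into [-pi, pi), matching the while-loops above."""
    return (heading + np.pi) % (2 * np.pi) - np.pi

assert abs(wrap_heading(3 * np.pi / 2) - (-np.pi / 2)) < 1e-9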
     def convert(self):
         """Convert action."""
         print('Start converting ...')
-        mmengine.track_parallel_progress(self.convert_one, range(len(self)),
-                                         self.workers)
+        convert_func = self.convert_one_fast if self.fast_eval else \
+            self.convert_one
+
+        # from torch.multiprocessing import set_sharing_strategy
+        # # Force using "file_system" sharing strategy for stability
+        # set_sharing_strategy("file_system")
+
+        # mmengine.track_parallel_progress(convert_func, range(len(self)),
+        #                                  self.workers)
+
+        # TODO: Support multiprocessing. Now, multiprocessing evaluation will
+        # cause shared memory error in torch-1.10 and torch-1.11. Details can
+        # be seen in https://github.com/pytorch/pytorch/issues/67864.
+        prog_bar = mmengine.ProgressBar(len(self))
+        for i in range(len(self)):
+            convert_func(i)
+            prog_bar.update()
+
+        print('\nFinished ...')

         # combine all files into one .bin
...
@@ -241,7 +379,8 @@ class KITTI2Waymo(object):
     def __len__(self):
         """Length of the filename list."""
-        return len(self.waymo_tfrecord_pathnames)
+        return len(self.results) if self.fast_eval else len(
+            self.waymo_tfrecord_pathnames)

     def transform(self, T, x, y, z):
         """Transform the coordinates with matrix T.
...
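The body of `transform` falls outside this hunk, but given the 4x4 matrices built earlier (`T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam`), it presumably applies a homogeneous transform to a point. A standalone sketch of that operation (an assumption about the method, not a quote of it):

import numpy as np

def transform(T: np.ndarray, x: float, y: float, z: float):
    """Apply a 4x4 homogeneous matrix T to the point (x, y, z)."""
    pt = T @ np.array([x, y, z, 1.0])
    return pt[0], pt[1], pt[2]

T = np.eye(4)
T[:3, 3] = [1.0, 2.0, 3.0]          # pure translation
print(transform(T, 0.0, 0.0, 0.0))  # -> (1.0, 2.0, 3.0)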