Unverified Commit 6c03a971 authored by Tai-Wang, committed by GitHub

Release v1.1.0rc1

parents 9611c2d0 ca42c312
......@@ -102,10 +102,10 @@ python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py -
python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py --task det --aug --output-dir ${OUTPUT_DIR} --online
```
If you also want to display 2D images with the projected 3D bounding boxes, you need to find a config file that supports multi-modality data loading, and then change the `--task` argument to `multi_modality-det`. An example is shown below:
If you also want to display 2D images with the projected 3D bounding boxes, you need to find a config file that supports multi-modality data loading, and then change the `--task` argument to `multi-modality_det`. An example is shown below:
```shell
python tools/misc/browse_dataset.py configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py --task multi_modality-det --output-dir ${OUTPUT_DIR} --online
python tools/misc/browse_dataset.py configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py --task multi-modality_det --output-dir ${OUTPUT_DIR} --online
```
![](../../resources/browse_dataset_multi_modality.png)
......@@ -121,7 +121,7 @@ python tools/misc/browse_dataset.py configs/_base_/datasets/scannet-seg.py --tas
Browse the nuScenes dataset in the monocular 3D detection task
```shell
python tools/misc/browse_dataset.py configs/_base_/datasets/nus-mono3d.py --task mono-det --output-dir ${OUTPUT_DIR} --online
python tools/misc/browse_dataset.py configs/_base_/datasets/nus-mono3d.py --task mono_det --output-dir ${OUTPUT_DIR} --online
```
![](../../resources/browse_dataset_mono.png)
......
......@@ -143,6 +143,7 @@ def inference_detector(model: nn.Module,
# load from point cloud file
data_ = dict(
lidar_points=dict(lidar_path=pcd),
timestamp=1,
# for ScanNet demo we need axis_align_matrix
axis_align_matrix=np.eye(4),
box_type_3d=box_type_3d,
......@@ -151,6 +152,7 @@ def inference_detector(model: nn.Module,
# directly use loaded point cloud
data_ = dict(
points=pcd,
timestamp=1,
# for ScanNet demo we need axis_align_matrix
axis_align_matrix=np.eye(4),
box_type_3d=box_type_3d,
......
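For context, a hedged usage sketch of the inference helper modified above; the config and checkpoint paths are placeholders, and the `(results, data)` return shape is assumed from the surrounding 1.x API:

```python
# A minimal sketch; paths are hypothetical and any mmdet3d 1.x
# config/checkpoint pair is handled the same way.
from mmdet3d.apis import inference_detector, init_model

model = init_model('configs/some_config.py', 'some_checkpoint.pth',
                   device='cuda:0')
# `pcd` may be a point cloud file path (the lidar_path branch above)
# or an already-loaded point array (the points branch above).
results, data = inference_detector(model, 'demo/data/kitti/000008.bin')
```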
# Copyright (c) OpenMMLab. All rights reserved.
from .builder import DATASETS, PIPELINES, build_dataset
from .convert_utils import get_2d_boxes
from .dataset_wrappers import CBGSDataset
from .det3d_dataset import Det3DDataset
from .kitti_dataset import KittiDataset
......@@ -22,8 +21,8 @@ from .transforms import (AffineResize, BackgroundPointsFilter, GlobalAlignment,
ObjectNameFilter, ObjectNoise, ObjectRangeFilter,
ObjectSample, PointSample, PointShuffle,
PointsRangeFilter, RandomDropPointsColor,
RandomFlip3D, RandomJitterPoints, RandomShiftScale,
VoxelBasedPointSampler)
RandomFlip3D, RandomJitterPoints, RandomResize3D,
RandomShiftScale, Resize3D, VoxelBasedPointSampler)
from .utils import get_loading_pipeline
from .waymo_dataset import WaymoDataset
......@@ -40,5 +39,6 @@ __all__ = [
'LoadPointsFromMultiSweeps', 'WaymoDataset', 'BackgroundPointsFilter',
'VoxelBasedPointSampler', 'get_loading_pipeline', 'RandomDropPointsColor',
'RandomJitterPoints', 'ObjectNameFilter', 'AffineResize',
'RandomShiftScale', 'LoadPointsFromDict', 'PIPELINES', 'get_2d_boxes'
'RandomShiftScale', 'LoadPointsFromDict', 'PIPELINES',
'Resize3D', 'RandomResize3D',
]
# Copyright (c) OpenMMLab. All rights reserved.
import copy
from collections import OrderedDict
from typing import List, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
from nuscenes.utils.geometry_utils import view_points
......@@ -11,6 +11,11 @@ from shapely.geometry import MultiPoint, box
from mmdet3d.structures import Box3DMode, CameraInstance3DBoxes, points_cam2img
from mmdet3d.structures.ops import box_np_ops
kitti_categories = ('Pedestrian', 'Cyclist', 'Car', 'Van', 'Truck',
'Person_sitting', 'Tram', 'Misc')
waymo_categories = ('Car', 'Pedestrian', 'Cyclist')
nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
......@@ -48,8 +53,10 @@ LyftNameMapping = {
}
def get_2d_boxes(nusc, sample_data_token: str, visibilities: List[str]):
"""Get the 2D annotation records for a given `sample_data_token`.
def get_nuscenes_2d_boxes(nusc, sample_data_token: str,
visibilities: List[str]):
"""Get the 2d / mono3d annotation records for a given `sample_data_token of
nuscenes dataset.
Args:
sample_data_token (str): Sample data token belonging to a camera
......@@ -57,7 +64,7 @@ def get_2d_boxes(nusc, sample_data_token: str, visibilities: List[str]):
visibilities (list[str]): Visibility filter.
Returns:
list[dict]: List of 2D annotation record that belongs to the input
list[dict]: List of 2d annotation records that belong to the input
`sample_data_token`.
"""
......@@ -128,7 +135,7 @@ def get_2d_boxes(nusc, sample_data_token: str, visibilities: List[str]):
# Generate dictionary record to be included in the .json file.
repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
sample_data_token, sd_rec['filename'])
'nuscenes')
# if repro_rec is None, we do not append it into repro_recs
if repro_rec is not None:
......@@ -178,23 +185,36 @@ def get_2d_boxes(nusc, sample_data_token: str, visibilities: List[str]):
return repro_recs
def get_waymo_2d_boxes(info, cam_idx, occluded, annos=None, mono3d=True):
"""Get the 2D annotation records for a given info.
def get_kitti_style_2d_boxes(info: dict,
cam_idx: int = 2,
occluded: Tuple[int] = (0, 1, 2, 3),
annos: Optional[dict] = None,
mono3d: bool = True,
dataset: str = 'kitti'):
"""Get the 2d / mono3d annotation records for a given info.
This function is used to get 2D annotations when loading annotations from
a dataset class. The original version in the data converter will be
deprecated in the future.
This function is used to get 2D/Mono3D annotations when loading annotations
from a KITTI-style dataset class, such as the KITTI and Waymo datasets.
Args:
info: Information of the given sample data.
occluded: Integer (0, 1, 2, 3) indicating occlusion state:
info (dict): Information of the given sample data.
cam_idx (int): Camera id to which the 2d / mono3d annotations to
obtain belong. In KITTI, typically only CAM 2 is used,
while in Waymo, multiple cameras can be used.
Defaults to 2.
occluded (tuple[int]): Integer (0, 1, 2, 3) indicating occlusion state:
0 = fully visible, 1 = partly occluded, 2 = largely occluded,
3 = unknown, -1 = DontCare
3 = unknown, -1 = DontCare.
Defaults to (0, 1, 2, 3).
annos (dict, optional): Original annotations.
mono3d (bool): Whether to get boxes with mono3d annotation.
Defaults to True.
dataset (str): Name of the dataset from which the 2d bboxes are
obtained. Defaults to `kitti`.
Returns:
list[dict]: List of 2D annotation record that belongs to the input
`sample_data_token`.
list[dict]: List of 2d / mono3d annotation records that
belong to the input camera id.
"""
# Get calibration information
camera_intrinsic = info['calib'][f'P{cam_idx}']
......@@ -224,7 +244,6 @@ def get_waymo_2d_boxes(info, cam_idx, occluded, annos=None, mono3d=True):
ann_rec['sample_annotation_token'] = \
f"{info['image']['image_idx']}.{ann_idx}"
ann_rec['sample_data_token'] = info['image']['image_idx']
sample_data_token = info['image']['image_idx']
loc = ann_rec['location'][np.newaxis, :]
dim = ann_rec['dimensions'][np.newaxis, :]
......@@ -266,9 +285,8 @@ def get_waymo_2d_boxes(info, cam_idx, occluded, annos=None, mono3d=True):
min_x, min_y, max_x, max_y = final_coords
# Generate dictionary record to be included in the .json file.
repro_rec = generate_waymo_mono3d_record(ann_rec, min_x, min_y, max_x,
max_y, sample_data_token,
info['image']['image_path'])
repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
dataset)
# If mono3d=True, add 3D annotations in camera coordinates
if mono3d and (repro_rec is not None):
......@@ -288,11 +306,7 @@ def get_waymo_2d_boxes(info, cam_idx, occluded, annos=None, mono3d=True):
# samples with depth < 0 will be removed
if repro_rec['depth'] <= 0:
continue
repro_rec['attribute_name'] = -1 # no attribute in KITTI
repro_rec['attribute_id'] = -1
repro_recs.append(repro_rec)
repro_recs.append(repro_rec)
return repro_recs
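A hedged call sketch for `get_kitti_style_2d_boxes`; here `info` stands for one KITTI-style sample info dict (with 'calib', 'image' and 'annos' entries) as produced by the data converters:

```python
# `info` is assumed to be one entry of a kitti_infos_*.pkl file.
repro_recs = get_kitti_style_2d_boxes(
    info,
    cam_idx=2,        # KITTI's commonly used camera
    occluded=(0, 1),  # keep only fully visible / partly occluded boxes
    annos=None,       # presumably falls back to the annotations in `info`
    mono3d=True,      # also attach 3d fields such as depth and center_2d
    dataset='kitti')
```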
......@@ -355,7 +369,7 @@ def post_process_coords(
def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float,
sample_data_token: str, filename: str) -> OrderedDict:
dataset: str) -> OrderedDict:
"""Generate one 2D annotation record given various information on top of
the 2D bounding box coordinates.
......@@ -365,112 +379,40 @@ def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float,
y1 (float): Minimum value of the y coordinate.
x2 (float): Maximum value of the x coordinate.
y2 (float): Maximum value of the y coordinate.
sample_data_token (str): Sample data token.
filename (str):The corresponding image file where the annotation
is present.
dataset (str): Name of dataset.
Returns:
dict: A sample mono3D annotation record.
- bbox_label (int): 2d box label id
- bbox_label_3d (int): 3d box label id
- bbox (list[float]): left x, top y, right x, bottom y
of 2d box
- bbox_3d_isvalid (bool): whether the box is valid
dict: A sample 2d annotation record.
- bbox_label (int): 2d box label id
- bbox_label_3d (int): 3d box label id
- bbox (list[float]): left x, top y, right x, bottom y
of 2d box
- bbox_3d_isvalid (bool): whether the box is valid
"""
repro_rec = OrderedDict()
repro_rec['sample_data_token'] = sample_data_token
coco_rec = dict()
relevant_keys = [
'attribute_tokens',
'category_name',
'instance_token',
'next',
'num_lidar_pts',
'num_radar_pts',
'prev',
'sample_annotation_token',
'sample_data_token',
'visibility_token',
]
for key, value in ann_rec.items():
if key in relevant_keys:
repro_rec[key] = value
repro_rec['bbox_corners'] = [x1, y1, x2, y2]
repro_rec['filename'] = filename
if repro_rec['category_name'] not in NuScenesNameMapping:
return None
cat_name = NuScenesNameMapping[repro_rec['category_name']]
coco_rec['bbox_label'] = nus_categories.index(cat_name)
coco_rec['bbox_label_3d'] = nus_categories.index(cat_name)
coco_rec['bbox'] = [x1, y1, x2, y2]
coco_rec['bbox_3d_isvalid'] = True
return coco_rec
def generate_waymo_mono3d_record(ann_rec, x1, y1, x2, y2, sample_data_token,
filename):
"""Generate one 2D annotation record given various information on top of
the 2D bounding box coordinates.
The original version in the data converter will be deprecated in the
future.
if dataset == 'nuscenes':
cat_name = ann_rec['category_name']
if cat_name not in NuScenesNameMapping:
return None
else:
cat_name = NuScenesNameMapping[cat_name]
categories = nus_categories
else:
    if dataset == 'kitti':
        categories = kitti_categories
    elif dataset == 'waymo':
        categories = waymo_categories
    else:
        raise NotImplementedError('Unsupported dataset!')
    cat_name = ann_rec['name']
    # the category tuple must be resolved before the membership check
    if cat_name not in categories:
        return None
Args:
ann_rec (dict): Original 3d annotation record.
x1 (float): Minimum value of the x coordinate.
y1 (float): Minimum value of the y coordinate.
x2 (float): Maximum value of the x coordinate.
y2 (float): Maximum value of the y coordinate.
sample_data_token (str): Sample data token.
filename (str):The corresponding image file where the annotation
is present.
rec = dict()
rec['bbox_label'] = categories.index(cat_name)
rec['bbox_label_3d'] = rec['bbox_label']
rec['bbox'] = [x1, y1, x2, y2]
rec['bbox_3d_isvalid'] = True
Returns:
dict: A sample 2D annotation record.
- file_name (str): file name
- image_id (str): sample data token
- area (float): 2d box area
- category_name (str): category name
- category_id (int): category id
- bbox (list[float]): left x, top y, x_size, y_size of 2d box
- iscrowd (int): whether the area is crowd
"""
kitti_categories = ('Car', 'Pedestrian', 'Cyclist')
repro_rec = OrderedDict()
repro_rec['sample_data_token'] = sample_data_token
coco_rec = dict()
key_mapping = {
'name': 'category_name',
'num_points_in_gt': 'num_lidar_pts',
'sample_annotation_token': 'sample_annotation_token',
'sample_data_token': 'sample_data_token',
}
for key, value in ann_rec.items():
if key in key_mapping.keys():
repro_rec[key_mapping[key]] = value
repro_rec['bbox_corners'] = [x1, y1, x2, y2]
repro_rec['filename'] = filename
coco_rec['file_name'] = filename
coco_rec['image_id'] = sample_data_token
coco_rec['area'] = (y2 - y1) * (x2 - x1)
if repro_rec['category_name'] not in kitti_categories:
return None
cat_name = repro_rec['category_name']
coco_rec['category_name'] = cat_name
coco_rec['category_id'] = kitti_categories.index(cat_name)
coco_rec['bbox_label'] = coco_rec['category_id']
coco_rec['bbox_label_3d'] = coco_rec['bbox_label']
coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
coco_rec['iscrowd'] = 0
return coco_rec
return rec
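A minimal sketch of what the rewritten `generate_record` returns, using a hypothetical KITTI annotation ('Car' sits at index 2 of `kitti_categories`):

```python
ann_rec = {'name': 'Car'}  # hypothetical KITTI-style annotation record
rec = generate_record(ann_rec, 10.0, 20.0, 110.0, 80.0, dataset='kitti')
# rec == {'bbox_label': 2, 'bbox_label_3d': 2,
#         'bbox': [10.0, 20.0, 110.0, 80.0], 'bbox_3d_isvalid': True}
```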
......@@ -26,11 +26,11 @@ class Det3DDataset(BaseDataset):
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
data_prefix (dict, optional): Prefix for training data. Defaults to
dict(pts='velodyne', img="").
dict(pts='velodyne', img='').
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input, it usually has following keys.
as input, it usually has the following keys:
- use_camera: bool
- use_lidar: bool
......@@ -40,7 +40,7 @@ class Det3DDataset(BaseDataset):
box_type_3d (str, optional): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
to its original format and then convert it to `box_type_3d`.
Defaults to 'LiDAR'. Available options includes
Defaults to 'LiDAR'. Available options include:
- 'LiDAR': Box in LiDAR coordinates, usually for
outdoor point cloud 3d detection.
......@@ -49,15 +49,15 @@ class Det3DDataset(BaseDataset):
- 'Camera': Box in camera coordinates, usually
for vision-based 3d detection.
filter_empty_gt (bool): Whether to filter the data with
filter_empty_gt (bool, optional): Whether to filter the data with
empty GT. Defaults to True.
test_mode (bool): Whether the dataset is in test mode.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
load_eval_anns (bool): Whether to load annotations
in test_mode, the annotation will be save in
`eval_ann_infos`, which can be use in Evaluator.
file_client_args (dict): Configuration of file client.
Defaults to `dict(backend='disk')`.
load_eval_anns (bool, optional): Whether to load annotations
in test_mode; the annotations will be saved in `eval_ann_infos`,
which can be used in the Evaluator. Defaults to True.
file_client_args (dict, optional): Configuration of file client.
Defaults to dict(backend='disk').
"""
def __init__(self,
......@@ -73,7 +73,7 @@ class Det3DDataset(BaseDataset):
test_mode: bool = False,
load_eval_anns=True,
file_client_args: dict = dict(backend='disk'),
**kwargs):
**kwargs) -> None:
# init file client
self.file_client = mmengine.FileClient(**file_client_args)
self.filter_empty_gt = filter_empty_gt
......@@ -125,7 +125,7 @@ class Det3DDataset(BaseDataset):
self.metainfo['box_type_3d'] = box_type_3d
self.metainfo['label_mapping'] = self.label_mapping
def _remove_dontcare(self, ann_info):
def _remove_dontcare(self, ann_info: dict) -> dict:
"""Remove annotations that do not need to be cared.
-1 indicate dontcare in MMDet3d.
......@@ -192,7 +192,8 @@ class Det3DDataset(BaseDataset):
'bbox_3d': 'gt_bboxes_3d',
'depth': 'depths',
'center_2d': 'centers_2d',
'attr_label': 'attr_labels'
'attr_label': 'attr_labels',
'velocity': 'velocities',
}
instances = info['instances']
# empty gt
......@@ -209,14 +210,18 @@ class Det3DDataset(BaseDataset):
self.label_mapping[item] for item in temp_anns
]
if ann_name in name_mapping:
ann_name = name_mapping[ann_name]
mapped_ann_name = name_mapping[ann_name]
else:
mapped_ann_name = ann_name
if 'label' in ann_name:
temp_anns = np.array(temp_anns).astype(np.int64)
else:
elif ann_name in name_mapping:
temp_anns = np.array(temp_anns).astype(np.float32)
else:
temp_anns = np.array(temp_anns)
ann_info[ann_name] = temp_anns
ann_info[mapped_ann_name] = temp_anns
ann_info['instances'] = info['instances']
return ann_info
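A small standalone sketch of the mapping loop above: instance fields are renamed to mmdet3d's conventions, label-like fields are cast to int64, and mapped numeric fields to float32 (values are hypothetical):

```python
import numpy as np

name_mapping = {'bbox_3d': 'gt_bboxes_3d', 'bbox_label_3d': 'gt_labels_3d'}
instances = [{'bbox_3d': [0, 0, 0, 1, 1, 1, 0], 'bbox_label_3d': 1}]

ann_info = {}
for ann_name in instances[0]:
    temp_anns = [inst[ann_name] for inst in instances]
    mapped_ann_name = name_mapping.get(ann_name, ann_name)
    if 'label' in ann_name:
        temp_anns = np.array(temp_anns).astype(np.int64)
    elif ann_name in name_mapping:
        temp_anns = np.array(temp_anns).astype(np.float32)
    else:
        temp_anns = np.array(temp_anns)
    ann_info[mapped_ann_name] = temp_anns
# ann_info == {'gt_bboxes_3d': float32 array of shape (1, 7),
#              'gt_labels_3d': int64 array of shape (1,)}
```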
......@@ -241,6 +246,7 @@ class Det3DDataset(BaseDataset):
self.data_prefix.get('pts', ''),
info['lidar_points']['lidar_path'])
info['num_pts_feats'] = info['lidar_points']['num_pts_feats']
info['lidar_path'] = info['lidar_points']['lidar_path']
if 'lidar_sweeps' in info:
for sweep in info['lidar_sweeps']:
......@@ -285,7 +291,7 @@ class Det3DDataset(BaseDataset):
return info
def prepare_data(self, index):
def prepare_data(self, index: int) -> Optional[dict]:
"""Data preparation for both training and testing stage.
Called by `__getitem__` of dataset.
......@@ -294,7 +300,7 @@ class Det3DDataset(BaseDataset):
index (int): Index for accessing the target data.
Returns:
dict: Data dict of the corresponding index.
dict | None: Data dict of the corresponding index.
"""
input_dict = self.get_data_info(index)
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Callable, List, Optional, Union
from typing import Callable, List, Union
import numpy as np
......@@ -22,11 +22,12 @@ class KittiDataset(Det3DDataset):
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to `dict(use_lidar=True)`.
default_cam_key (str, optional): The default camera name adopted.
Defaults to 'CAM2'.
box_type_3d (str, optional): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
to its original format and then convert it to `box_type_3d`.
Defaults to 'LiDAR' in this dataset. Available options includes
Defaults to 'LiDAR' in this dataset. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
......@@ -35,9 +36,9 @@ class KittiDataset(Det3DDataset):
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
pcd_limit_range (list, optional): The range of point cloud used to
filter invalid predicted boxes.
Default: [0, -40, -3, 70.4, 40, 0.0].
pcd_limit_range (list[float], optional): The range of point cloud
used to filter invalid predicted boxes.
Defaults to [0, -40, -3, 70.4, 40, 0.0].
"""
# TODO: use full classes of kitti
METAINFO = {
......@@ -49,15 +50,18 @@ class KittiDataset(Det3DDataset):
data_root: str,
ann_file: str,
pipeline: List[Union[dict, Callable]] = [],
modality: Optional[dict] = dict(use_lidar=True),
modality: dict = dict(use_lidar=True),
default_cam_key: str = 'CAM2',
task: str = 'lidar_det',
box_type_3d: str = 'LiDAR',
filter_empty_gt: bool = True,
test_mode: bool = False,
pcd_limit_range: List[float] = [0, -40, -3, 70.4, 40, 0.0],
**kwargs):
**kwargs) -> None:
self.pcd_limit_range = pcd_limit_range
assert task in ('lidar_det', 'mono_det')
self.task = task
super().__init__(
data_root=data_root,
ann_file=ann_file,
......@@ -107,11 +111,14 @@ class KittiDataset(Det3DDataset):
info['plane'] = plane_lidar
if self.task == 'mono_det':
info['instances'] = info['cam_instances'][self.default_cam_key]
info = super().parse_data_info(info)
return info
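A hedged construction sketch for the new `task` switch (paths are hypothetical); with `task='mono_det'` the per-camera `cam_instances` replace the lidar instances as shown above:

```python
from mmdet3d.datasets import KittiDataset

dataset = KittiDataset(
    data_root='data/kitti/',           # hypothetical data root
    ann_file='kitti_infos_train.pkl',  # hypothetical info file
    task='mono_det',         # must be 'lidar_det' or 'mono_det'
    default_cam_key='CAM2')  # KITTI typically uses CAM2
```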
def parse_ann_info(self, info):
def parse_ann_info(self, info: dict) -> dict:
"""Get annotation info according to the given index.
Args:
......@@ -135,6 +142,12 @@ class KittiDataset(Det3DDataset):
ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
if self.task == 'mono_det':
ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
ann_info['gt_bboxes_labels'] = np.zeros(0, dtype=np.int64)
ann_info['centers_2d'] = np.zeros((0, 2), dtype=np.float32)
ann_info['depths'] = np.zeros((0), dtype=np.float32)
ann_info = self._remove_dontcare(ann_info)
# in kitti, lidar2cam = R0_rect @ Tr_velo_to_cam
lidar2cam = np.array(info['images']['CAM2']['lidar2cam'])
......
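To make the calibration comment above concrete, a small numpy sketch (identity matrices stand in for real KITTI calibration):

```python
import numpy as np

R0_rect = np.eye(4)         # rectifying rotation, padded to 4x4
Tr_velo_to_cam = np.eye(4)  # LiDAR-to-camera extrinsic, 4x4
lidar2cam = R0_rect @ Tr_velo_to_cam

# A homogeneous LiDAR point maps to the rectified camera frame in one step.
pt_lidar = np.array([10.0, 0.0, -1.0, 1.0])
pt_cam = lidar2cam @ pt_lidar
```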
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List
from typing import Callable, List, Union
import numpy as np
......@@ -24,18 +24,18 @@ class LyftDataset(Det3DDataset):
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
as input. Defaults to dict(use_camera=False, use_lidar=True).
box_type_3d (str): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
to its original format and then convert it to `box_type_3d`.
Defaults to 'LiDAR' in this dataset. Available options includes
Defaults to 'LiDAR' in this dataset. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool): Whether to filter empty GT.
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool): Whether the dataset is in test mode.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
"""
......@@ -48,8 +48,8 @@ class LyftDataset(Det3DDataset):
def __init__(self,
data_root: str,
ann_file: str,
pipeline: List[dict] = None,
modality: Dict = dict(use_camera=False, use_lidar=True),
pipeline: List[Union[dict, Callable]] = [],
modality: dict = dict(use_camera=False, use_lidar=True),
box_type_3d: str = 'LiDAR',
filter_empty_gt: bool = True,
test_mode: bool = False,
......
# Copyright (c) OpenMMLab. All rights reserved.
from os import path as osp
from typing import Dict, List
from typing import Callable, List, Union
import numpy as np
......@@ -22,25 +22,26 @@ class NuScenesDataset(Det3DDataset):
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
task (str, optional): Detection task. Defaults to 'lidar_det'.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
box_type_3d (str): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
to its original format and then convert it to `box_type_3d`.
Defaults to 'LiDAR' in this dataset. Available options includes.
Defaults to 'LiDAR' in this dataset. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to dict(use_camera=False,use_lidar=True).
filter_empty_gt (bool): Whether to filter empty GT.
as input. Defaults to dict(use_camera=False, use_lidar=True).
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool): Whether the dataset is in test mode.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
with_velocity (bool): Whether include velocity prediction
with_velocity (bool, optional): Whether to include velocity prediction
into the experiments. Defaults to True.
use_valid_flag (bool): Whether to use `use_valid_flag` key
use_valid_flag (bool, optional): Whether to use `use_valid_flag` key
in the info file as a mask to filter gt_boxes and gt_names.
Defaults to False.
"""
......@@ -55,10 +56,10 @@ class NuScenesDataset(Det3DDataset):
def __init__(self,
data_root: str,
ann_file: str,
task: str = '3d',
pipeline: List[dict] = None,
task: str = 'lidar_det',
pipeline: List[Union[dict, Callable]] = [],
box_type_3d: str = 'LiDAR',
modality: Dict = dict(
modality: dict = dict(
use_camera=False,
use_lidar=True,
),
......@@ -66,12 +67,12 @@ class NuScenesDataset(Det3DDataset):
test_mode: bool = False,
with_velocity: bool = True,
use_valid_flag: bool = False,
**kwargs):
**kwargs) -> None:
self.use_valid_flag = use_valid_flag
self.with_velocity = with_velocity
# TODO: Redesign multi-view data process in the future
assert task in ('3d', 'mono3d', 'multi-view')
assert task in ('lidar_det', 'mono_det', 'multi-view_det')
self.task = task
assert box_type_3d.lower() in ('lidar', 'camera')
......@@ -85,6 +86,27 @@ class NuScenesDataset(Det3DDataset):
test_mode=test_mode,
**kwargs)
def _filter_with_mask(self, ann_info: dict) -> dict:
"""Remove annotations that do not need to be cared.
Args:
ann_info (dict): Dict of annotation infos.
Returns:
dict: Annotations after filtering.
"""
filtered_annotations = {}
if self.use_valid_flag:
filter_mask = ann_info['bbox_3d_isvalid']
else:
filter_mask = ann_info['num_lidar_pts'] > 0
for key in ann_info.keys():
if key != 'instances':
filtered_annotations[key] = ann_info[key][filter_mask]
else:
filtered_annotations[key] = ann_info[key]
return filtered_annotations
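A minimal sketch of `_filter_with_mask` with hypothetical annotations; with `use_valid_flag=False`, boxes without lidar points are dropped from every array field while the raw 'instances' list passes through untouched:

```python
import numpy as np

ann_info = {
    'gt_bboxes_3d': np.zeros((3, 7), dtype=np.float32),
    'gt_labels_3d': np.array([0, 1, 2]),
    'num_lidar_pts': np.array([5, 0, 12]),
    'instances': [{'token': 'a'}, {'token': 'b'}, {'token': 'c'}],
}

filter_mask = ann_info['num_lidar_pts'] > 0  # keeps instances 0 and 2
filtered = {
    key: (value[filter_mask] if key != 'instances' else value)
    for key, value in ann_info.items()
}
```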
def parse_ann_info(self, info: dict) -> dict:
"""Get annotation info according to the given index.
......@@ -99,66 +121,51 @@ class NuScenesDataset(Det3DDataset):
- gt_labels_3d (np.ndarray): Labels of ground truths.
"""
ann_info = super().parse_ann_info(info)
if ann_info is None:
# empty instance
anns_results = dict()
anns_results['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
anns_results['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
return anns_results
if self.use_valid_flag:
mask = ann_info['bbox_3d_isvalid']
else:
mask = ann_info['num_lidar_pts'] > 0
gt_bboxes_3d = ann_info['gt_bboxes_3d'][mask]
gt_labels_3d = ann_info['gt_labels_3d'][mask]
if 'gt_bboxes' in ann_info:
gt_bboxes = ann_info['gt_bboxes'][mask]
gt_labels = ann_info['gt_labels'][mask]
attr_labels = ann_info['attr_labels'][mask]
if ann_info is not None:
ann_info = self._filter_with_mask(ann_info)
if self.with_velocity:
gt_bboxes_3d = ann_info['gt_bboxes_3d']
gt_velocities = ann_info['velocities']
nan_mask = np.isnan(gt_velocities[:, 0])
gt_velocities[nan_mask] = [0.0, 0.0]
gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocities],
axis=-1)
ann_info['gt_bboxes_3d'] = gt_bboxes_3d
else:
gt_bboxes = np.zeros((0, 4), dtype=np.float32)
gt_labels = np.array([], dtype=np.int64)
attr_labels = np.array([], dtype=np.int64)
if 'centers_2d' in ann_info:
centers_2d = ann_info['centers_2d'][mask]
depths = ann_info['depths'][mask]
else:
centers_2d = np.zeros((0, 2), dtype=np.float32)
depths = np.zeros((0), dtype=np.float32)
if self.with_velocity:
gt_velocity = ann_info['velocity'][mask]
nan_mask = np.isnan(gt_velocity[:, 0])
gt_velocity[nan_mask] = [0.0, 0.0]
gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1)
# empty instance
ann_info = dict()
if self.with_velocity:
ann_info['gt_bboxes_3d'] = np.zeros((0, 9), dtype=np.float32)
else:
ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
if self.task == 'mono_det':
ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
ann_info['gt_bboxes_labels'] = np.zeros(0, dtype=np.int64)
ann_info['attr_labels'] = np.zeros(0, dtype=np.int64)
ann_info['centers_2d'] = np.zeros((0, 2), dtype=np.float32)
ann_info['depths'] = np.zeros((0), dtype=np.float32)
# the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
# the same as KITTI (0.5, 0.5, 0)
# TODO: Unify the coordinates
if self.task == 'mono3d':
if self.task == 'mono_det':
gt_bboxes_3d = CameraInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
ann_info['gt_bboxes_3d'],
box_dim=ann_info['gt_bboxes_3d'].shape[-1],
origin=(0.5, 0.5, 0.5))
else:
gt_bboxes_3d = LiDARInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
ann_info['gt_bboxes_3d'],
box_dim=ann_info['gt_bboxes_3d'].shape[-1],
origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
anns_results = dict(
gt_bboxes_3d=gt_bboxes_3d,
gt_labels_3d=gt_labels_3d,
gt_bboxes=gt_bboxes,
gt_labels=gt_labels,
attr_labels=attr_labels,
centers_2d=centers_2d,
depths=depths)
ann_info['gt_bboxes_3d'] = gt_bboxes_3d
return anns_results
return ann_info
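A small numpy sketch of the `with_velocity` branch above: NaN velocities are zeroed and concatenated onto the 7-dim boxes, yielding the 9-dim boxes that the empty-instance branch also allocates:

```python
import numpy as np

gt_bboxes_3d = np.zeros((2, 7), dtype=np.float32)  # hypothetical boxes
gt_velocities = np.array([[0.5, 0.1],
                          [np.nan, np.nan]], dtype=np.float32)

nan_mask = np.isnan(gt_velocities[:, 0])
gt_velocities[nan_mask] = [0.0, 0.0]
gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocities], axis=-1)
assert gt_bboxes_3d.shape == (2, 9)
```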
def parse_data_info(self, info: dict) -> dict:
"""Process the raw data info.
......@@ -173,7 +180,7 @@ class NuScenesDataset(Det3DDataset):
dict: Has `ann_info` in training stage. And
all path has been converted to absolute path.
"""
if self.task == 'mono3d':
if self.task == 'mono_det':
data_list = []
if self.modality['use_lidar']:
info['lidar_points']['lidar_path'] = \
......
......@@ -36,7 +36,7 @@ class ScanNetDataset(Det3DDataset):
box_type_3d (str): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
to its original format and then convert it to `box_type_3d`.
Defaults to 'Depth' in this dataset. Available options includes
Defaults to 'Depth' in this dataset. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
......@@ -61,13 +61,13 @@ class ScanNetDataset(Det3DDataset):
def __init__(self,
data_root: str,
ann_file: str,
metainfo: dict = None,
metainfo: Optional[dict] = None,
data_prefix: dict = dict(
pts='points',
pts_instance_mask='instance_mask',
pts_semantic_mask='semantic_mask'),
pipeline: List[Union[dict, Callable]] = [],
modality=dict(use_camera=False, use_lidar=True),
modality: dict = dict(use_camera=False, use_lidar=True),
box_type_3d: str = 'Depth',
filter_empty_gt: bool = True,
test_mode: bool = False,
......@@ -101,7 +101,7 @@ class ScanNetDataset(Det3DDataset):
assert self.modality['use_camera'] or self.modality['use_lidar']
@staticmethod
def _get_axis_align_matrix(info: dict) -> dict:
def _get_axis_align_matrix(info: dict) -> np.ndarray:
"""Get axis_align_matrix from info. If not exist, return identity mat.
Args:
......
......@@ -24,25 +24,25 @@ class SUNRGBDDataset(Det3DDataset):
ann_file (str): Path of annotation file.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
data_prefix (dict): Prefix for data. Defaults to
`dict(pts='points',img='sunrgbd_trainval')`.
data_prefix (dict, optional): Prefix for data. Defaults to
dict(pts='points', img='sunrgbd_trainval').
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to `dict(use_camera=True, use_lidar=True)`.
default_cam_key (str): The default camera name adopted.
Defaults to "CAM0".
as input. Defaults to dict(use_camera=True, use_lidar=True).
default_cam_key (str, optional): The default camera name adopted.
Defaults to 'CAM0'.
box_type_3d (str): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
to its original format and then convert it to `box_type_3d`.
Defaults to 'Depth' in this dataset. Available options includes
Defaults to 'Depth' in this dataset. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool): Whether to filter empty GT.
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool): Whether the dataset is in test mode.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
"""
METAINFO = {
......
......@@ -11,11 +11,12 @@ from .test_time_aug import MultiScaleFlipAug3D
from .transforms_3d import (AffineResize, BackgroundPointsFilter,
GlobalAlignment, GlobalRotScaleTrans,
IndoorPatchPointSample, IndoorPointSample,
ObjectNameFilter, ObjectNoise, ObjectRangeFilter,
ObjectSample, PointSample, PointShuffle,
MultiViewWrapper, ObjectNameFilter, ObjectNoise,
ObjectRangeFilter, ObjectSample,
PhotoMetricDistortion3D, PointSample, PointShuffle,
PointsRangeFilter, RandomDropPointsColor,
RandomFlip3D, RandomJitterPoints, RandomShiftScale,
VoxelBasedPointSampler)
RandomFlip3D, RandomJitterPoints, RandomResize3D,
RandomShiftScale, Resize3D, VoxelBasedPointSampler)
__all__ = [
'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
......@@ -29,5 +30,6 @@ __all__ = [
'VoxelBasedPointSampler', 'GlobalAlignment', 'IndoorPatchPointSample',
'LoadImageFromFileMono3D', 'ObjectNameFilter', 'RandomDropPointsColor',
'RandomJitterPoints', 'AffineResize', 'RandomShiftScale',
'LoadPointsFromDict'
'LoadPointsFromDict', 'Resize3D', 'RandomResize3D',
'MultiViewWrapper', 'PhotoMetricDistortion3D'
]
......@@ -32,7 +32,7 @@ class Compose:
data (dict): A result dict contains the data to transform.
Returns:
dict: Transformed data.
dict: Transformed data.
"""
for t in self.transforms:
......
......@@ -63,15 +63,20 @@ class Pack3DDetInputs(BaseTransform):
def __init__(
self,
keys: dict,
meta_keys: dict = ('img_path', 'ori_shape', 'img_shape', 'lidar2img',
'depth2img', 'cam2img', 'pad_shape', 'scale_factor',
'flip', 'pcd_horizontal_flip', 'pcd_vertical_flip',
'box_mode_3d', 'box_type_3d', 'img_norm_cfg',
'pcd_trans', 'sample_idx', 'pcd_scale_factor',
'pcd_rotation', 'pcd_rotation_angle', 'lidar_path',
'transformation_3d_flow', 'trans_mat',
'affine_aug')):
keys: tuple,
meta_keys: tuple = ('img_path', 'ori_shape', 'img_shape', 'lidar2img',
'depth2img', 'cam2img', 'pad_shape',
'scale_factor', 'flip', 'pcd_horizontal_flip',
'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
'img_norm_cfg', 'num_pts_feats', 'pcd_trans',
'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
'pcd_rotation_angle', 'lidar_path',
'transformation_3d_flow', 'trans_mat',
'affine_aug', 'sweep_img_metas', 'ori_cam2img',
'cam2global', 'crop_offset', 'img_crop_offset',
'resize_img_shape', 'lidar2cam', 'ori_lidar2img',
'num_ref_frames', 'num_views', 'ego2global')
) -> None:
self.keys = keys
self.meta_keys = meta_keys
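For reference, a typical (assumed) config-style pipeline entry for this transform; the exact `keys` depend on the task:

```python
# Commonly the last step of a lidar-based training pipeline.
pack = dict(
    type='Pack3DDetInputs',
    keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
```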
......@@ -98,7 +103,7 @@ class Pack3DDetInputs(BaseTransform):
- img
- 'data_samples' (obj:`Det3DDataSample`): The annotation info of
the sample.
the sample.
"""
# augtest
if isinstance(results, list):
......@@ -115,7 +120,7 @@ class Pack3DDetInputs(BaseTransform):
else:
raise NotImplementedError
def pack_single_results(self, results):
def pack_single_results(self, results: dict) -> dict:
"""Method to pack the single input data. when the value in this dict is
a list, it usually is in Augmentations Testing.
......@@ -131,7 +136,7 @@ class Pack3DDetInputs(BaseTransform):
- points
- img
- 'data_samples' (obj:`Det3DDataSample`): The annotation info
- 'data_samples' (:obj:`Det3DDataSample`): The annotation info
of the sample.
"""
# Format 3D data
......@@ -219,6 +224,7 @@ class Pack3DDetInputs(BaseTransform):
return packed_results
def __repr__(self) -> str:
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(keys={self.keys})'
repr_str += f'(meta_keys={self.meta_keys})'
......
......@@ -16,7 +16,7 @@ class MultiScaleFlipAug3D(BaseTransform):
Args:
transforms (list[dict]): Transforms to apply in each augmentation.
img_scale (tuple | list[tuple]: Images scales for resizing.
img_scale (tuple | list[tuple]): Image scales for resizing.
pts_scale_ratio (float | list[float]): Points scale ratios for
resizing.
flip (bool, optional): Whether apply flip augmentation.
......@@ -25,11 +25,11 @@ class MultiScaleFlipAug3D(BaseTransform):
directions for images, options are "horizontal" and "vertical".
If flip_direction is list, multiple flip augmentations will
be applied. It has no effect when ``flip == False``.
Defaults to "horizontal".
pcd_horizontal_flip (bool, optional): Whether apply horizontal
Defaults to 'horizontal'.
pcd_horizontal_flip (bool, optional): Whether to apply horizontal
flip augmentation to point cloud. Defaults to True.
Note that it works only when 'flip' is turned on.
pcd_vertical_flip (bool, optional): Whether apply vertical flip
pcd_vertical_flip (bool, optional): Whether to apply vertical flip
augmentation to point cloud. Defaults to True.
Note that it works only when 'flip' is turned on.
"""
......@@ -46,7 +46,7 @@ class MultiScaleFlipAug3D(BaseTransform):
self.img_scale = img_scale if isinstance(img_scale,
list) else [img_scale]
self.pts_scale_ratio = pts_scale_ratio \
if isinstance(pts_scale_ratio, list) else[float(pts_scale_ratio)]
if isinstance(pts_scale_ratio, list) else [float(pts_scale_ratio)]
assert mmengine.is_list_of(self.img_scale, tuple)
assert mmengine.is_list_of(self.pts_scale_ratio, float)
......
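A hedged test-time-augmentation config sketch for this transform; the inner `transforms` list is deliberately left abstract:

```python
tta = dict(
    type='MultiScaleFlipAug3D',
    img_scale=(1333, 800),  # a single scale; a list of tuples also works
    pts_scale_ratio=1.0,    # scalars are promoted to [float(...)] as above
    flip=False,
    transforms=[
        # e.g. GlobalRotScaleTrans, RandomFlip3D, PointsRangeFilter, ...
    ])
```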