Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
mmdetection3d
Commits
39b294f5
Commit
39b294f5
authored
Jun 06, 2022
by
jshilong
Committed by
ChaimZhu
Jul 20, 2022
Browse files
Refactor part of transforms
parent
7bb011af
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
691 additions
and
348 deletions
+691
-348
mmdet3d/datasets/pipelines/__init__.py
mmdet3d/datasets/pipelines/__init__.py
+4
-3
mmdet3d/datasets/pipelines/formating.py
mmdet3d/datasets/pipelines/formating.py
+100
-220
mmdet3d/datasets/pipelines/loading.py
mmdet3d/datasets/pipelines/loading.py
+111
-55
mmdet3d/datasets/pipelines/transforms_3d.py
mmdet3d/datasets/pipelines/transforms_3d.py
+114
-64
mmdet3d/datasets/utils.py
mmdet3d/datasets/utils.py
+4
-6
tests/test_data/test_transforms/test_augs.py
tests/test_data/test_transforms/test_augs.py
+79
-0
tests/test_data/test_transforms/test_loading.py
tests/test_data/test_transforms/test_loading.py
+73
-0
tests/test_data/test_transforms/test_pack3d.py
tests/test_data/test_transforms/test_pack3d.py
+36
-0
tests/test_data/test_transforms/utils.py
tests/test_data/test_transforms/utils.py
+170
-0
No files found.
mmdet3d/datasets/pipelines/__init__.py
View file @
39b294f5
# Copyright (c) OpenMMLab. All rights reserved.
from
.compose
import
Compose
from
.dbsampler
import
DataBaseSampler
from
.formating
import
Collect3D
,
DefaultFormatBundle
,
DefaultFormatBundle3D
from
.formating
import
Pack3DDetInputs
from
.loading
import
(
LoadAnnotations3D
,
LoadImageFromFileMono3D
,
LoadMultiViewImageFromFiles
,
LoadPointsFromDict
,
LoadPointsFromFile
,
LoadPointsFromMultiSweeps
,
...
...
@@ -19,9 +19,10 @@ from .transforms_3d import (AffineResize, BackgroundPointsFilter,
__all__
=
[
'ObjectSample'
,
'RandomFlip3D'
,
'ObjectNoise'
,
'GlobalRotScaleTrans'
,
'PointShuffle'
,
'ObjectRangeFilter'
,
'PointsRangeFilter'
,
'Collect3D'
,
'PointShuffle'
,
'ObjectRangeFilter'
,
'PointsRangeFilter'
,
'Pack3DDetInputs'
,
'Compose'
,
'LoadMultiViewImageFromFiles'
,
'LoadPointsFromFile'
,
'DefaultFormatBundle'
,
'DefaultFormatBundle3D'
,
'DataBaseSampler'
,
'DataBaseSampler'
,
'NormalizePointsColor'
,
'LoadAnnotations3D'
,
'IndoorPointSample'
,
'PointSample'
,
'PointSegClassMapping'
,
'MultiScaleFlipAug3D'
,
'LoadPointsFromMultiSweeps'
,
'BackgroundPointsFilter'
,
...
...
mmdet3d/datasets/pipelines/formating.py
View file @
39b294f5
# Copyright (c) OpenMMLab. All rights reserved.
import
numpy
as
np
from
mmcv
.parallel
import
DataContainer
as
DC
from
mmcv
import
BaseTransform
from
mmcv.transforms
import
to_tensor
from
mmengine
import
InstanceData
from
mmdet3d.core
import
Det3DDataSample
from
mmdet3d.core.bbox
import
BaseInstance3DBoxes
from
mmdet3d.core.points
import
BasePoints
from
mmdet3d.registry
import
TRANSFORMS
@
TRANSFORMS
.
register_module
()
class
DefaultFormatBundle
(
object
):
"""Default formatting bundle.
class
Pack3DDetInputs
(
BaseTransform
):
INPUTS_KEYS
=
[
'points'
,
'img'
]
INSTANCEDATA_3D_KEYS
=
[
'gt_bboxes_3d'
,
'gt_labels_3d'
,
'attr_labels'
,
'depths'
,
'centers_2d'
]
INSTANCEDATA_2D_KEYS
=
[
'gt_bboxes'
,
'gt_labels'
,
]
It simplifies the pipeline of formatting common fields, including "img",
"proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg".
These fields are formatted as follows.
SEG_KEYS
=
[
'gt_seg_map'
,
'pts_instance_mask'
,
'pts_semantic_mask'
,
'gt_semantic_seg'
]
- img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
- proposals: (1)to tensor, (2)to DataContainer
- gt_bboxes: (1)to tensor, (2)to DataContainer
- gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
- gt_labels: (1)to tensor, (2)to DataContainer
- gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True)
- gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor,
(3)to DataContainer (stack=True)
"""
def
__init__
(
self
,
keys
:
dict
,
meta_keys
:
dict
=
(
'filename'
,
'ori_shape'
,
'img_shape'
,
'lidar2img'
,
'depth2img'
,
'cam2img'
,
'pad_shape'
,
'scale_factor'
,
'flip'
,
'pcd_horizontal_flip'
,
'pcd_vertical_flip'
,
'box_mode_3d'
,
'box_type_3d'
,
'img_norm_cfg'
,
'pcd_trans'
,
'sample_idx'
,
'pcd_scale_factor'
,
'pcd_rotation'
,
'pcd_rotation_angle'
,
'pts_filename'
,
'transformation_3d_flow'
,
'trans_mat'
,
'affine_aug'
)):
self
.
keys
=
keys
self
.
meta_keys
=
meta_keys
def
__init__
(
self
,
):
return
def
_remove_prefix
(
self
,
key
:
str
)
->
str
:
if
key
.
startswith
(
'gt_'
):
key
=
key
[
3
:]
return
key
def
__call__
(
self
,
results
)
:
"""
Call function to transform and format common fields in results
.
def
transform
(
self
,
results
:
dict
)
->
dict
:
"""
Method to pack the input data
.
Args:
results (dict): Result dict
contains
the data
to convert
.
results (dict): Result dict
from
the data
pipeline
.
Returns:
dict: The result dict contains the data that is formatted with
default bundle.
dict:
- 'inputs' (dict): The forward data of models. It usually contains
following keys:
- points
- img
- 'data_sample' (obj:`Det3DDataSample`): The annotation info of the
sample.
"""
packed_results
=
dict
()
# Format 3D data
if
'points'
in
results
:
assert
isinstance
(
results
[
'points'
],
BasePoints
)
results
[
'points'
]
=
results
[
'points'
].
tensor
if
'img'
in
results
:
if
isinstance
(
results
[
'img'
],
list
):
# process multiple imgs in single frame
imgs
=
[
img
.
transpose
(
2
,
0
,
1
)
for
img
in
results
[
'img'
]]
imgs
=
np
.
ascontiguousarray
(
np
.
stack
(
imgs
,
axis
=
0
))
results
[
'img'
]
=
DC
(
to_tensor
(
imgs
)
,
stack
=
True
)
results
[
'img'
]
=
to_tensor
(
imgs
)
else
:
img
=
np
.
ascontiguousarray
(
results
[
'img'
].
transpose
(
2
,
0
,
1
))
results
[
'img'
]
=
DC
(
to_tensor
(
img
),
stack
=
True
)
img
=
results
[
'img'
]
if
len
(
img
.
shape
)
<
3
:
img
=
np
.
expand_dims
(
img
,
-
1
)
results
[
'img'
]
=
np
.
ascontiguousarray
(
img
.
transpose
(
2
,
0
,
1
))
for
key
in
[
'proposals'
,
'gt_bboxes'
,
'gt_bboxes_ignore'
,
'gt_labels'
,
'gt_labels_3d'
,
'attr_labels'
,
'pts_instance_mask'
,
...
...
@@ -56,211 +90,57 @@ class DefaultFormatBundle(object):
if
key
not
in
results
:
continue
if
isinstance
(
results
[
key
],
list
):
results
[
key
]
=
DC
(
[
to_tensor
(
res
)
for
res
in
results
[
key
]]
)
results
[
key
]
=
[
to_tensor
(
res
)
for
res
in
results
[
key
]]
else
:
results
[
key
]
=
DC
(
to_tensor
(
results
[
key
])
)
results
[
key
]
=
to_tensor
(
results
[
key
])
if
'gt_bboxes_3d'
in
results
:
if
isinstance
(
results
[
'gt_bboxes_3d'
],
BaseInstance3DBoxes
):
results
[
'gt_bboxes_3d'
]
=
DC
(
results
[
'gt_bboxes_3d'
],
cpu_only
=
True
)
else
:
results
[
'gt_bboxes_3d'
]
=
DC
(
to_tensor
(
results
[
'gt_bboxes_3d'
]))
if
not
isinstance
(
results
[
'gt_bboxes_3d'
],
BaseInstance3DBoxes
):
results
[
'gt_bboxes_3d'
]
=
to_tensor
(
results
[
'gt_bboxes_3d'
])
if
'gt_masks'
in
results
:
results
[
'gt_masks'
]
=
DC
(
results
[
'gt_masks'
],
cpu_only
=
True
)
if
'gt_semantic_seg'
in
results
:
results
[
'gt_semantic_seg'
]
=
DC
(
to_tensor
(
results
[
'gt_semantic_seg'
][
None
,
...]),
stack
=
True
)
return
results
def
__repr__
(
self
):
return
self
.
__class__
.
__name__
@
TRANSFORMS
.
register_module
()
class
Collect3D
(
object
):
"""Collect data from the loader relevant to the specific task.
This is usually the last stage of the data loader pipeline. Typically keys
is set to some subset of "img", "proposals", "gt_bboxes",
"gt_bboxes_ignore", "gt_labels", and/or "gt_masks".
The "img_meta" item is always populated. The contents of the "img_meta"
dictionary depends on "meta_keys". By default this includes:
- 'img_shape': shape of the image input to the network as a tuple
(h, w, c). Note that images may be zero padded on the
bottom/right if the batch tensor is larger than this shape.
- 'scale_factor': a float indicating the preprocessing scale
- 'flip': a boolean indicating if image flip transform was used
- 'filename': path to the image file
- 'ori_shape': original shape of the image as a tuple (h, w, c)
- 'pad_shape': image shape after padding
- 'lidar2img': transform from lidar to image
- 'depth2img': transform from depth to image
- 'cam2img': transform from camera to image
- 'pcd_horizontal_flip': a boolean indicating if point cloud is
flipped horizontally
- 'pcd_vertical_flip': a boolean indicating if point cloud is
flipped vertically
- 'box_mode_3d': 3D box mode
- 'box_type_3d': 3D box type
- 'img_norm_cfg': a dict of normalization information:
- mean: per channel mean subtraction
- std: per channel std divisor
- to_rgb: bool indicating if bgr was converted to rgb
- 'pcd_trans': point cloud transformations
- 'sample_idx': sample index
- 'pcd_scale_factor': point cloud scale factor
- 'pcd_rotation': rotation applied to point cloud
- 'pts_filename': path to point cloud file.
Args:
keys (Sequence[str]): Keys of results to be collected in ``data``.
meta_keys (Sequence[str], optional): Meta keys to be converted to
``mmcv.DataContainer`` and collected in ``data[img_metas]``.
Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img',
'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip',
'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
'box_type_3d', 'img_norm_cfg', 'pcd_trans',
'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename')
"""
def
__init__
(
self
,
keys
,
meta_keys
=
(
'filename'
,
'ori_shape'
,
'img_shape'
,
'lidar2img'
,
'depth2img'
,
'cam2img'
,
'pad_shape'
,
'scale_factor'
,
'flip'
,
'pcd_horizontal_flip'
,
'pcd_vertical_flip'
,
'box_mode_3d'
,
'box_type_3d'
,
'img_norm_cfg'
,
'pcd_trans'
,
'sample_idx'
,
'pcd_scale_factor'
,
'pcd_rotation'
,
'pcd_rotation_angle'
,
'pts_filename'
,
'transformation_3d_flow'
,
'trans_mat'
,
'affine_aug'
)):
self
.
keys
=
keys
self
.
meta_keys
=
meta_keys
results
[
'gt_semantic_seg'
]
=
to_tensor
(
results
[
'gt_semantic_seg'
][
None
])
if
'gt_seg_map'
in
results
:
results
[
'gt_seg_map'
]
=
results
[
'gt_seg_map'
][
None
,
...]
def
__call__
(
self
,
results
):
"""Call function to collect keys in results. The keys in ``meta_keys``
will be converted to :obj:`mmcv.DataContainer`.
data_sample
=
Det3DDataSample
()
gt_instances_3d
=
InstanceData
()
gt_instances
=
InstanceData
()
seg_data
=
dict
()
Args:
results (dict): Result dict contains the data to collect.
Returns:
dict: The result dict contains the following keys
- keys in ``self.keys``
- ``img_metas``
"""
data
=
{}
img_metas
=
{}
for
key
in
self
.
meta_keys
:
if
key
in
results
:
img_metas
[
key
]
=
results
[
key
]
data_sample
.
set_metainfo
(
img_metas
)
data
[
'img_metas'
]
=
DC
(
img_metas
,
cpu_only
=
True
)
inputs
=
{}
for
key
in
self
.
keys
:
data
[
key
]
=
results
[
key
]
return
data
def
__repr__
(
self
):
"""str: Return a string that describes the module."""
return
self
.
__class__
.
__name__
+
\
f
'(keys=
{
self
.
keys
}
, meta_keys=
{
self
.
meta_keys
}
)'
@
TRANSFORMS
.
register_module
()
class
DefaultFormatBundle3D
(
DefaultFormatBundle
):
"""Default formatting bundle.
It simplifies the pipeline of formatting common fields for voxels,
including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and
"gt_semantic_seg".
These fields are formatted as follows.
- img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
- proposals: (1)to tensor, (2)to DataContainer
- gt_bboxes: (1)to tensor, (2)to DataContainer
- gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
- gt_labels: (1)to tensor, (2)to DataContainer
"""
def
__init__
(
self
,
class_names
,
with_gt
=
True
,
with_label
=
True
):
super
(
DefaultFormatBundle3D
,
self
).
__init__
()
self
.
class_names
=
class_names
self
.
with_gt
=
with_gt
self
.
with_label
=
with_label
def
__call__
(
self
,
results
):
"""Call function to transform and format common fields in results.
Args:
results (dict): Result dict contains the data to convert.
Returns:
dict: The result dict contains the data that is formatted with
default bundle.
"""
# Format 3D data
if
'points'
in
results
:
assert
isinstance
(
results
[
'points'
],
BasePoints
)
results
[
'points'
]
=
DC
(
results
[
'points'
].
tensor
)
if
key
in
results
:
if
key
in
self
.
INPUTS_KEYS
:
inputs
[
key
]
=
results
[
key
]
elif
key
in
self
.
INSTANCEDATA_3D_KEYS
:
gt_instances_3d
[
self
.
_remove_prefix
(
key
)]
=
results
[
key
]
elif
key
in
self
.
INSTANCEDATA_2D_KEYS
:
gt_instances
[
self
.
_remove_prefix
(
key
)]
=
results
[
key
]
elif
key
in
self
.
SEG_KEYS
:
seg_data
[
self
.
_remove_prefix
(
key
)]
=
results
[
key
]
else
:
raise
NotImplementedError
(
f
'Please modified '
f
'`Pack3DDetInputs` '
f
'to put
{
key
}
to '
f
'corresponding field'
)
for
key
in
[
'voxels'
,
'coors'
,
'voxel_centers'
,
'num_points'
]:
if
key
not
in
results
:
continue
results
[
key
]
=
DC
(
to_tensor
(
results
[
key
]),
stack
=
False
)
data_sample
.
gt_instances_3d
=
gt_instances_3d
data_sample
.
gt_instances
=
gt_instances
data_sample
.
seg_data
=
seg_data
packed_results
[
'data_sample'
]
=
data_sample
packed_results
[
'inputs'
]
=
inputs
if
self
.
with_gt
:
# Clean GT bboxes in the final
if
'gt_bboxes_3d_mask'
in
results
:
gt_bboxes_3d_mask
=
results
[
'gt_bboxes_3d_mask'
]
results
[
'gt_bboxes_3d'
]
=
results
[
'gt_bboxes_3d'
][
gt_bboxes_3d_mask
]
if
'gt_names_3d'
in
results
:
results
[
'gt_names_3d'
]
=
results
[
'gt_names_3d'
][
gt_bboxes_3d_mask
]
if
'centers2d'
in
results
:
results
[
'centers2d'
]
=
results
[
'centers2d'
][
gt_bboxes_3d_mask
]
if
'depths'
in
results
:
results
[
'depths'
]
=
results
[
'depths'
][
gt_bboxes_3d_mask
]
if
'gt_bboxes_mask'
in
results
:
gt_bboxes_mask
=
results
[
'gt_bboxes_mask'
]
if
'gt_bboxes'
in
results
:
results
[
'gt_bboxes'
]
=
results
[
'gt_bboxes'
][
gt_bboxes_mask
]
results
[
'gt_names'
]
=
results
[
'gt_names'
][
gt_bboxes_mask
]
if
self
.
with_label
:
if
'gt_names'
in
results
and
len
(
results
[
'gt_names'
])
==
0
:
results
[
'gt_labels'
]
=
np
.
array
([],
dtype
=
np
.
int64
)
results
[
'attr_labels'
]
=
np
.
array
([],
dtype
=
np
.
int64
)
elif
'gt_names'
in
results
and
isinstance
(
results
[
'gt_names'
][
0
],
list
):
# gt_labels might be a list of list in multi-view setting
results
[
'gt_labels'
]
=
[
np
.
array
([
self
.
class_names
.
index
(
n
)
for
n
in
res
],
dtype
=
np
.
int64
)
for
res
in
results
[
'gt_names'
]
]
elif
'gt_names'
in
results
:
results
[
'gt_labels'
]
=
np
.
array
([
self
.
class_names
.
index
(
n
)
for
n
in
results
[
'gt_names'
]
],
dtype
=
np
.
int64
)
# we still assume one pipeline for one frame LiDAR
# thus, the 3D name is list[string]
if
'gt_names_3d'
in
results
:
results
[
'gt_labels_3d'
]
=
np
.
array
([
self
.
class_names
.
index
(
n
)
for
n
in
results
[
'gt_names_3d'
]
],
dtype
=
np
.
int64
)
results
=
super
(
DefaultFormatBundle3D
,
self
).
__call__
(
results
)
return
results
return
packed_results
def
__repr__
(
self
):
"""str: Return a string that describes the module."""
def
__repr__
(
self
)
->
str
:
repr_str
=
self
.
__class__
.
__name__
repr_str
+=
f
'(
class_names=
{
self
.
class_names
}
,
'
repr_str
+=
f
'
with_gt=
{
self
.
with_gt
}
, with_label=
{
self
.
with_label
}
)'
repr_str
+=
f
'(
keys=
{
self
.
keys
}
)
'
repr_str
+=
f
'
(meta_keys=
{
self
.
meta_keys
}
)'
return
repr_str
mmdet3d/datasets/pipelines/loading.py
View file @
39b294f5
# Copyright (c) OpenMMLab. All rights reserved.
import
mmcv
import
numpy
as
np
from
mmcv
import
BaseTransform
from
mmcv.transforms
import
LoadImageFromFile
from
mmdet3d.core.points
import
BasePoints
,
get_points_type
...
...
@@ -336,10 +337,18 @@ class NormalizePointsColor(object):
@
TRANSFORMS
.
register_module
()
class
LoadPointsFromFile
(
object
):
class
LoadPointsFromFile
(
BaseTransform
):
"""Load Points From File.
Load points from file.
Required Keys:
- lidar_points (dict)
- lidar_path (str)
Added Keys:
- points (np.float32)
Args:
coord_type (str): The type of coordinates of points cloud.
...
...
@@ -362,13 +371,15 @@ class LoadPointsFromFile(object):
for more details. Defaults to dict(backend='disk').
"""
def
__init__
(
self
,
coord_type
,
load_dim
=
6
,
use_dim
=
[
0
,
1
,
2
],
shift_height
=
False
,
use_color
=
False
,
file_client_args
=
dict
(
backend
=
'disk'
)):
def
__init__
(
self
,
coord_type
:
str
,
load_dim
:
int
=
6
,
use_dim
:
list
=
[
0
,
1
,
2
],
shift_height
:
bool
=
False
,
use_color
:
bool
=
False
,
file_client_args
:
dict
=
dict
(
backend
=
'disk'
)
)
->
None
:
self
.
shift_height
=
shift_height
self
.
use_color
=
use_color
if
isinstance
(
use_dim
,
int
):
...
...
@@ -383,7 +394,7 @@ class LoadPointsFromFile(object):
self
.
file_client_args
=
file_client_args
.
copy
()
self
.
file_client
=
None
def
_load_points
(
self
,
pts_filename
)
:
def
_load_points
(
self
,
pts_filename
:
str
)
->
np
.
ndarray
:
"""Private function to load point clouds data.
Args:
...
...
@@ -406,8 +417,8 @@ class LoadPointsFromFile(object):
return
points
def
__call__
(
self
,
results
)
:
"""
Call function
to load points data from file.
def
transform
(
self
,
results
:
dict
)
->
dict
:
"""
Method
to load points data from file.
Args:
results (dict): Result dict containing point clouds data.
...
...
@@ -418,8 +429,8 @@ class LoadPointsFromFile(object):
- points (:obj:`BasePoints`): Point clouds data.
"""
pts_file
name
=
results
[
'
pts_filename
'
]
points
=
self
.
_load_points
(
pts_file
name
)
pts_file
_path
=
results
[
'
lidar_points'
][
'lidar_path
'
]
points
=
self
.
_load_points
(
pts_file
_path
)
points
=
points
.
reshape
(
-
1
,
self
.
load_dim
)
points
=
points
[:,
self
.
use_dim
]
attribute_dims
=
None
...
...
@@ -477,6 +488,52 @@ class LoadAnnotations3D(LoadAnnotations):
Load instance mask and semantic mask of points and
encapsulate the items into related fields.
Required Keys:
- ann_info (dict)
- gt_bboxes_3d (:obj:`LiDARInstance3DBoxes` |
:obj:`DepthInstance3DBoxes` | :obj:`CameraInstance3DBoxes`):
3D ground truth bboxes. Only when `with_bbox_3d` is True
- gt_labels_3d (np.int64): Labels of ground truths.
Only when `with_label_3d` is True.
- gt_bboxes (np.float32): 2D ground truth bboxes.
Only when `with_bbox` is True.
- gt_labels (np.ndarray): Labels of ground truths.
Only when `with_label` is True.
- depths (np.ndarray): Only when
`with_bbox_depth` is True.
- centers_2d (np.ndarray): Only when
`with_bbox_depth` is True.
- attr_labels (np.ndarray): Attribute labels of instances.
Only when `with_attr_label` is True.
- pts_instance_mask_path (str): Path of instance mask file.
Only when `with_mask_3d` is True.
- pts_semantic_mask_path (str): Path of semantic mask file.
Only when
Added Keys:
- gt_bboxes_3d (:obj:`LiDARInstance3DBoxes` |
:obj:`DepthInstance3DBoxes` | :obj:`CameraInstance3DBoxes`):
3D ground truth bboxes. Only when `with_bbox_3d` is True
- gt_labels_3d (np.int64): Labels of ground truths.
Only when `with_label_3d` is True.
- gt_bboxes (np.float32): 2D ground truth bboxes.
Only when `with_bbox` is True.
- gt_labels (np.int64): Labels of ground truths.
Only when `with_label` is True.
- depths (np.float32): Only when
`with_bbox_depth` is True.
- centers_2d (np.ndarray): Only when
`with_bbox_depth` is True.
- attr_labels (np.int64): Attribute labels of instances.
Only when `with_attr_label` is True.
- pts_instance_mask (np.int64): Instance mask of each point.
Only when `with_mask_3d` is True.
- pts_semantic_mask (np.int64): Semantic mask of each point.
Only when `with_seg_3d` is True.
Args:
with_bbox_3d (bool, optional): Whether to load 3D boxes.
Defaults to True.
...
...
@@ -501,32 +558,34 @@ class LoadAnnotations3D(LoadAnnotations):
poly2mask (bool, optional): Whether to convert polygon annotations
to bitmasks. Defaults to True.
seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks.
Defaults to int64
Defaults to int64
.
file_client_args (dict): Config dict of file clients, refer to
https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
for more details.
"""
def
__init__
(
self
,
with_bbox_3d
=
True
,
with_label_3d
=
True
,
with_attr_label
=
False
,
with_mask_3d
=
False
,
with_seg_3d
=
False
,
with_bbox
=
False
,
with_label
=
False
,
with_mask
=
False
,
with_seg
=
False
,
with_bbox_depth
=
False
,
poly2mask
=
True
,
seg_3d_dtype
=
np
.
int64
,
file_client_args
=
dict
(
backend
=
'disk'
)):
def
__init__
(
self
,
with_bbox_3d
:
bool
=
True
,
with_label_3d
:
bool
=
True
,
with_attr_label
:
bool
=
False
,
with_mask_3d
:
bool
=
False
,
with_seg_3d
:
bool
=
False
,
with_bbox
:
bool
=
False
,
with_label
:
bool
=
False
,
with_mask
:
bool
=
False
,
with_seg
:
bool
=
False
,
with_bbox_depth
:
bool
=
False
,
poly2mask
:
bool
=
True
,
seg_3d_dtype
:
np
.
dtype
=
np
.
int64
,
file_client_args
:
dict
=
dict
(
backend
=
'disk'
)
)
->
None
:
super
().
__init__
(
with_bbox
,
with_label
,
with_mask
,
with_seg
,
poly2mask
,
with_bbox
=
with_bbox
,
with_label
=
with_label
,
with_mask
=
with_mask
,
with_seg
=
with_seg
,
poly2mask
=
poly2mask
,
file_client_args
=
file_client_args
)
self
.
with_bbox_3d
=
with_bbox_3d
self
.
with_bbox_depth
=
with_bbox_depth
...
...
@@ -536,8 +595,9 @@ class LoadAnnotations3D(LoadAnnotations):
self
.
with_seg_3d
=
with_seg_3d
self
.
seg_3d_dtype
=
seg_3d_dtype
def
_load_bboxes_3d
(
self
,
results
):
"""Private function to load 3D bounding box annotations.
def
_load_bboxes_3d
(
self
,
results
:
dict
)
->
dict
:
"""Private function to move the 3D bounding box annotation from
`ann_info` field to the root of `results`.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
...
...
@@ -545,11 +605,11 @@ class LoadAnnotations3D(LoadAnnotations):
Returns:
dict: The dict containing loaded 3D bounding box annotations.
"""
results
[
'gt_bboxes_3d'
]
=
results
[
'ann_info'
][
'gt_bboxes_3d'
]
results
[
'bbox3d_fields'
].
append
(
'gt_bboxes_3d'
)
return
results
def
_load_bboxes_depth
(
self
,
results
)
:
def
_load_bboxes_depth
(
self
,
results
:
dict
)
->
dict
:
"""Private function to load 2.5D bounding box annotations.
Args:
...
...
@@ -558,11 +618,12 @@ class LoadAnnotations3D(LoadAnnotations):
Returns:
dict: The dict containing loaded 2.5D bounding box annotations.
"""
results
[
'centers2d'
]
=
results
[
'ann_info'
][
'centers2d'
]
results
[
'depths'
]
=
results
[
'ann_info'
][
'depths'
]
results
[
'centers_2d'
]
=
results
[
'ann_info'
][
'centers_2d'
]
return
results
def
_load_labels_3d
(
self
,
results
)
:
def
_load_labels_3d
(
self
,
results
:
dict
)
->
dict
:
"""Private function to load label annotations.
Args:
...
...
@@ -571,10 +632,11 @@ class LoadAnnotations3D(LoadAnnotations):
Returns:
dict: The dict containing loaded label annotations.
"""
results
[
'gt_labels_3d'
]
=
results
[
'ann_info'
][
'gt_labels_3d'
]
return
results
def
_load_attr_labels
(
self
,
results
)
:
def
_load_attr_labels
(
self
,
results
:
dict
)
->
dict
:
"""Private function to load label annotations.
Args:
...
...
@@ -586,7 +648,7 @@ class LoadAnnotations3D(LoadAnnotations):
results
[
'attr_labels'
]
=
results
[
'ann_info'
][
'attr_labels'
]
return
results
def
_load_masks_3d
(
self
,
results
)
:
def
_load_masks_3d
(
self
,
results
:
dict
)
->
dict
:
"""Private function to load 3D mask annotations.
Args:
...
...
@@ -595,7 +657,7 @@ class LoadAnnotations3D(LoadAnnotations):
Returns:
dict: The dict containing loaded 3D mask annotations.
"""
pts_instance_mask_path
=
results
[
'
ann_info'
][
'
pts_instance_mask_path'
]
pts_instance_mask_path
=
results
[
'pts_instance_mask_path'
]
if
self
.
file_client
is
None
:
self
.
file_client
=
mmcv
.
FileClient
(
**
self
.
file_client_args
)
...
...
@@ -608,10 +670,9 @@ class LoadAnnotations3D(LoadAnnotations):
pts_instance_mask_path
,
dtype
=
np
.
int64
)
results
[
'pts_instance_mask'
]
=
pts_instance_mask
results
[
'pts_mask_fields'
].
append
(
'pts_instance_mask'
)
return
results
def
_load_semantic_seg_3d
(
self
,
results
)
:
def
_load_semantic_seg_3d
(
self
,
results
:
dict
)
->
dict
:
"""Private function to load 3D semantic segmentation annotations.
Args:
...
...
@@ -620,7 +681,7 @@ class LoadAnnotations3D(LoadAnnotations):
Returns:
dict: The dict containing the semantic segmentation annotations.
"""
pts_semantic_mask_path
=
results
[
'
ann_info'
][
'
pts_semantic_mask_path'
]
pts_semantic_mask_path
=
results
[
'pts_semantic_mask_path'
]
if
self
.
file_client
is
None
:
self
.
file_client
=
mmcv
.
FileClient
(
**
self
.
file_client_args
)
...
...
@@ -635,11 +696,10 @@ class LoadAnnotations3D(LoadAnnotations):
pts_semantic_mask_path
,
dtype
=
np
.
int64
)
results
[
'pts_semantic_mask'
]
=
pts_semantic_mask
results
[
'pts_seg_fields'
].
append
(
'pts_semantic_mask'
)
return
results
def
__call__
(
self
,
results
)
:
"""
Call f
unction to load multiple types annotations.
def
transform
(
self
,
results
:
dict
)
->
dict
:
"""
F
unction to load multiple types annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
...
...
@@ -648,15 +708,11 @@ class LoadAnnotations3D(LoadAnnotations):
dict: The dict containing loaded 3D bounding box, label, mask and
semantic segmentation annotations.
"""
results
=
super
().
__call__
(
results
)
results
=
super
().
transform
(
results
)
if
self
.
with_bbox_3d
:
results
=
self
.
_load_bboxes_3d
(
results
)
if
results
is
None
:
return
None
if
self
.
with_bbox_depth
:
results
=
self
.
_load_bboxes_depth
(
results
)
if
results
is
None
:
return
None
if
self
.
with_label_3d
:
results
=
self
.
_load_labels_3d
(
results
)
if
self
.
with_attr_label
:
...
...
mmdet3d/datasets/pipelines/transforms_3d.py
View file @
39b294f5
# Copyright (c) OpenMMLab. All rights reserved.
import
random
import
warnings
from
typing
import
List
import
cv2
import
numpy
as
np
...
...
@@ -75,6 +76,25 @@ class RandomFlip3D(RandomFlip):
otherwise it will be randomly decided by a ratio specified in the init
method.
Required Keys:
- points (np.float32)
- gt_bboxes_3d (np.float32)
Modified Keys:
- points (np.float32)
- gt_bboxes_3d (np.float32)
Added Keys:
- points (np.float32)
- pcd_trans (np.float32)
- pcd_rotation (np.float32)
- pcd_rotation_angle (np.float32)
- pcd_scale_factor (np.float32)
Args:
sync_2d (bool, optional): Whether to apply flip according to the 2D
images. If True, it will apply the same flip as that to 2D images.
...
...
@@ -87,13 +107,17 @@ class RandomFlip3D(RandomFlip):
"""
def
__init__
(
self
,
sync_2d
=
True
,
flip_ratio_bev_horizontal
=
0.0
,
flip_ratio_bev_vertical
=
0.0
,
**
kwargs
):
sync_2d
:
bool
=
True
,
flip_ratio_bev_horizontal
:
float
=
0.0
,
flip_ratio_bev_vertical
:
float
=
0.0
,
**
kwargs
)
->
None
:
# `flip_ratio_bev_horizontal` is equal to
# for flip prob of 2d image when
# `sync_2d` is True
super
(
RandomFlip3D
,
self
).
__init__
(
flip_ratio
=
flip_ratio_bev_horizontal
,
**
kwargs
)
prob
=
flip_ratio_bev_horizontal
,
direction
=
'horizontal'
,
**
kwargs
)
self
.
sync_2d
=
sync_2d
self
.
flip_ratio_bev_horizontal
=
flip_ratio_bev_horizontal
self
.
flip_ratio_bev_vertical
=
flip_ratio_bev_vertical
if
flip_ratio_bev_horizontal
is
not
None
:
assert
isinstance
(
...
...
@@ -104,9 +128,18 @@ class RandomFlip3D(RandomFlip):
flip_ratio_bev_vertical
,
(
int
,
float
))
and
0
<=
flip_ratio_bev_vertical
<=
1
def
random_flip_data_3d
(
self
,
input_dict
,
direction
=
'horizontal'
):
def
random_flip_data_3d
(
self
,
input_dict
:
dict
,
direction
:
str
=
'horizontal'
)
->
None
:
"""Flip 3D data randomly.
`random_flip_data_3d` should take these situations into consideration:
- 1. LIDAR-based 3d detection
- 2. LIDAR-based 3d segmentation
- 3. vision-only detection
- 4. multi-modality 3d detection.
Args:
input_dict (dict): Result dict from loading pipeline.
direction (str, optional): Flip direction.
...
...
@@ -117,27 +150,25 @@ class RandomFlip3D(RandomFlip):
updated in the result dict.
"""
assert
direction
in
[
'horizontal'
,
'vertical'
]
# for semantic segmentation task, only points will be flipped.
if
'bbox3d_fields'
not
in
input_dict
:
input_dict
[
'points'
].
flip
(
direction
)
return
if
len
(
input_dict
[
'bbox3d_fields'
])
==
0
:
# test mode
input_dict
[
'bbox3d_fields'
].
append
(
'empty_box3d'
)
input_dict
[
'empty_box3d'
]
=
input_dict
[
'box_type_3d'
](
np
.
array
([],
dtype
=
np
.
float32
))
assert
len
(
input_dict
[
'bbox3d_fields'
])
==
1
for
key
in
input_dict
[
'bbox3d_fields'
]:
if
'gt_bboxes_3d'
in
input_dict
:
if
'points'
in
input_dict
:
input_dict
[
'points'
]
=
input_dict
[
key
].
flip
(
input_dict
[
'points'
]
=
input_dict
[
'gt_bboxes_3d'
].
flip
(
direction
,
points
=
input_dict
[
'points'
])
else
:
input_dict
[
key
].
flip
(
direction
)
if
'centers2d'
in
input_dict
:
# vision-only detection
input_dict
[
'gt_bboxes_3d'
].
flip
(
direction
)
else
:
input_dict
[
'points'
].
flip
(
direction
)
if
'centers_2d'
in
input_dict
:
assert
self
.
sync_2d
is
True
and
direction
==
'horizontal'
,
\
'Only support sync_2d=True and horizontal flip with images'
# TODO fix this ori_shape and other keys in vision based model
# TODO ori_shape to img_shape
w
=
input_dict
[
'ori_shape'
][
1
]
input_dict
[
'centers2d'
][...,
0
]
=
\
w
-
input_dict
[
'centers2d'
][...,
0
]
input_dict
[
'centers
_
2d'
][...,
0
]
=
\
w
-
input_dict
[
'centers
_
2d'
][...,
0
]
# need to modify the horizontal position of camera center
# along u-axis in the image (flip like centers2d)
# ['cam2img'][0][2] = c_u
...
...
@@ -145,7 +176,7 @@ class RandomFlip3D(RandomFlip):
# https://github.com/open-mmlab/mmdetection3d/pull/744
input_dict
[
'cam2img'
][
0
][
2
]
=
w
-
input_dict
[
'cam2img'
][
0
][
2
]
def
__call__
(
self
,
input_dict
)
:
def
transform
(
self
,
input_dict
:
dict
)
->
dict
:
"""Call function to flip points, values in the ``bbox3d_fields`` and
also flip 2D image and its annotations.
...
...
@@ -158,15 +189,16 @@ class RandomFlip3D(RandomFlip):
into result dict.
"""
# flip 2D image and its annotations
super
(
RandomFlip3D
,
self
).
__call__
(
input_dict
)
if
'img'
in
input_dict
:
super
(
RandomFlip3D
,
self
).
transform
(
input_dict
)
if
self
.
sync_2d
:
if
self
.
sync_2d
and
'img'
in
input_dict
:
input_dict
[
'pcd_horizontal_flip'
]
=
input_dict
[
'flip'
]
input_dict
[
'pcd_vertical_flip'
]
=
False
else
:
if
'pcd_horizontal_flip'
not
in
input_dict
:
flip_horizontal
=
True
if
np
.
random
.
rand
(
)
<
self
.
flip_ratio
else
False
)
<
self
.
flip_ratio
_bev_horizontal
else
False
input_dict
[
'pcd_horizontal_flip'
]
=
flip_horizontal
if
'pcd_vertical_flip'
not
in
input_dict
:
flip_vertical
=
True
if
np
.
random
.
rand
(
...
...
@@ -563,9 +595,27 @@ class GlobalAlignment(object):
@
TRANSFORMS
.
register_module
()
class
GlobalRotScaleTrans
(
object
):
class
GlobalRotScaleTrans
(
BaseTransform
):
"""Apply global rotation, scaling and translation to a 3D scene.
Required Keys:
- points (np.float32)
- gt_bboxes_3d (np.float32)
Modified Keys:
- points (np.float32)
- gt_bboxes_3d (np.float32)
Added Keys:
- points (np.float32)
- pcd_trans (np.float32)
- pcd_rotation (np.float32)
- pcd_rotation_angle (np.float32)
- pcd_scale_factor (np.float32)
Args:
rot_range (list[float], optional): Range of rotation angle.
Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]).
...
...
@@ -581,10 +631,10 @@ class GlobalRotScaleTrans(object):
"""
def
__init__
(
self
,
rot_range
=
[
-
0.78539816
,
0.78539816
],
scale_ratio_range
=
[
0.95
,
1.05
],
translation_std
=
[
0
,
0
,
0
],
shift_height
=
False
)
:
rot_range
:
List
[
float
]
=
[
-
0.78539816
,
0.78539816
],
scale_ratio_range
:
List
[
float
]
=
[
0.95
,
1.05
],
translation_std
:
List
[
int
]
=
[
0
,
0
,
0
],
shift_height
:
bool
=
False
)
->
None
:
seq_types
=
(
list
,
tuple
,
np
.
ndarray
)
if
not
isinstance
(
rot_range
,
seq_types
):
assert
isinstance
(
rot_range
,
(
int
,
float
)),
\
...
...
@@ -594,6 +644,7 @@ class GlobalRotScaleTrans(object):
assert
isinstance
(
scale_ratio_range
,
seq_types
),
\
f
'unsupported scale_ratio_range type
{
type
(
scale_ratio_range
)
}
'
self
.
scale_ratio_range
=
scale_ratio_range
if
not
isinstance
(
translation_std
,
seq_types
):
...
...
@@ -607,7 +658,7 @@ class GlobalRotScaleTrans(object):
self
.
translation_std
=
translation_std
self
.
shift_height
=
shift_height
def
_trans_bbox_points
(
self
,
input_dict
)
:
def
_trans_bbox_points
(
self
,
input_dict
:
dict
)
->
None
:
"""Private function to translate bounding boxes and points.
Args:
...
...
@@ -615,7 +666,7 @@ class GlobalRotScaleTrans(object):
Returns:
dict: Results after translation, 'points', 'pcd_trans'
and keys in input_dict['bbox3d_fields'] are
updated
and `gt_bboxes_3d` is
updated
in the result dict.
"""
translation_std
=
np
.
array
(
self
.
translation_std
,
dtype
=
np
.
float32
)
...
...
@@ -623,10 +674,10 @@ class GlobalRotScaleTrans(object):
input_dict
[
'points'
].
translate
(
trans_factor
)
input_dict
[
'pcd_trans'
]
=
trans_factor
for
key
in
input_dict
[
'bbox3d_fields'
]
:
input_dict
[
key
].
translate
(
trans_factor
)
if
'gt_bboxes_3d'
in
input_dict
:
input_dict
[
'gt_bboxes_3d'
].
translate
(
trans_factor
)
def _rot_bbox_points(self, input_dict: dict) -> None:
    """Private function to rotate bounding boxes and points.

    Args:
        input_dict (dict): Result dict from loading pipeline.

    Returns:
        None: 'points', 'pcd_rotation', 'pcd_rotation_angle' and
        `gt_bboxes_3d` are updated in place. (The previous docstring
        claimed a dict was returned, contradicting the ``-> None``
        signature.)
    """
    rotation = self.rot_range
    noise_rotation = np.random.uniform(rotation[0], rotation[1])

    if 'gt_bboxes_3d' in input_dict and \
            len(input_dict['gt_bboxes_3d'].tensor) != 0:
        # rotate points together with bboxes so both stay consistent
        points, rot_mat_T = input_dict['gt_bboxes_3d'].rotate(
            noise_rotation, input_dict['points'])
        input_dict['points'] = points
    else:
        # if no bbox in input_dict, only rotate points
        rot_mat_T = input_dict['points'].rotate(noise_rotation)

    input_dict['pcd_rotation'] = rot_mat_T
    input_dict['pcd_rotation_angle'] = noise_rotation
def
_scale_bbox_points
(
self
,
input_dict
)
:
def
_scale_bbox_points
(
self
,
input_dict
:
dict
)
->
None
:
"""Private function to scale bounding boxes and points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after scaling, 'points'and keys in
input_dict['bbox3d_fields'] are updated in the result dict.
dict: Results after scaling, 'points' and
`gt_bboxes_3d` is updated
in the result dict.
"""
scale
=
input_dict
[
'pcd_scale_factor'
]
points
=
input_dict
[
'points'
]
...
...
@@ -675,24 +724,25 @@ class GlobalRotScaleTrans(object):
points
.
tensor
[:,
points
.
attribute_dims
[
'height'
]]
*=
scale
input_dict
[
'points'
]
=
points
for
key
in
input_dict
[
'bbox3d_fields'
]:
input_dict
[
key
].
scale
(
scale
)
if
'gt_bboxes_3d'
in
input_dict
and
\
len
(
input_dict
[
'gt_bboxes_3d'
].
tensor
)
!=
0
:
input_dict
[
'gt_bboxes_3d'
].
scale
(
scale
)
def
_random_scale
(
self
,
input_dict
)
:
def
_random_scale
(
self
,
input_dict
:
dict
)
->
None
:
"""Private function to randomly set the scale factor.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after scaling, 'pcd_scale_factor'
are updated
in the result dict.
dict: Results after scaling, 'pcd_scale_factor'
are updated
in the result dict.
"""
scale_factor
=
np
.
random
.
uniform
(
self
.
scale_ratio_range
[
0
],
self
.
scale_ratio_range
[
1
])
input_dict
[
'pcd_scale_factor'
]
=
scale_factor
def
__call__
(
self
,
input_dict
)
:
def
transform
(
self
,
input_dict
:
dict
)
->
dict
:
"""Private function to rotate, scale and translate bounding boxes and
points.
...
...
@@ -701,8 +751,8 @@ class GlobalRotScaleTrans(object):
Returns:
dict: Results after scaling, 'points', 'pcd_rotation',
'pcd_scale_factor', 'pcd_trans' and
keys in
input_dict['bbox3d_fields'] are updated
in the result dict.
'pcd_scale_factor', 'pcd_trans' and
`gt_bboxes_3d` is updated
in the result dict.
"""
if
'transformation_3d_flow'
not
in
input_dict
:
input_dict
[
'transformation_3d_flow'
]
=
[]
...
...
mmdet3d/datasets/utils.py
View file @
39b294f5
...
...
@@ -3,13 +3,12 @@ import mmcv
from
mmcv.transforms
import
LoadImageFromFile
# yapf: disable
from
mmdet3d.datasets.pipelines
import
(
Collect3D
,
DefaultFormatBundle3D
,
LoadAnnotations3D
,
from
mmdet3d.datasets.pipelines
import
(
LoadAnnotations3D
,
LoadImageFromFileMono3D
,
LoadMultiViewImageFromFiles
,
LoadPointsFromFile
,
LoadPointsFromMultiSweeps
,
MultiScaleFlipAug3D
,
MultiScaleFlipAug3D
,
Pack3DDetInputs
,
PointSegClassMapping
)
# yapf: enable
from
mmdet3d.registry
import
TRANSFORMS
...
...
@@ -32,9 +31,8 @@ def is_loading_function(transform):
# TODO: use more elegant way to distinguish loading modules
loading_functions
=
(
LoadImageFromFile
,
LoadPointsFromFile
,
LoadAnnotations3D
,
LoadMultiViewImageFromFiles
,
LoadPointsFromMultiSweeps
,
DefaultFormatBundle3D
,
Collect3D
,
LoadImageFromFileMono3D
,
PointSegClassMapping
)
LoadPointsFromMultiSweeps
,
Pack3DDetInputs
,
LoadImageFromFileMono3D
,
PointSegClassMapping
)
if
isinstance
(
transform
,
dict
):
obj_cls
=
TRANSFORMS
.
get
(
transform
[
'type'
])
if
obj_cls
is
None
:
...
...
tests/test_data/test_transforms/test_augs.py
0 → 100644
View file @
39b294f5
# Copyright (c) OpenMMLab. All rights reserved.
import
copy
import
unittest
import
torch
from
mmengine.testing
import
assert_allclose
from
utils
import
create_data_info_after_loading
from
mmdet3d.datasets
import
RandomFlip3D
from
mmdet3d.datasets.pipelines
import
GlobalRotScaleTrans
class TestGlobalRotScaleTrans(unittest.TestCase):

    def test_global_rotation_scale_trans(self):
        """Rotation-only and scale-only transforms update the expected keys.

        Fixes vs. the original: the method name typo ('globle'), a
        copy-pasted comment that wrongly mentioned the rot angle before
        the scale assertions, and `assertIs(cond, True)` replaced with
        the idiomatic `assertTrue(cond)`.
        """
        # Pure rotation: scale range collapsed to [1, 1].
        rot_trans = GlobalRotScaleTrans(
            rot_range=[-0.78, 0.78], scale_ratio_range=[1, 1])
        # Pure scaling: rotation range collapsed to [0, 0].
        scale_trans = GlobalRotScaleTrans(
            rot_range=[0, 0], scale_ratio_range=[0.95, 1.05])
        ori_data_info = create_data_info_after_loading()

        data_info = copy.deepcopy(ori_data_info)
        rot_data_info = rot_trans(data_info)
        self.assertIn('pcd_rotation', rot_data_info)
        self.assertIn('pcd_rotation_angle', rot_data_info)
        self.assertIn('pcd_scale_factor', rot_data_info)
        self.assertEqual(rot_data_info['pcd_scale_factor'], 1)
        # the sampled rot angle should be in rot_range
        self.assertTrue(-0.79 < rot_data_info['pcd_rotation_angle'] < 0.79)
        # the yaw change of every box should also be in rot_range
        before_rot_gt_bbox_3d = ori_data_info['gt_bboxes_3d']
        after_rot_gt_bbox_3d = rot_data_info['gt_bboxes_3d']
        assert (after_rot_gt_bbox_3d.tensor[:, -1] -
                before_rot_gt_bbox_3d.tensor[:, -1]).abs().max() < 0.79

        data_info = copy.deepcopy(ori_data_info)
        scale_data_info = scale_trans(data_info)
        before_scale_gt_bbox_3d = ori_data_info['gt_bboxes_3d'].tensor
        after_scale_gt_bbox_3d = scale_data_info['gt_bboxes_3d'].tensor
        before_scale_points = ori_data_info['points'].tensor
        after_scale_points = scale_data_info['points'].tensor
        # no rotation should have been applied
        self.assertEqual(scale_data_info['pcd_rotation_angle'], 0)
        # assert scale_factor range
        assert (0.94 < (after_scale_points / before_scale_points)).all()
        assert (1.06 >
                (after_scale_gt_bbox_3d / before_scale_gt_bbox_3d)).all()
class TestRandomFlip3D(unittest.TestCase):

    def test_random_flip3d(self):
        """Flip ratio 0 keeps data unchanged; ratio 1 negates the y axis."""
        ori_data_info = create_data_info_after_loading()
        no_flip_transform = RandomFlip3D(flip_ratio_bev_horizontal=0.)
        always_flip_transform = RandomFlip3D(flip_ratio_bev_horizontal=1.)

        # flip_ratio 0: nothing should change
        data_info = copy.deepcopy(ori_data_info)
        data_info = no_flip_transform(data_info)
        self.assertIn('pcd_horizontal_flip', data_info)
        assert_allclose(data_info['points'].tensor,
                        ori_data_info['points'].tensor)
        # BUG FIX: the original called `torch.allclose(...)` and discarded
        # the boolean result, so the bbox check never asserted anything.
        assert_allclose(data_info['gt_bboxes_3d'].tensor,
                        ori_data_info['gt_bboxes_3d'].tensor)

        # flip_ratio 1: the y coordinate is negated, x and z are kept
        data_info = copy.deepcopy(ori_data_info)
        data_info = always_flip_transform(data_info)
        assert_allclose(data_info['points'].tensor[:, 0],
                        ori_data_info['points'].tensor[:, 0])
        assert_allclose(data_info['points'].tensor[:, 1],
                        -ori_data_info['points'].tensor[:, 1])
        assert_allclose(data_info['points'].tensor[:, 2],
                        ori_data_info['points'].tensor[:, 2])
        assert_allclose(data_info['gt_bboxes_3d'].tensor[:, 0],
                        ori_data_info['gt_bboxes_3d'].tensor[:, 0])
        assert_allclose(data_info['gt_bboxes_3d'].tensor[:, 1],
                        -ori_data_info['gt_bboxes_3d'].tensor[:, 1])
        assert_allclose(data_info['gt_bboxes_3d'].tensor[:, 2],
                        ori_data_info['gt_bboxes_3d'].tensor[:, 2])
tests/test_data/test_transforms/test_loading.py
0 → 100644
View file @
39b294f5
# Copyright (c) OpenMMLab. All rights reserved.
import
unittest
import
torch
from
mmengine.testing
import
assert_allclose
from
utils
import
create_dummy_data_info
from
mmdet3d.core
import
DepthPoints
,
LiDARPoints
from
mmdet3d.datasets.pipelines.loading
import
(
LoadAnnotations3D
,
LoadPointsFromFile
)
class TestLoadPointsFromFile(unittest.TestCase):

    def test_load_points_from_file(self):
        """Check coordinate wrapping, used dims and shift_height behavior."""
        use_dim = 3
        file_client_args = dict(backend='disk')
        data_info = create_dummy_data_info()

        # LiDAR coordinates: the loaded points are wrapped as LiDARPoints.
        lidar_loader = LoadPointsFromFile(
            coord_type='LIDAR',
            load_dim=4,
            use_dim=use_dim,
            file_client_args=file_client_args)
        results = lidar_loader(data_info)
        self.assertIn('points', results)
        self.assertIsInstance(results['points'], LiDARPoints)

        # Depth coordinates: the same file is wrapped as DepthPoints,
        # keeping only `use_dim` channels per point.
        depth_loader = LoadPointsFromFile(
            coord_type='DEPTH',
            load_dim=4,
            use_dim=use_dim,
            file_client_args=file_client_args)
        results = depth_loader(data_info)
        self.assertIsInstance(results['points'], DepthPoints)
        self.assertEqual(results['points'].shape[-1], use_dim)

        # shift_height=True appends one extra height channel.
        shift_loader = LoadPointsFromFile(
            coord_type='DEPTH',
            load_dim=4,
            use_dim=use_dim,
            shift_height=True,
            file_client_args=file_client_args)
        results = shift_loader(data_info)
        self.assertEqual(results['points'].shape[-1], use_dim + 1)

        # The repr should expose the configured options.
        repr_str = repr(shift_loader)
        self.assertIn('shift_height=True', repr_str)
        self.assertIn('use_color=False', repr_str)
        self.assertIn('load_dim=4', repr_str)
class TestLoadAnnotations3D(unittest.TestCase):

    def test_load_annotations_3d(self):
        """3D boxes and labels are read from `ann_info` into the results.

        NOTE: the original method was misleadingly named
        `test_load_points_from_file` although it tests annotation loading.
        """
        file_client_args = dict(backend='disk')
        load_anns_transform = LoadAnnotations3D(
            with_bbox_3d=True,
            with_label_3d=True,
            file_client_args=file_client_args)
        self.assertIs(load_anns_transform.with_seg, False)
        self.assertIs(load_anns_transform.with_bbox_3d, True)
        self.assertIs(load_anns_transform.with_label_3d, True)
        data_info = create_dummy_data_info()
        info = load_anns_transform(data_info)
        self.assertIn('gt_bboxes_3d', info)
        assert_allclose(info['gt_bboxes_3d'].tensor.sum(),
                        torch.tensor(7.2650))
        self.assertIn('gt_labels_3d', info)
        assert_allclose(info['gt_labels_3d'], torch.tensor([1]))
        repr_str = repr(load_anns_transform)
        self.assertIn('with_bbox_3d=True', repr_str)
        self.assertIn('with_label_3d=True', repr_str)
        self.assertIn('with_bbox_depth=False', repr_str)
tests/test_data/test_transforms/test_pack3d.py
0 → 100644
View file @
39b294f5
# Copyright (c) OpenMMLab. All rights reserved.
import
unittest
import
torch
from
mmengine.testing
import
assert_allclose
from
utils
import
create_data_info_after_loading
from
mmdet3d.core
import
LiDARInstance3DBoxes
from
mmdet3d.datasets.pipelines.formating
import
Pack3DDetInputs
class TestPack3DDetInputs(unittest.TestCase):

    def test_packinputs(self):
        """Points go to `inputs`; labels/boxes go to `gt_instances_3d`."""
        ori_data_info = create_data_info_after_loading()
        pack_input = Pack3DDetInputs(
            keys=['points', 'gt_labels_3d', 'gt_bboxes_3d'])
        packed_results = pack_input(ori_data_info)
        inputs = packed_results['inputs']
        # annotations
        gt_instances = packed_results['data_sample'].gt_instances_3d

        self.assertIn('points', inputs)
        # assert to_tensor (the original asserted this twice in a row;
        # the duplicate assertion is removed)
        self.assertIsInstance(inputs['points'], torch.Tensor)
        assert_allclose(inputs['points'].sum(), torch.tensor(13062.6436))

        self.assertIn('labels_3d', gt_instances)
        assert_allclose(gt_instances.labels_3d, torch.tensor([1]))
        # assert to_tensor
        self.assertIsInstance(gt_instances.labels_3d, torch.Tensor)

        self.assertIn('bboxes_3d', gt_instances)
        self.assertIsInstance(gt_instances.bboxes_3d, LiDARInstance3DBoxes)
        assert_allclose(gt_instances.bboxes_3d.tensor.sum(),
                        torch.tensor(7.2650))
tests/test_data/test_transforms/utils.py
0 → 100644
View file @
39b294f5
# Copyright (c) OpenMMLab. All rights reserved.
import
numpy
as
np
from
mmdet3d.core
import
LiDARInstance3DBoxes
# create a dummy `results` to test the pipeline
from
mmdet3d.datasets
import
LoadAnnotations3D
,
LoadPointsFromFile
def create_dummy_data_info(with_ann=True):
    """Create a minimal KITTI-style data info dict for pipeline tests.

    Args:
        with_ann (bool): Whether to attach the parsed annotations under
            the 'ann_info' key. Defaults to True.

    Returns:
        dict: A data info dict pointing at the tiny KITTI sample under
        ``tests/data/kitti``, with camera/lidar calibration, one raw
        instance and (optionally) parsed annotations.
    """
    # Parsed annotations for a single ground-truth object.
    ann_info = {
        'gt_bboxes': np.array([[712.4, 143., 810.73, 307.92]]),
        'gt_labels': np.array([1]),
        'gt_bboxes_3d': LiDARInstance3DBoxes(
            np.array(
                [[8.7314, -1.8559, -1.5997, 1.2000, 0.4800, 1.8900,
                  -1.5808]])),
        'gt_labels_3d': np.array([1]),
        'num_lidar_pts': np.array([377]),
        'difficulty': np.array([0]),
        'truncated': np.array([0]),
        'occluded': np.array([0]),
        'alpha': np.array([-0.2]),
        'score': np.array([0.]),
        'index': np.array([0]),
        'group_id': np.array([0])
    }
    data_info = {
        'sample_id': 0,
        # Per-camera 4x4 cam2img intrinsics; only CAM2 carries an actual
        # image file on disk.
        'images': {
            'CAM0': {
                'cam2img': [[707.0493, 0.0, 604.0814, 0.0],
                            [0.0, 707.0493, 180.5066, 0.0],
                            [0.0, 0.0, 1.0, 0.0],
                            [0.0, 0.0, 0.0, 1.0]]
            },
            'CAM1': {
                'cam2img': [[707.0493, 0.0, 604.0814, -379.7842],
                            [0.0, 707.0493, 180.5066, 0.0],
                            [0.0, 0.0, 1.0, 0.0],
                            [0.0, 0.0, 0.0, 1.0]]
            },
            'CAM2': {
                'img_path':
                'tests/data/kitti/training/image_2/000000.png',
                'height': 370,
                'width': 1224,
                'cam2img': [[707.0493, 0.0, 604.0814, 45.75831],
                            [0.0, 707.0493, 180.5066, -0.3454157],
                            [0.0, 0.0, 1.0, 0.004981016],
                            [0.0, 0.0, 0.0, 1.0]]
            },
            'CAM3': {
                'cam2img': [[707.0493, 0.0, 604.0814, -334.1081],
                            [0.0, 707.0493, 180.5066, 2.33066],
                            [0.0, 0.0, 1.0, 0.003201153],
                            [0.0, 0.0, 0.0, 1.0]]
            },
            # Rectifying rotation of the reference camera (KITTI R0_rect).
            'R0_rect': [[0.9999127984046936, 0.010092630051076412,
                         -0.008511931635439396, 0.0],
                        [-0.010127290152013302, 0.9999405741691589,
                         -0.004037670791149139, 0.0],
                        [0.008470674976706505, 0.0041235219687223434,
                         0.9999555945396423, 0.0],
                        [0.0, 0.0, 0.0, 1.0]]
        },
        # Lidar file location and lidar<->camera / imu->velo extrinsics.
        'lidar_points': {
            'num_pts_feats': 4,
            'lidar_path':
            'tests/data/kitti/training/velodyne_reduced/000000.bin',
            'lidar2cam': [[-0.0015960992313921452, -0.9999162554740906,
                           -0.012840436771512032, -0.022366708144545555],
                          [-0.00527064548805356, 0.012848696671426296,
                           -0.9999035596847534, -0.05967890843749046],
                          [0.9999848008155823, -0.0015282672829926014,
                           -0.005290712229907513, -0.33254900574684143],
                          [0.0, 0.0, 0.0, 1.0]],
            'Tr_velo_to_cam': [[0.006927963811904192, -0.9999722242355347,
                                -0.0027578289154917, -0.024577289819717407],
                               [-0.0011629819637164474, 0.0027498360723257065,
                                -0.9999955296516418, -0.06127237156033516],
                               [0.999975323677063, 0.006931141018867493,
                                -0.0011438990477472544, -0.33210289478302],
                               [0.0, 0.0, 0.0, 1.0]],
            'Tr_imu_to_velo': [[0.999997615814209, 0.0007553070900030434,
                                -0.002035825978964567, -0.8086758852005005],
                               [-0.0007854027207940817, 0.9998897910118103,
                                -0.014822980388998985, 0.3195559084415436],
                               [0.002024406101554632, 0.014824540354311466,
                                0.9998881220817566, -0.7997230887413025],
                               [0.0, 0.0, 0.0, 1.0]]
        },
        # Raw (unparsed) per-object annotations, one object here.
        'instances': [{
            'bbox': [712.4, 143.0, 810.73, 307.92],
            'bbox_label': -1,
            'bbox_3d': [1.840000033378601, 1.4700000286102295,
                        8.40999984741211, 1.2000000476837158,
                        1.8899999856948853, 0.47999998927116394,
                        0.009999999776482582],
            'bbox_label_3d': -1,
            'num_lidar_pts': 377,
            'difficulty': 0,
            'truncated': 0,
            'occluded': 0,
            'alpha': -0.2,
            'score': 0.0,
            'index': 0,
            'group_id': 0
        }],
        'plane': None
    }
    if with_ann:
        data_info['ann_info'] = ann_info
    return data_info
def create_data_info_after_loading():
    """Build a dummy data info dict and run the loading transforms on it.

    Returns:
        dict: The dummy data info after `LoadPointsFromFile` and
        `LoadAnnotations3D` have been applied, so it carries both
        'points' and the parsed 3D annotations.
    """
    anns_loader = LoadAnnotations3D(with_bbox_3d=True, with_label_3d=True)
    points_loader = LoadPointsFromFile(
        coord_type='LIDAR', load_dim=4, use_dim=3)
    # Apply the two loading transforms in sequence: points first,
    # then annotations (same order as the original helper).
    loaded = points_loader(create_dummy_data_info())
    return anns_loader(loaded)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment