Unverified commit ff62af6b authored by Danila Rukhovich, committed by GitHub

[Fix] Unify camera poses (#653)

* refactor K and Rt to depth2img for SUN RGB-D

* fix lint

* update 3 tests

* fix extra calib key and comments

* remove calib from browse_dataset

* fix cam to depth; rename return_z
parent 23071a56
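
In short, this commit stops threading the raw SUN RGB-D `calib` dict (`K` and `Rt`) through pipelines, models and visualization, and instead pre-composes a single `depth2img` matrix that is stored in `img_metas`, mirroring the existing `lidar2img` convention. Below is a minimal sketch of that composition as the diff performs it, assuming NumPy inputs shaped like the SUN RGB-D calibration entries; the matrix values are placeholders, not real calibration data.

```python
import numpy as np

# Placeholder SUN RGB-D style calibration (illustrative values only).
k_mat = np.array([[529.5, 0.0, 365.0],
                  [0.0, 529.5, 265.0],
                  [0.0, 0.0, 1.0]])
rt_mat = np.array([[0.9796, 0.0126, -0.2006],
                   [0.0126, 0.9922, 0.1238],
                   [0.2006, -0.1238, 0.9718]])

# Follow Coord3DMode.convert_point: swap the depth axes into the camera
# frame, apply the transposed extrinsics, then the intrinsics.
axis_swap = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
depth2img = k_mat @ (axis_swap @ rt_mat.transpose(1, 0))

# Downstream code then projects depth-frame points in one step, e.g.
#   uvz = points_cam2img(xyz_depth, depth2img, with_depth=True)
# instead of converting depth -> camera coords and applying K separately.
print(depth2img)
```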
@@ -193,7 +193,7 @@ train_pipeline = [
         type='Collect3D',
         keys=[
             'img', 'gt_bboxes', 'gt_labels', 'points', 'gt_bboxes_3d',
-            'gt_labels_3d', 'calib'
+            'gt_labels_3d'
         ])
 ]
@@ -230,7 +230,7 @@ test_pipeline = [
                 type='DefaultFormatBundle3D',
                 class_names=class_names,
                 with_label=False),
-            dict(type='Collect3D', keys=['img', 'points', 'calib'])
+            dict(type='Collect3D', keys=['img', 'points'])
         ]),
 ]
 # construct a pipeline for data and gt loading in show function
@@ -247,7 +247,7 @@ eval_pipeline = [
         type='DefaultFormatBundle3D',
         class_names=class_names,
         with_label=False),
-    dict(type='Collect3D', keys=['img', 'points', 'calib'])
+    dict(type='Collect3D', keys=['img', 'points'])
 ]
 data = dict(
...
@@ -155,13 +155,10 @@ def inference_multi_modality_detector(model, pcd, image, ann_file):
         bbox_fields=[],
         mask_fields=[],
         seg_fields=[])
-    # depth map points to image conversion
-    if box_mode_3d == Box3DMode.DEPTH:
-        data.update(dict(calib=info['calib']))
     data = test_pipeline(data)
+    # TODO: this code is dataset-specific. Move lidar2img and
+    # depth2img to .pkl annotations in the future.
     # LiDAR to image conversion
     if box_mode_3d == Box3DMode.LIDAR:
         rect = info['calib']['R0_rect'].astype(np.float32)
@@ -169,9 +166,14 @@ def inference_multi_modality_detector(model, pcd, image, ann_file):
         P2 = info['calib']['P2'].astype(np.float32)
         lidar2img = P2 @ rect @ Trv2c
         data['img_metas'][0].data['lidar2img'] = lidar2img
+    # Depth to image conversion
     elif box_mode_3d == Box3DMode.DEPTH:
-        data['calib'][0]['Rt'] = data['calib'][0]['Rt'].astype(np.float32)
-        data['calib'][0]['K'] = data['calib'][0]['K'].astype(np.float32)
+        rt_mat = info['calib']['Rt']
+        # follow Coord3DMode.convert_point
+        rt_mat = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0]
+                           ]) @ rt_mat.transpose(1, 0)
+        depth2img = info['calib']['K'] @ rt_mat
+        data['img_metas'][0].data['depth2img'] = depth2img
     data = collate([data], samples_per_gpu=1)
     if next(model.parameters()).is_cuda:
@@ -182,9 +184,6 @@ def inference_multi_modality_detector(model, pcd, image, ann_file):
         data['img_metas'] = data['img_metas'][0].data
         data['points'] = data['points'][0].data
         data['img'] = data['img'][0].data
-        if box_mode_3d == Box3DMode.DEPTH:
-            data['calib'][0]['Rt'] = data['calib'][0]['Rt'][0].data
-            data['calib'][0]['K'] = data['calib'][0]['K'][0].data
     # forward the model
     with torch.no_grad():
@@ -411,17 +410,13 @@ def show_proj_det_result_meshlab(data,
             box_mode='lidar',
             show=show)
     elif box_mode == Box3DMode.DEPTH:
-        if 'calib' not in data.keys():
-            raise NotImplementedError(
-                'camera calibration information is not provided')
         show_bboxes = DepthInstance3DBoxes(pred_bboxes, origin=(0.5, 0.5, 0))
         show_multi_modality_result(
             img,
             None,
             show_bboxes,
-            data['calib'][0],
+            None,
             out_dir,
             file_name,
             box_mode='depth',
...
@@ -50,7 +50,8 @@ def corners_nd(dims, origin=0.5):
     Args:
         dims (np.ndarray, shape=[N, ndim]): Array of length per dim
-        origin (list or array or float): origin point relate to smallest point.
+        origin (list or array or float, optional): origin point relate to
+            smallest point. Defaults to 0.5
     Returns:
         np.ndarray, shape=[N, 2 ** ndim, ndim]: Returned corners.
@@ -102,7 +103,10 @@ def center_to_corner_box2d(centers, dims, angles=None, origin=0.5):
     Args:
         centers (np.ndarray): Locations in kitti label file with shape (N, 2).
         dims (np.ndarray): Dimensions in kitti label file with shape (N, 2).
-        angles (np.ndarray): Rotation_y in kitti label file with shape (N).
+        angles (np.ndarray, optional): Rotation_y in kitti label file with
+            shape (N). Defaults to None.
+        origin (list or array or float, optional): origin point relate to
+            smallest point. Defaults to 0.5.
     Returns:
         np.ndarray: Corners with the shape of (N, 4, 2).
@@ -173,7 +177,7 @@ def rotation_3d_in_axis(points, angles, axis=0):
     Args:
         points (np.ndarray, shape=[N, point_size, 3]]):
         angles (np.ndarray, shape=[N]]):
-        axis (int): Axis to rotate at.
+        axis (int, optional): Axis to rotate at. Defaults to 0.
     Returns:
         np.ndarray: Rotated points.
@@ -208,10 +212,13 @@ def center_to_corner_box3d(centers,
     Args:
         centers (np.ndarray): Locations in kitti label file with shape (N, 3).
         dims (np.ndarray): Dimensions in kitti label file with shape (N, 3).
-        angles (np.ndarray): Rotation_y in kitti label file with shape (N).
-        origin (list or array or float): Origin point relate to smallest point.
-            use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar.
-        axis (int): Rotation axis. 1 for camera and 2 for lidar.
+        angles (np.ndarray, optional): Rotation_y in kitti label file with
+            shape (N). Defaults to None.
+        origin (list or array or float, optional): Origin point relate to
+            smallest point. Use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0)
+            in lidar. Defaults to (0.5, 1.0, 0.5).
+        axis (int, optional): Rotation axis. 1 for camera and 2 for lidar.
+            Defaults to 1.
     Returns:
         np.ndarray: Corners with the shape of (N, 8, 3).
@@ -308,8 +315,8 @@ def rotation_points_single_angle(points, angle, axis=0):
     Args:
         points (np.ndarray, shape=[N, 3]]):
-        angles (np.ndarray, shape=[1]]):
-        axis (int): Axis to rotate at.
+        angle (np.ndarray, shape=[1]]):
+        axis (int, optional): Axis to rotate at. Defaults to 0.
     Returns:
         np.ndarray: Rotated points.
@@ -341,7 +348,8 @@ def points_cam2img(points_3d, proj_mat, with_depth=False):
     Args:
         points_3d (np.ndarray): Points in shape (N, 3)
         proj_mat (np.ndarray): Transformation matrix between coordinates.
-        with_depth (bool): Whether to keep depth in the output.
+        with_depth (bool, optional): Whether to keep depth in the output.
+            Defaults to False.
     Returns:
         np.ndarray: Points in image coordinates with shape [N, 2].
@@ -420,8 +428,10 @@ def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)):
     Args:
         points (np.ndarray, shape=[N, 3+dim]): Points to query.
         rbbox (np.ndarray, shape=[M, 7]): Boxes3d with rotation.
-        z_axis (int): Indicate which axis is height.
-        origin (tuple[int]): Indicate the position of box center.
+        z_axis (int, optional): Indicate which axis is height.
+            Defaults to 2.
+        origin (tuple[int], optional): Indicate the position of
+            box center. Defaults to (0.5, 0.5, 0).
     Returns:
         np.ndarray, shape=[N, M]: Indices of points in each box.
@@ -479,11 +489,13 @@ def create_anchors_3d_range(feature_size,
         anchor_range (torch.Tensor | list[float]): Range of anchors with
             shape [6]. The order is consistent with that of anchors, i.e.,
             (x_min, y_min, z_min, x_max, y_max, z_max).
-        sizes (list[list] | np.ndarray | torch.Tensor): Anchor size with
-            shape [N, 3], in order of x, y, z.
-        rotations (list[float] | np.ndarray | torch.Tensor): Rotations of
-            anchors in a single feature grid.
-        dtype (type): Data type. Default to np.float32.
+        sizes (list[list] | np.ndarray | torch.Tensor, optional):
+            Anchor size with shape [N, 3], in order of x, y, z.
+            Defaults to ((1.6, 3.9, 1.56), ).
+        rotations (list[float] | np.ndarray | torch.Tensor, optional):
+            Rotations of anchors in a single feature grid.
+            Defaults to (0, np.pi / 2).
+        dtype (type, optional): Data type. Default to np.float32.
     Returns:
         np.ndarray: Range based anchors with shape of \
@@ -520,7 +532,8 @@ def center_to_minmax_2d(centers, dims, origin=0.5):
     Args:
         centers (np.ndarray): Center points.
         dims (np.ndarray): Dimensions.
-        origin (list or array or float): origin point relate to smallest point.
+        origin (list or array or float, optional): Origin point relate
+            to smallest point. Defaults to 0.5.
     Returns:
         np.ndarray: Minmax points.
@@ -559,6 +572,8 @@ def iou_jit(boxes, query_boxes, mode='iou', eps=0.0):
     Args:
         boxes (np.ndarray): Input bounding boxes with shape of (N, 4).
         query_boxes (np.ndarray): Query boxes with shape of (K, 4).
+        mode (str, optional): IoU mode. Defaults to 'iou'.
+        eps (float, optional): Value added to denominator. Defaults to 0.
     Returns:
         np.ndarray: Overlap between boxes and query_boxes
@@ -648,8 +663,10 @@ def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100):
     Args:
         bbox_image (list[int]): box in image coordinates.
         C (np.ndarray): Intrinsics.
-        near_clip (float): Nearest distance of frustum.
-        far_clip (float): Farthest distance of frustum.
+        near_clip (float, optional): Nearest distance of frustum.
+            Defaults to 0.001.
+        far_clip (float, optional): Farthest distance of frustum.
+            Defaults to 100.
     Returns:
         np.ndarray, shape=[8, 3]: coordinates of frustum corners.
@@ -742,12 +759,12 @@ def points_in_convex_polygon_3d_jit(points,
     Args:
         points (np.ndarray): Input points with shape of (num_points, 3).
-        polygon_surfaces (np.ndarray): Polygon surfaces with shape of \
-            (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). \
-            All surfaces' normal vector must direct to internal. \
-            Max_num_points_of_surface must at least 3.
-        num_surfaces (np.ndarray): Number of surfaces a polygon contains \
-            shape of (num_polygon).
+        polygon_surfaces (np.ndarray): Polygon surfaces with shape of
+            (num_polygon, max_num_surfaces, max_num_points_of_surface, 3).
+            All surfaces' normal vector must direct to internal.
+            Max_num_points_of_surface must at least 3.
+        num_surfaces (np.ndarray, optional): Number of surfaces a polygon
+            contains shape of (num_polygon). Defaults to None.
     Returns:
         np.ndarray: Result matrix with the shape of [num_points, num_polygon].
@@ -772,7 +789,8 @@ def points_in_convex_polygon_jit(points, polygon, clockwise=True):
         points (np.ndarray): Input points with the shape of [num_points, 2].
         polygon (np.ndarray): Input polygon with the shape of
             [num_polygon, num_points_of_polygon, 2].
-        clockwise (bool): Indicate polygon is clockwise.
+        clockwise (bool, optional): Indicate polygon is clockwise. Defaults
+            to True.
     Returns:
         np.ndarray: Result matrix with the shape of [num_points, num_polygon].
@@ -821,10 +839,11 @@ def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True):
           2 -------- 1
     Args:
-        boxes3d (np.ndarray): Boxes with shape of (N, 7) \
-            [x, y, z, w, l, h, ry] in LiDAR coords, see the definition of ry \
-            in KITTI dataset.
-        bottom_center (bool): Whether z is on the bottom center of object.
+        boxes3d (np.ndarray): Boxes with shape of (N, 7)
+            [x, y, z, w, l, h, ry] in LiDAR coords, see the definition of ry
+            in KITTI dataset.
+        bottom_center (bool, optional): Whether z is on the bottom center
+            of object. Defaults to True.
     Returns:
         np.ndarray: Box corners with the shape of [N, 8, 3].
...
@@ -227,21 +227,11 @@ class Coord3DMode(IntEnum):
             if rt_mat is None:
                 rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
         elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM:
-            # LIDAR-CAM conversion is different from DEPTH-CAM conversion
-            # because SUNRGB-D camera calibration files are different from
-            # that of KITTI, and currently we keep this hack
             if rt_mat is None:
                 rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
-            else:
-                rt_mat = rt_mat.new_tensor(
-                    [[1, 0, 0], [0, 0, -1], [0, 1, 0]]) @ \
-                    rt_mat.transpose(1, 0)
         elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH:
             if rt_mat is None:
                 rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
-            else:
-                rt_mat = rt_mat @ rt_mat.new_tensor([[1, 0, 0], [0, 0, 1],
-                                                     [0, -1, 0]])
         elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH:
             if rt_mat is None:
                 rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
...
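
With the `else:` branches above removed, `Coord3DMode.convert_point` no longer mixes a provided `rt_mat` with the built-in DEPTH-to-CAM axis swap; callers are expected to pre-compose the full matrix themselves, as the updated `test_points_conversion` hunk later in this commit does. A hedged sketch of that caller-side pattern, with an identity matrix standing in for real SUN RGB-D extrinsics:

```python
import torch
from mmdet3d.core import Coord3DMode

# Stand-in extrinsics; in practice this is the SUN RGB-D 'Rt' matrix.
rt_mat = torch.eye(3)

# Pre-compose the DEPTH -> CAM axis swap with the transposed extrinsics
# before handing the matrix to convert_point.
axis_swap = rt_mat.new_tensor([[1., 0., 0.], [0., 0., -1.], [0., 1., 0.]])
full_rt = axis_swap @ rt_mat.transpose(1, 0)

# Points in depth coordinates with shape (N, 3).
xyz_depth = torch.rand(4, 3)
xyz_cam = Coord3DMode.convert_point(
    xyz_depth, Coord3DMode.DEPTH, Coord3DMode.CAM, rt_mat=full_rt)
```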
@@ -111,12 +111,14 @@ def get_box_type(box_type):
     return box_type_3d, box_mode_3d
-def points_cam2img(points_3d, proj_mat):
+def points_cam2img(points_3d, proj_mat, with_depth=False):
     """Project points from camera coordicates to image coordinates.
     Args:
-        points_3d (torch.Tensor): Points in shape (N, 3)
+        points_3d (torch.Tensor): Points in shape (N, 3).
         proj_mat (torch.Tensor): Transformation matrix between coordinates.
+        with_depth (bool, optional): Whether to keep depth in the output.
+            Defaults to False.
     Returns:
         torch.Tensor: Points in image coordinates with shape [N, 2].
@@ -141,6 +143,9 @@ def points_cam2img(points_3d, proj_mat):
         [points_3d, points_3d.new_ones(*points_shape)], dim=-1)
     point_2d = torch.matmul(points_4, proj_mat.t())
     point_2d_res = point_2d[..., :2] / point_2d[..., 2:3]
+    if with_depth:
+        return torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1)
     return point_2d_res
...
@@ -120,6 +120,7 @@ def draw_lidar_bbox3d_on_img(bboxes3d,
     return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness)
+# TODO: remove third parameter in all functions here in favour of img_metas
 def draw_depth_bbox3d_on_img(bboxes3d,
                              raw_img,
                              calibs,
@@ -137,35 +138,22 @@ def draw_depth_bbox3d_on_img(bboxes3d,
         color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0).
         thickness (int, optional): The thickness of bboxes. Default: 1.
     """
-    from mmdet3d.core import Coord3DMode
     from mmdet3d.core.bbox import points_cam2img
     from mmdet3d.models import apply_3d_transformation
     img = raw_img.copy()
-    calibs = copy.deepcopy(calibs)
     img_metas = copy.deepcopy(img_metas)
     corners_3d = bboxes3d.corners
     num_bbox = corners_3d.shape[0]
     points_3d = corners_3d.reshape(-1, 3)
-    assert ('Rt' in calibs.keys() and 'K' in calibs.keys()), \
-        'Rt and K matrix should be provided as camera caliberation information'
-    if not isinstance(calibs['Rt'], torch.Tensor):
-        calibs['Rt'] = torch.from_numpy(np.array(calibs['Rt']))
-    if not isinstance(calibs['K'], torch.Tensor):
-        calibs['K'] = torch.from_numpy(np.array(calibs['K']))
-    calibs['Rt'] = calibs['Rt'].reshape(3, 3).float().cpu()
-    calibs['K'] = calibs['K'].reshape(3, 3).float().cpu()
     # first reverse the data transformations
     xyz_depth = apply_3d_transformation(
         points_3d, 'DEPTH', img_metas, reverse=True)
-    # then convert from depth coords to camera coords
-    xyz_cam = Coord3DMode.convert_point(
-        xyz_depth, Coord3DMode.DEPTH, Coord3DMode.CAM, rt_mat=calibs['Rt'])
     # project to 2d to get image coords (uv)
-    uv_origin = points_cam2img(xyz_cam, calibs['K'])
+    uv_origin = points_cam2img(xyz_depth,
+                               xyz_depth.new_tensor(img_metas['depth2img']))
     uv_origin = (uv_origin - 1).round()
     imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy()
...
@@ -100,6 +100,7 @@ class Collect3D(object):
         - 'ori_shape': original shape of the image as a tuple (h, w, c)
         - 'pad_shape': image shape after padding
         - 'lidar2img': transform from lidar to image
+        - 'depth2img': transform from depth to image
         - 'pcd_horizontal_flip': a boolean indicating if point cloud is \
             flipped horizontally
         - 'pcd_vertical_flip': a boolean indicating if point cloud is \
@@ -134,7 +135,7 @@ class Collect3D(object):
     def __init__(self,
                  keys,
                  meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
-                            'pad_shape', 'scale_factor', 'flip',
+                            'depth2img', 'pad_shape', 'scale_factor', 'flip',
                             'cam_intrinsic', 'pcd_horizontal_flip',
                             'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
                             'img_norm_cfg', 'rect', 'Trv2c', 'P2', 'pcd_trans',
...
@@ -101,7 +101,12 @@ class SUNRGBDDataset(Custom3DDataset):
             input_dict['img_prefix'] = None
             input_dict['img_info'] = dict(filename=img_filename)
             calib = info['calib']
-            input_dict['calib'] = calib
+            rt_mat = calib['Rt']
+            # follow Coord3DMode.convert_point
+            rt_mat = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0]
+                               ]) @ rt_mat.transpose(1, 0)
+            depth2img = calib['K'] @ rt_mat
+            input_dict['depth2img'] = depth2img
         if not self.test_mode:
             annos = self.get_ann_info(index)
@@ -187,8 +192,8 @@ class SUNRGBDDataset(Custom3DDataset):
             data_info = self.data_infos[i]
             pts_path = data_info['pts_path']
             file_name = osp.split(pts_path)[-1].split('.')[0]
-            points, img_metas, img, calib = self._extract_data(
-                i, pipeline, ['points', 'img_metas', 'img', 'calib'])
+            points, img_metas, img = self._extract_data(
+                i, pipeline, ['points', 'img_metas', 'img'])
             # scale colors to [0, 255]
             points = points.numpy()
             points[:, 3:] *= 255
@@ -199,7 +204,7 @@ class SUNRGBDDataset(Custom3DDataset):
                      file_name, show)
             # multi-modality visualization
-            if self.modality['use_camera'] and 'calib' in data_info.keys():
+            if self.modality['use_camera']:
                 img = img.numpy()
                 # need to transpose channel to first dim
                 img = img.transpose(1, 2, 0)
@@ -211,7 +216,7 @@ class SUNRGBDDataset(Custom3DDataset):
                     img,
                     gt_bboxes,
                     pred_bboxes,
-                    calib,
+                    None,
                     out_dir,
                     file_name,
                     box_mode='depth',
...
@@ -378,7 +378,6 @@ class ImVoteNet(Base3DDetector):
                       gt_bboxes_ignore=None,
                       gt_masks=None,
                       proposals=None,
-                      calib=None,
                       bboxes_2d=None,
                       gt_bboxes_3d=None,
                       gt_labels_3d=None,
@@ -405,8 +404,6 @@ class ImVoteNet(Base3DDetector):
                 2d bbox, used if the architecture supports a segmentation task.
             proposals: override rpn proposals (2d) with custom proposals.
                 Use when `with_rpn` is False.
-            calib (dict[str, torch.Tensor]): camera calibration matrices,
-                Rt and K.
             bboxes_2d (list[torch.Tensor]): provided 2d bboxes,
                 not supported yet.
             gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): 3d gt bboxes.
@@ -452,7 +449,7 @@ class ImVoteNet(Base3DDetector):
             self.extract_pts_feat(points)
         img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d,
-                                                img_metas, calib)
+                                                img_metas)
         inds = sample_valid_seeds(masks, self.num_sampled_seed)
         batch_size, img_feat_size = img_features.shape[:2]
@@ -528,7 +525,6 @@ class ImVoteNet(Base3DDetector):
                      points=None,
                      img_metas=None,
                      img=None,
-                     calib=None,
                      bboxes_2d=None,
                      **kwargs):
         """Forwarding of test for image branch pretrain or stage 2 train.
@@ -546,9 +542,6 @@ class ImVoteNet(Base3DDetector):
                 list indicates test-time augmentations and inner Tensor
                 should have a shape NxCxHxW, which contains all images
                 in the batch. Defaults to None. Defaults to None.
-            calibs (list[dict[str, torch.Tensor]], optional): camera
-                calibration matrices, Rt and K.
-                List indicates test-time augs. Defaults to None.
             bboxes_2d (list[list[torch.Tensor]], optional):
                 Provided 2d bboxes, not supported yet. Defaults to None.
@@ -602,11 +595,10 @@ class ImVoteNet(Base3DDetector):
                     points[0],
                     img_metas[0],
                     img[0],
-                    calibs=calib[0],
                     bboxes_2d=bboxes_2d[0] if bboxes_2d is not None else None,
                     **kwargs)
             else:
-                return self.aug_test(points, img_metas, img, calib, bboxes_2d,
+                return self.aug_test(points, img_metas, img, bboxes_2d,
                                      **kwargs)
     def simple_test_img_only(self,
@@ -652,7 +644,6 @@ class ImVoteNet(Base3DDetector):
                     points=None,
                     img_metas=None,
                     img=None,
-                    calibs=None,
                     bboxes_2d=None,
                     rescale=False,
                     **kwargs):
@@ -666,8 +657,6 @@ class ImVoteNet(Base3DDetector):
                 images in a batch. Defaults to None.
             img (torch.Tensor, optional): Should have a shape NxCxHxW,
                 which contains all images in the batch. Defaults to None.
-            calibs (dict[str, torch.Tensor], optional): camera
-                calibration matrices, Rt and K. Defaults to None.
             bboxes_2d (list[torch.Tensor], optional):
                 Provided 2d bboxes, not supported yet. Defaults to None.
             rescale (bool, optional): Whether or not rescale bboxes.
@@ -684,7 +673,7 @@ class ImVoteNet(Base3DDetector):
             self.extract_pts_feat(points)
         img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d,
-                                                img_metas, calibs)
+                                                img_metas)
         inds = sample_valid_seeds(masks, self.num_sampled_seed)
         batch_size, img_feat_size = img_features.shape[:2]
@@ -755,7 +744,6 @@ class ImVoteNet(Base3DDetector):
                  points=None,
                  img_metas=None,
                  imgs=None,
-                 calibs=None,
                  bboxes_2d=None,
                  rescale=False,
                  **kwargs):
@@ -774,9 +762,6 @@ class ImVoteNet(Base3DDetector):
                 list indicates test-time augmentations and inner Tensor
                 should have a shape NxCxHxW, which contains all images
                 in the batch. Defaults to None. Defaults to None.
-            calibs (list[dict[str, torch.Tensor]], optional): camera
-                calibration matrices, Rt and K.
-                List indicates test-time augs. Defaults to None.
             bboxes_2d (list[list[torch.Tensor]], optional):
                 Provided 2d bboxes, not supported yet. Defaults to None.
             rescale (bool, optional): Whether or not rescale bboxes.
@@ -790,8 +775,9 @@ class ImVoteNet(Base3DDetector):
         # only support aug_test for one sample
         aug_bboxes = []
-        for x, pts_cat, img_meta, bbox_2d, img, calib in zip(
-                feats, points_cat, img_metas, bboxes_2d, imgs, calibs):
+        for x, pts_cat, img_meta, bbox_2d, img in zip(feats, points_cat,
+                                                      img_metas, bboxes_2d,
+                                                      imgs):
             bbox_2d = self.extract_bboxes_2d(
                 img, img_metas, train=False, bboxes_2d=bbox_2d, **kwargs)
@@ -799,7 +785,7 @@ class ImVoteNet(Base3DDetector):
             seeds_3d, seed_3d_features, seed_indices = x
             img_features, masks = self.fusion_layer(img, bbox_2d, seeds_3d,
-                                                    img_metas, calib)
+                                                    img_metas)
             inds = sample_valid_seeds(masks, self.num_sampled_seed)
             batch_size, img_feat_size = img_features.shape[:2]
...
 import torch
 from torch import nn as nn
-from mmdet3d.core.bbox import Coord3DMode, points_cam2img
+from mmdet3d.core.bbox import points_cam2img
 from ..builder import FUSION_LAYERS
 from . import apply_3d_transformation, bbox_2d_transform, coord_2d_transform
@@ -22,8 +22,7 @@ class VoteFusion(nn.Module):
         self.num_classes = num_classes
         self.max_imvote_per_pixel = max_imvote_per_pixel
-    def forward(self, imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas,
-                calibs):
+    def forward(self, imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas):
         """Forward function.
         Args:
@@ -31,7 +30,6 @@ class VoteFusion(nn.Module):
             bboxes_2d_rescaled (list[torch.Tensor]): 2D bboxes.
             seeds_3d_depth (torch.Tensor): 3D seeds.
             img_metas (list[dict]): Meta information of images.
-            calibs: Camera calibration information of the images.
         Returns:
             torch.Tensor: Concatenated cues of each point.
@@ -52,16 +50,11 @@ class VoteFusion(nn.Module):
             xyz_depth = apply_3d_transformation(
                 seed_3d_depth, 'DEPTH', img_meta, reverse=True)
-            # then convert from depth coords to camera coords
-            xyz_cam = Coord3DMode.convert_point(
-                xyz_depth,
-                Coord3DMode.DEPTH,
-                Coord3DMode.CAM,
-                rt_mat=calibs['Rt'][i])
-            # project to 2d to get image coords (uv)
-            uv_origin = points_cam2img(xyz_cam, calibs['K'][i])
-            uv_origin = (uv_origin - 1).round()
+            # project points from depth to image
+            depth2img = xyz_depth.new_tensor(img_meta['depth2img'])
+            uvz_origin = points_cam2img(xyz_depth, depth2img, True)
+            z_cam = uvz_origin[..., 2]
+            uv_origin = (uvz_origin[..., :2] - 1).round()
             # rescale 2d coordinates and bboxes
             uv_rescaled = coord_2d_transform(img_meta, uv_origin, True)
@@ -113,22 +106,12 @@ class VoteFusion(nn.Module):
                 seed_3d_expanded = seed_3d_depth.view(seed_num, 1, -1).expand(
                     -1, bbox_num, -1)
-                z_cam = xyz_cam[..., 2:3].view(seed_num, 1,
-                                               1).expand(-1, bbox_num, -1)
-                delta_u = delta_u * z_cam / calibs['K'][i, 0, 0]
-                delta_v = delta_v * z_cam / calibs['K'][i, 0, 0]
+                z_cam = z_cam.view(seed_num, 1, 1).expand(-1, bbox_num, -1)
                 imvote = torch.cat(
                     [delta_u, delta_v,
                      torch.zeros_like(delta_v)], dim=-1).view(-1, 3)
-                # convert from camera coords to depth coords
-                imvote = Coord3DMode.convert_point(
-                    imvote.view((-1, 3)),
-                    Coord3DMode.CAM,
-                    Coord3DMode.DEPTH,
-                    rt_mat=calibs['Rt'][i])
+                imvote = imvote * z_cam.reshape(-1, 1)
+                imvote = imvote @ torch.inverse(depth2img.t())
                 # apply transformation to lifted imvotes
                 imvote = apply_3d_transformation(
...
@@ -79,7 +79,7 @@ def _generate_sunrgbd_multi_modality_dataset_config():
             type='Collect3D',
             keys=[
                 'img', 'gt_bboxes', 'gt_labels', 'points', 'gt_bboxes_3d',
-                'gt_labels_3d', 'calib'
+                'gt_labels_3d'
             ])
     ]
     modality = dict(use_lidar=True, use_camera=True)
@@ -158,20 +158,23 @@ def test_getitem():
     points = data['points']._data
     gt_bboxes_3d = data['gt_bboxes_3d']._data
     gt_labels_3d = data['gt_labels_3d']._data
-    calib = data['calib']
     img = data['img']._data
+    depth2img = data['img_metas']._data['depth2img']
-    expected_Rt = np.array([[0.97959, 0.012593, -0.20061],
-                            [0.012593, 0.99223, 0.12377],
-                            [0.20061, -0.12377, 0.97182]])
-    expected_K = np.array([[529.5, 0., 0.], [0., 529.5, 0.], [365., 265., 1.]])
+    expected_rt_mat = np.array([[0.97959, 0.012593, -0.20061],
+                                [0.012593, 0.99223, 0.12377],
+                                [0.20061, -0.12377, 0.97182]])
+    expected_k_mat = np.array([[529.5, 0., 0.], [0., 529.5, 0.],
+                               [365., 265., 1.]])
+    rt_mat = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0]
+                       ]) @ expected_rt_mat.transpose(1, 0)
+    expected_depth2img = expected_k_mat @ rt_mat
     assert torch.allclose(points, expected_points, 1e-2)
     assert torch.allclose(gt_bboxes_3d.tensor, expected_gt_bboxes_3d, 1e-3)
     assert np.all(gt_labels_3d.numpy() == expected_gt_labels)
     assert img.shape[:] == (3, 608, 832)
-    assert np.allclose(calib['Rt'], expected_Rt)
-    assert np.allclose(calib['K'], expected_K)
+    assert np.allclose(depth2img, expected_depth2img)
 def test_evaluate():
@@ -295,7 +298,7 @@ def test_show():
             type='DefaultFormatBundle3D',
             class_names=class_names,
             with_label=False),
-        dict(type='Collect3D', keys=['points', 'img', 'calib'])
+        dict(type='Collect3D', keys=['points', 'img'])
     ]
     tmp_dir = tempfile.TemporaryDirectory()
     temp_dir = tmp_dir.name
...
@@ -32,16 +32,16 @@ def test_vote_fusion():
         'transformation_3d_flow': ['HF', 'R', 'S', 'T']
     }
-    calibs = {
-        'Rt':
-        torch.tensor([[[0.979570, 0.047954, -0.195330],
-                       [0.047954, 0.887470, 0.458370],
-                       [0.195330, -0.458370, 0.867030]]]),
-        'K':
-        torch.tensor([[[529.5000, 0.0000, 365.0000],
-                       [0.0000, 529.5000, 265.0000], [0.0000, 0.0000,
-                                                      1.0000]]])
-    }
+    rt_mat = torch.tensor([[0.979570, 0.047954, -0.195330],
+                           [0.047954, 0.887470, 0.458370],
+                           [0.195330, -0.458370, 0.867030]])
+    k_mat = torch.tensor([[529.5000, 0.0000, 365.0000],
+                          [0.0000, 529.5000, 265.0000],
+                          [0.0000, 0.0000, 1.0000]])
+    rt_mat = rt_mat.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]
+                                ]) @ rt_mat.transpose(1, 0)
+    depth2img = k_mat @ rt_mat
+    img_meta['depth2img'] = depth2img
     bboxes = torch.tensor([[[
         5.4286e+02, 9.8283e+01, 6.1700e+02, 1.6742e+02, 9.7922e-01, 3.0000e+00
@@ -309,12 +309,12 @@ def test_vote_fusion():
     ]]])
     fusion = VoteFusion()
-    out1, out2 = fusion(imgs, bboxes, seeds_3d, [img_meta], calibs)
+    out1, out2 = fusion(imgs, bboxes, seeds_3d, [img_meta])
     assert torch.allclose(expected_tensor1, out1[:, :, :15], 1e-3)
     assert torch.allclose(expected_tensor2.float(), out2.float(), 1e-3)
     assert torch.allclose(expected_tensor3, out1[:, :, 30:45], 1e-3)
-    out1, out2 = fusion(imgs, bboxes[:, :2], seeds_3d, [img_meta], calibs)
+    out1, out2 = fusion(imgs, bboxes[:, :2], seeds_3d, [img_meta])
     out1 = out1[:, :15, 30:45]
     out2 = out2[:, 30:45].float()
     assert torch.allclose(torch.zeros_like(out1), out1, 1e-3)
...
@@ -90,12 +90,16 @@ def test_show_result_meshlab():
         torch.tensor(
             [[-1.1580, 3.3041, -0.9961, 0.3829, 0.4647, 0.5574, 1.1213]]))
     img = np.random.randn(1, 3, 608, 832)
-    K = np.array([[[529.5000, 0.0000, 365.0000], [0.0000, 529.5000, 265.0000],
-                   [0.0000, 0.0000, 1.0000]]])
-    Rt = torch.tensor([[[0.9980, 0.0058, -0.0634], [0.0058, 0.9835, 0.1808],
-                        [0.0634, -0.1808, 0.9815]]])
+    k_mat = np.array([[529.5000, 0.0000, 365.0000],
+                      [0.0000, 529.5000, 265.0000], [0.0000, 0.0000, 1.0000]])
+    rt_mat = np.array([[0.9980, 0.0058, -0.0634], [0.0058, 0.9835, 0.1808],
+                       [0.0634, -0.1808, 0.9815]])
+    rt_mat = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) @ rt_mat.transpose(
+        1, 0)
+    depth2img = k_mat @ rt_mat
     img_meta = dict(
         filename=filename,
+        depth2img=depth2img,
         pcd_horizontal_flip=False,
         pcd_vertical_flip=False,
         box_mode_3d=Box3DMode.DEPTH,
@@ -104,12 +108,8 @@ def test_show_result_meshlab():
         pcd_scale_factor=1.0,
         pts_filename=pcd,
         transformation_3d_flow=['R', 'S', 'T'])
-    calib = dict(K=K, Rt=Rt)
     data = dict(
-        points=[[torch.tensor(points)]],
-        img_metas=[[img_meta]],
-        img=[img],
-        calib=[calib])
+        points=[[torch.tensor(points)]], img_metas=[[img_meta]], img=[img])
     result = [dict(boxes_3d=box_3d, labels_3d=labels_3d, scores_3d=scores_3d)]
     tmp_dir = tempfile.TemporaryDirectory()
     temp_out_dir = tmp_dir.name
...
@@ -191,6 +191,8 @@ def test_points_conversion():
             depth_points.tensor[:, 3:]
         ],
         dim=1)
+    mat = rt_mat_provided.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
+    rt_mat_provided = mat @ rt_mat_provided.transpose(1, 0)
     cam_point_tensor_new = Coord3DMode.convert_point(
         depth_points_new,
         Coord3DMode.DEPTH,
...
@@ -141,7 +141,7 @@ def show_proj_bbox_img(idx,
             img,
             gt_bboxes,
             None,
-            example['calib'],
+            None,
             out_dir,
             filename,
             box_mode='depth',
...