"vscode:/vscode.git/clone" did not exist on "ccae9dbdeb71ccb2809bc41b10dd3440ee1d037a"
Unverified Commit c7659a12 authored by Danila Rukhovich's avatar Danila Rukhovich Committed by GitHub
Browse files

[Enhance] Update PointFusion (#791)

* update point fusion

* remove LIDAR hardcode

* move get_proj_mat_by_coord_type to utils

* fix lint

* remove todo

* fix lint
parent fc301b98
......@@ -5,12 +5,13 @@ from .cam_box3d import CameraInstance3DBoxes
from .coord_3d_mode import Coord3DMode
from .depth_box3d import DepthInstance3DBoxes
from .lidar_box3d import LiDARInstance3DBoxes
from .utils import (get_box_type, limit_period, mono_cam_box2vis,
points_cam2img, rotation_3d_in_axis, xywhr2xyxyr)
from .utils import (get_box_type, get_proj_mat_by_coord_type, limit_period,
mono_cam_box2vis, points_cam2img, rotation_3d_in_axis,
xywhr2xyxyr)
__all__ = [
'Box3DMode', 'BaseInstance3DBoxes', 'LiDARInstance3DBoxes',
'CameraInstance3DBoxes', 'DepthInstance3DBoxes', 'xywhr2xyxyr',
'get_box_type', 'rotation_3d_in_axis', 'limit_period', 'points_cam2img',
'Coord3DMode', 'mono_cam_box2vis'
'Coord3DMode', 'mono_cam_box2vis', 'get_proj_mat_by_coord_type'
]
......@@ -195,3 +195,20 @@ def mono_cam_box2vis(cam_box):
cam_box, box_dim=cam_box.shape[-1], origin=(0.5, 0.5, 0.5))
return cam_box
def get_proj_mat_by_coord_type(img_meta, coord_type):
    """Obtain the projection matrix for a given coordinate system.

    Looks up the appropriate transformation matrix in ``img_meta``
    ('lidar2img', 'depth2img' or 'cam2img') based on the coordinate type.

    Args:
        img_meta (dict): Meta info containing the projection matrices.
        coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
            Can be case-insensitive.

    Returns:
        torch.Tensor: Transformation matrix.
    """
    coord_type = coord_type.upper()
    mapping = {'LIDAR': 'lidar2img', 'DEPTH': 'depth2img', 'CAMERA': 'cam2img'}
    assert coord_type in mapping, f'Invalid coord_type: {coord_type}'
    return img_meta[mapping[coord_type]]
......@@ -61,7 +61,8 @@ class ImVoxelNet(BaseDetector):
img_meta,
img_features=feature[None, ...],
points=points,
lidar2img_rt=points.new_tensor(img_meta['lidar2img']),
proj_mat=points.new_tensor(img_meta['lidar2img']),
coord_type='LIDAR',
img_scale_factor=img_scale_factor,
img_crop_offset=img_crop_offset,
img_flip=img_flip,
......
......@@ -5,12 +5,12 @@ from functools import partial
from mmdet3d.core.points import get_points_type
def apply_3d_transformation(pcd, coords_type, img_meta, reverse=False):
def apply_3d_transformation(pcd, coord_type, img_meta, reverse=False):
"""Apply transformation to input point cloud.
Args:
pcd (torch.Tensor): The point cloud to be transformed.
coords_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'
coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
img_meta(dict): Meta info regarding data transformation.
reverse (bool): Reversed transformation or not.
......@@ -54,7 +54,7 @@ def apply_3d_transformation(pcd, coords_type, img_meta, reverse=False):
if 'transformation_3d_flow' in img_meta else []
pcd = pcd.clone() # prevent inplace modification
pcd = get_points_type(coords_type)(pcd)
pcd = get_points_type(coord_type)(pcd)
horizontal_flip_func = partial(pcd.flip, bev_direction='horizontal') \
if pcd_horizontal_flip else lambda: None
......
......@@ -5,15 +5,17 @@ from mmcv.runner import BaseModule
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.core.bbox.structures import (get_proj_mat_by_coord_type,
points_cam2img)
from ..builder import FUSION_LAYERS
from . import apply_3d_transformation
def point_sample(
img_meta,
def point_sample(img_meta,
img_features,
points,
lidar2img_rt,
proj_mat,
coord_type,
img_scale_factor,
img_crop_offset,
img_flip,
......@@ -21,15 +23,15 @@ def point_sample(
img_shape,
aligned=True,
padding_mode='zeros',
align_corners=True,
):
align_corners=True):
"""Obtain image features using points.
Args:
img_meta (dict): Meta info.
img_features (torch.Tensor): 1 x C x H x W image features.
points (torch.Tensor): Nx3 point cloud in LiDAR coordinates.
lidar2img_rt (torch.Tensor): 4x4 transformation matrix.
proj_mat (torch.Tensor): 4x4 transformation matrix.
coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
img_scale_factor (torch.Tensor): Scale factor with shape of \
(w_scale, h_scale).
img_crop_offset (torch.Tensor): Crop offset used to crop \
......@@ -51,19 +53,11 @@ def point_sample(
"""
# apply transformation based on info in img_meta
points = apply_3d_transformation(points, 'LIDAR', img_meta, reverse=True)
# project points from velo coordinate to camera coordinate
num_points = points.shape[0]
pts_4d = torch.cat([points, points.new_ones(size=(num_points, 1))], dim=-1)
pts_2d = pts_4d @ lidar2img_rt.t()
# cam_points is Tensor of Nx4 whose last column is 1
# transform camera coordinate to image coordinate
points = apply_3d_transformation(
points, coord_type, img_meta, reverse=True)
pts_2d[:, 2] = torch.clamp(pts_2d[:, 2], min=1e-5)
pts_2d[:, 0] /= pts_2d[:, 2]
pts_2d[:, 1] /= pts_2d[:, 2]
# project points to camera coordinate
pts_2d = points_cam2img(points, proj_mat)
# img transformation: scale -> crop -> flip
# the image is resized by img_scale_factor
......@@ -108,6 +102,8 @@ class PointFusion(BaseModule):
mid_channels (int): Channels of middle layers
out_channels (int): Channels of output fused features
img_levels (int, optional): Number of image levels. Defaults to 3.
coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
Defaults to 'LIDAR'.
conv_cfg (dict, optional): Dict config of conv layers of middle
layers. Defaults to None.
norm_cfg (dict, optional): Dict config of norm layers of middle
......@@ -137,6 +133,7 @@ class PointFusion(BaseModule):
mid_channels,
out_channels,
img_levels=3,
coord_type='LIDAR',
conv_cfg=None,
norm_cfg=None,
act_cfg=None,
......@@ -158,6 +155,7 @@ class PointFusion(BaseModule):
assert len(img_channels) == len(img_levels)
self.img_levels = img_levels
self.coord_type = coord_type
self.act_cfg = act_cfg
self.activate_out = activate_out
self.fuse_out = fuse_out
......@@ -289,13 +287,15 @@ class PointFusion(BaseModule):
img_crop_offset = (
pts.new_tensor(img_meta['img_crop_offset'])
if 'img_crop_offset' in img_meta.keys() else 0)
proj_mat = get_proj_mat_by_coord_type(img_meta, self.coord_type)
img_pts = point_sample(
img_meta,
img_feats,
pts,
pts.new_tensor(img_meta['lidar2img']),
img_scale_factor,
img_crop_offset,
img_meta=img_meta,
img_features=img_feats,
points=pts,
proj_mat=pts.new_tensor(proj_mat),
coord_type=self.coord_type,
img_scale_factor=img_scale_factor,
img_crop_offset=img_crop_offset,
img_flip=img_flip,
img_pad_shape=img_meta['input_shape'][:2],
img_shape=img_meta['img_shape'][:2],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment