Unverified commit 8538177b, authored by ChaimZhu, committed by GitHub

[Feature] Add MonoFlex Head (#1044)

parent 4590418e
@@ -81,16 +81,16 @@ class MonoFlexCoder(BaseBBoxCoder):
            torch.Tensor: Targets of orientations.
        """
        local_yaw = gt_bboxes_3d.local_yaw
        # encode local yaw (-pi ~ pi) to multibin format
-        encode_local_yaw = np.zeros(self.num_dir_bins * 2)
+        encode_local_yaw = local_yaw.new_zeros(
+            [local_yaw.shape[0], self.num_dir_bins * 2])
        bin_size = 2 * np.pi / self.num_dir_bins
        margin_size = bin_size * self.bin_margin
-        bin_centers = self.bin_centers
+        bin_centers = local_yaw.new_tensor(self.bin_centers)
        range_size = bin_size / 2 + margin_size
-        offsets = local_yaw - bin_centers.unsqueeze(0)
+        offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0)
        offsets[offsets > np.pi] = offsets[offsets > np.pi] - 2 * np.pi
        offsets[offsets < -np.pi] = offsets[offsets < -np.pi] + 2 * np.pi
@@ -98,7 +98,7 @@ class MonoFlexCoder(BaseBBoxCoder):
            offset = offsets[:, i]
            inds = abs(offset) < range_size
            encode_local_yaw[inds, i] = 1
-            encode_local_yaw[inds, i + self.num_dir_bins] = offset
+            encode_local_yaw[inds, i + self.num_dir_bins] = offset[inds]
        orientation_target = encode_local_yaw
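The rewritten encoding above vectorizes the multibin targets over a whole batch of local yaws instead of a single angle. A minimal standalone sketch of the scheme, with num_dir_bins=4 and bin_margin=np.pi / 6 as in the test configs below (the input tensor is made up):

import numpy as np
import torch

num_dir_bins = 4
bin_margin = np.pi / 6
local_yaw = torch.tensor([0.1, 2.0, -3.0])  # (N,) angles in (-pi, pi]
bin_centers = local_yaw.new_tensor([0, np.pi / 2, np.pi, -np.pi / 2])

bin_size = 2 * np.pi / num_dir_bins
margin_size = bin_size * bin_margin
range_size = bin_size / 2 + margin_size

# signed offset of every yaw to every bin center, wrapped back to (-pi, pi)
offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0)
offsets[offsets > np.pi] -= 2 * np.pi
offsets[offsets < -np.pi] += 2 * np.pi

# first num_dir_bins columns are bin hits, the rest are in-bin offsets
encoding = local_yaw.new_zeros([local_yaw.shape[0], num_dir_bins * 2])
for i in range(num_dir_bins):
    inds = offsets[:, i].abs() < range_size
    encoding[inds, i] = 1
    encoding[inds, i + num_dir_bins] = offsets[inds, i]

assert encoding.shape == (3, num_dir_bins * 2)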
@@ -164,7 +164,7 @@ class MonoFlexCoder(BaseBBoxCoder):
        pred_direct_depth_uncertainty = bbox[:, 49:50].squeeze(-1)
        # 2 dimension of offsets x keypoints (8 corners + top/bottom center)
-        pred_keypoints2d = bbox[:, 6:26]
+        pred_keypoints2d = bbox[:, 6:26].reshape(-1, 10, 2)
        # 1 dimension for depth offsets
        pred_direct_depth_offsets = bbox[:, 48:49].squeeze(-1)
@@ -273,11 +273,11 @@ class MonoFlexCoder(BaseBBoxCoder):
            raise NotImplementedError
        # (N, 3)
        centers2d_img = \
-            torch.cat(centers2d_img, depths.unsqueeze(-1), dim=1)
+            torch.cat((centers2d_img, depths.unsqueeze(-1)), dim=1)
        # (N, 4, 1)
        centers2d_extend = \
            torch.cat((centers2d_img, centers2d_img.new_ones(N, 1)),
-                      dim=1).unqueeze(-1)
+                      dim=1).unsqueeze(-1)
        locations = torch.matmul(cam2imgs_inv, centers2d_extend).squeeze(-1)
        return locations[:, :3]
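The two fixes above (tuple argument for torch.cat, unqueeze -> unsqueeze) make decode_location a plain pinhole unprojection: append the predicted depth and a homogeneous 1, then multiply by the inverted intrinsics. A hedged sketch with made-up values and identity intrinsics, under the assumption that the centers are already depth-scaled, so the homogeneous vector comes back unchanged:

import torch

N = 2
# assumed to already be depth-scaled pixel coordinates (u * d, v * d)
centers2d_img = torch.tensor([[3200., 2400.], [2500., 1500.]])
depths = torch.tensor([10., 25.])
cam2imgs_inv = torch.inverse(torch.eye(4).repeat(N, 1, 1))

# (N, 3): (u * d, v * d, d)
centers2d_img = torch.cat((centers2d_img, depths.unsqueeze(-1)), dim=1)
# (N, 4, 1): homogeneous column vectors
centers2d_extend = torch.cat(
    (centers2d_img, centers2d_img.new_ones(N, 1)), dim=1).unsqueeze(-1)
locations = torch.matmul(cam2imgs_inv, centers2d_extend).squeeze(-1)

assert locations[:, :3].shape == (N, 3)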
@@ -450,15 +450,15 @@ class MonoFlexCoder(BaseBBoxCoder):
            local_yaws = orientations
        yaws = local_yaws + rays
-        larger_idx = (yaws > np.pi).nonzero()
-        small_idx = (yaws < -np.pi).nonzero()
+        larger_idx = (yaws > np.pi).nonzero(as_tuple=False)
+        small_idx = (yaws < -np.pi).nonzero(as_tuple=False)
        if len(larger_idx) != 0:
            yaws[larger_idx] -= 2 * np.pi
        if len(small_idx) != 0:
            yaws[small_idx] += 2 * np.pi
-        larger_idx = (local_yaws > np.pi).nonzero()
-        small_idx = (local_yaws < -np.pi).nonzero()
+        larger_idx = (local_yaws > np.pi).nonzero(as_tuple=False)
+        small_idx = (local_yaws < -np.pi).nonzero(as_tuple=False)
        if len(larger_idx) != 0:
            local_yaws[larger_idx] -= 2 * np.pi
        if len(small_idx) != 0:
@@ -491,7 +491,7 @@ class MonoFlexCoder(BaseBBoxCoder):
        return bboxes2d

-    def combine_depths(depth, depth_uncertainty):
+    def combine_depths(self, depth, depth_uncertainty):
        """Combine all the predicted depths with depth uncertainty.

        Args:
...
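The hunk above only adds the missing `self` to the signature; the body of combine_depths is elided in this diff. Purely as a hedged illustration of the idea (an assumed inverse-uncertainty weighting, not necessarily the exact implementation), combining several depth estimates per instance can look like:

import torch

def combine_depths_sketch(depth, depth_uncertainty):
    # weight each estimate by its inverse uncertainty, normalized per row
    weights = 1 / depth_uncertainty.clamp(min=1e-6)
    weights = weights / weights.sum(dim=1, keepdim=True)
    return (depth * weights).sum(dim=1)

depth = torch.rand(100, 4)                   # e.g. direct + 3 keypoint depths
depth_uncertainty = torch.rand(100, 4) + 0.1
assert combine_depths_sketch(depth, depth_uncertainty).shape == (100, )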
@@ -324,8 +324,11 @@ def yaw2local(yaw, loc):
        torch.Tensor: local yaw (alpha in kitti).
    """
    local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2])
-    while local_yaw > np.pi:
-        local_yaw -= np.pi * 2
-    while local_yaw < -np.pi:
-        local_yaw += np.pi * 2
+    larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False)
+    small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False)
+    if len(larger_idx) != 0:
+        local_yaw[larger_idx] -= 2 * np.pi
+    if len(small_idx) != 0:
+        local_yaw[small_idx] += 2 * np.pi

    return local_yaw
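This rewrite matters because yaw2local now receives batched tensors: a Python `while` on a multi-element tensor raises "Boolean value of Tensor ... is ambiguous", whereas boolean indexing wraps every element at once. A quick check with made-up inputs:

import numpy as np
import torch

yaw = torch.tensor([3.0, -3.0, 0.5])
loc = torch.tensor([[-1., 0., 1.], [1., 0., 1.], [0., 0., 1.]])
local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2])  # [3.79, -3.79, 0.5]
local_yaw[local_yaw > np.pi] -= 2 * np.pi
local_yaw[local_yaw < -np.pi] += 2 * np.pi
assert (local_yaw.abs() <= np.pi).all()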
# Copyright (c) OpenMMLab. All rights reserved.
from .array_converter import ArrayConverter, array_converter
-from .gaussian import draw_heatmap_gaussian, gaussian_2d, gaussian_radius
+from .gaussian import (draw_heatmap_gaussian, ellip_gaussian2D, gaussian_2d,
+                       gaussian_radius, get_ellip_gaussian_2D)

__all__ = [
    'gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian',
-    'ArrayConverter', 'array_converter'
+    'ArrayConverter', 'array_converter', 'ellip_gaussian2D',
+    'get_ellip_gaussian_2D'
]
@@ -84,3 +84,75 @@ def gaussian_radius(det_size, min_overlap=0.5):
    sq3 = torch.sqrt(b3**2 - 4 * a3 * c3)
    r3 = (b3 + sq3) / 2
    return min(r1, r2, r3)

def get_ellip_gaussian_2D(heatmap, center, radius_x, radius_y, k=1):
    """Generate 2D ellipse gaussian heatmap.

    Args:
        heatmap (Tensor): Input heatmap, the gaussian kernel will cover on
            it and maintain the max value.
        center (list[int]): Coord of gaussian kernel's center.
        radius_x (int): X-axis radius of gaussian kernel.
        radius_y (int): Y-axis radius of gaussian kernel.
        k (int, optional): Coefficient of gaussian kernel. Default: 1.

    Returns:
        out_heatmap (Tensor): Updated heatmap covered by gaussian kernel.
    """
    diameter_x, diameter_y = 2 * radius_x + 1, 2 * radius_y + 1
    gaussian_kernel = ellip_gaussian2D((radius_x, radius_y),
                                       sigma_x=diameter_x / 6,
                                       sigma_y=diameter_y / 6,
                                       dtype=heatmap.dtype,
                                       device=heatmap.device)

    x, y = int(center[0]), int(center[1])
    height, width = heatmap.shape[0:2]

    left, right = min(x, radius_x), min(width - x, radius_x + 1)
    top, bottom = min(y, radius_y), min(height - y, radius_y + 1)

    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
    masked_gaussian = gaussian_kernel[radius_y - top:radius_y + bottom,
                                      radius_x - left:radius_x + right]
    out_heatmap = heatmap
    torch.max(
        masked_heatmap,
        masked_gaussian * k,
        out=out_heatmap[y - top:y + bottom, x - left:x + right])

    return out_heatmap

def ellip_gaussian2D(radius,
                     sigma_x,
                     sigma_y,
                     dtype=torch.float32,
                     device='cpu'):
    """Generate 2D ellipse gaussian kernel.

    Args:
        radius (tuple(int)): Ellipse radius (radius_x, radius_y) of gaussian
            kernel.
        sigma_x (int): X-axis sigma of gaussian function.
        sigma_y (int): Y-axis sigma of gaussian function.
        dtype (torch.dtype, optional): Dtype of gaussian tensor.
            Default: torch.float32.
        device (str, optional): Device of gaussian tensor.
            Default: 'cpu'.

    Returns:
        h (Tensor): Gaussian kernel with a
            ``(2 * radius_y + 1) * (2 * radius_x + 1)`` shape.
    """
    x = torch.arange(
        -radius[0], radius[0] + 1, dtype=dtype, device=device).view(1, -1)
    y = torch.arange(
        -radius[1], radius[1] + 1, dtype=dtype, device=device).view(-1, 1)

    h = (-(x * x) / (2 * sigma_x * sigma_x) - (y * y) /
         (2 * sigma_y * sigma_y)).exp()
    h[h < torch.finfo(h.dtype).eps * h.max()] = 0

    return h
@@ -7,6 +7,7 @@ from .centerpoint_head import CenterHead
from .fcos_mono3d_head import FCOSMono3DHead
from .free_anchor3d_head import FreeAnchor3DHead
from .groupfree3d_head import GroupFree3DHead
+from .monoflex_head import MonoFlexHead
from .parta2_rpn_head import PartA2RPNHead
from .pgd_head import PGDHead
from .point_rpn_head import PointRPNHead
@@ -19,5 +20,6 @@ __all__ = [
    'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead',
    'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
    'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead',
-    'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead'
+    'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead',
+    'MonoFlexHead'
]
# Copyright (c) OpenMMLab. All rights reserved.
+from .edge_fusion_module import EdgeFusionModule
from .transformer import GroupFree3DMHA
from .vote_module import VoteModule

-__all__ = ['VoteModule', 'GroupFree3DMHA']
+__all__ = ['VoteModule', 'GroupFree3DMHA', 'EdgeFusionModule']
from mmcv.cnn import ConvModule
from mmcv.runner import BaseModule
from torch import nn as nn
from torch.nn import functional as F


class EdgeFusionModule(BaseModule):
    """Edge Fusion Module for feature map.

    Args:
        out_channels (int): The number of output channels.
        feat_channels (int): The number of channels in feature map
            during edge feature fusion.
        kernel_size (int, optional): Kernel size of convolution.
            Default: 3.
        act_cfg (dict, optional): Config of activation.
            Default: dict(type='ReLU').
        norm_cfg (dict, optional): Config of normalization.
            Default: dict(type='BN1d').
    """

    def __init__(self,
                 out_channels,
                 feat_channels,
                 kernel_size=3,
                 act_cfg=dict(type='ReLU'),
                 norm_cfg=dict(type='BN1d')):
        super().__init__()
        self.edge_convs = nn.Sequential(
            ConvModule(
                feat_channels,
                feat_channels,
                kernel_size=kernel_size,
                padding=kernel_size // 2,
                conv_cfg=dict(type='Conv1d'),
                norm_cfg=norm_cfg,
                act_cfg=act_cfg),
            nn.Conv1d(feat_channels, out_channels, kernel_size=1))
        self.feat_channels = feat_channels

    def forward(self, features, fused_features, edge_indices, edge_lens,
                output_h, output_w):
        """Forward pass.

        Args:
            features (torch.Tensor): Different representative features
                for fusion.
            fused_features (torch.Tensor): Different representative
                features to be fused.
            edge_indices (torch.Tensor): Batch image edge indices.
            edge_lens (list[int]): List of edge length of each image.
            output_h (int): Height of output feature map.
            output_w (int): Width of output feature map.

        Returns:
            torch.Tensor: Fused feature maps.
        """
        batch_size = features.shape[0]
        # normalize pixel indices to [-1, 1] for grid_sample
        grid_edge_indices = edge_indices.view(batch_size, -1, 1, 2).float()
        grid_edge_indices[..., 0] = \
            grid_edge_indices[..., 0] / (output_w - 1) * 2 - 1
        grid_edge_indices[..., 1] = \
            grid_edge_indices[..., 1] / (output_h - 1) * 2 - 1

        # sample edge pixels, run 1D convs on them, then add the result
        # back at the original edge locations
        edge_features = F.grid_sample(
            features, grid_edge_indices, align_corners=True).squeeze(-1)
        edge_output = self.edge_convs(edge_features)

        for k in range(batch_size):
            edge_indice_k = edge_indices[k, :edge_lens[k]]
            fused_features[k, :, edge_indice_k[:, 1],
                           edge_indice_k[:, 0]] += edge_output[
                               k, :, :edge_lens[k]]

        return fused_features
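A hypothetical smoke test for the module above; the channel sizes and edge indices are made up, and the shapes follow the forward() docstring:

import torch

module = EdgeFusionModule(out_channels=4, feat_channels=8)
batch, num_edge, out_h, out_w = 2, 10, 16, 16
features = torch.rand(batch, 8, out_h, out_w)
fused_features = torch.rand(batch, 4, out_h, out_w)
edge_indices = torch.randint(0, out_w, (batch, num_edge, 2))
edge_lens = [10, 8]

out = module(features, fused_features, edge_indices, edge_lens,
             out_h, out_w)
assert out.shape == (batch, 4, out_h, out_w)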
@@ -4,6 +4,7 @@ import torch

def get_edge_indices(img_metas,
+                    downsample_ratio,
                     step=1,
                     pad_mode='default',
                     dtype=np.float32,
@@ -17,6 +18,7 @@ def get_edge_indices(img_metas,
    Args:
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
+        downsample_ratio (int): Downsample ratio of output feature map.
        step (int, optional): Step size used for generating
            edge indices. Default: 1.
        pad_mode (str, optional): Padding mode during data pipeline.
edge_indices_list = [] edge_indices_list = []
for i in range(len(img_metas)): for i in range(len(img_metas)):
img_shape = img_metas[i]['img_shape'] img_shape = img_metas[i]['img_shape']
pad_shape = img_metas[i]['pad_shape']
h, w = img_shape[:2] h, w = img_shape[:2]
pad_h, pad_w = pad_shape
edge_indices = [] edge_indices = []
if pad_mode == 'default': if pad_mode == 'default':
x_min = 0 x_min = 0
y_min = 0 y_min = 0
x_max, y_max = w - 1, h - 1 x_max = (w - 1) // downsample_ratio
y_max = (h - 1) // downsample_ratio
elif pad_mode == 'center':
x_min = np.ceil((pad_w - w) / 2 * downsample_ratio)
y_min = np.ceil((pad_h - h) / 2 * downsample_ratio)
x_max = x_min + w // downsample_ratio
y_max = y_min + h // downsample_ratio
else: else:
raise NotImplementedError raise NotImplementedError
......
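A quick sanity check of the 'default' branch above: with step=1, the returned indices trace the border of the valid image region on the downsampled map, so their count is that region's perimeter (assuming each border pixel is visited once). For the shapes used in the updated test below:

# 110 x 110 image at downsample_ratio 4 -> indices 0..27, a 28 x 28 region
nx = (110 - 1) // 4 + 1
ny = (110 - 1) // 4 + 1
print(2 * nx + 2 * ny - 4)  # 108

# 98 x 110 image -> a 28 x 25 region
nx, ny = (110 - 1) // 4 + 1, (98 - 1) // 4 + 1
print(2 * nx + 2 * ny - 4)  # 102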
@@ -1505,3 +1505,62 @@ def test_pgd_head():
    assert results[0][2].shape == torch.Size([20])
    assert results[0][3] is None
    assert results[0][4].shape == torch.Size([20, 5])

def test_monoflex_head():
    head_cfg = dict(
        type='MonoFlexHead',
        num_classes=3,
        in_channels=64,
        use_edge_fusion=True,
        edge_fusion_inds=[(1, 0)],
        edge_heatmap_ratio=1 / 8,
        stacked_convs=0,
        feat_channels=64,
        use_direction_classifier=False,
        diff_rad_by_sin=False,
        pred_attrs=False,
        pred_velo=False,
        dir_offset=0,
        strides=None,
        group_reg_dims=((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ),
                        (1, )),
        cls_branch=(256, ),
        reg_branch=((256, ), (256, ), (256, ), (256, ), (256, ), (256, ),
                    (256, ), (256, )),
        num_attrs=0,
        bbox_code_size=7,
        dir_branch=(),
        attr_branch=(),
        bbox_coder=dict(
            type='MonoFlexCoder',
            depth_mode='exp',
            base_depth=(26.494627, 16.05988),
            depth_range=[0.1, 100],
            combine_depth=True,
            uncertainty_range=[-10, 10],
            base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367, 0.1022),
                       (0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427),
                       (1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)),
            dims_mode='linear',
            multibin=True,
            num_dir_bins=4,
            bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2],
            bin_margin=np.pi / 6,
            code_size=7),
        conv_bias=True,
        dcn_on_last_conv=False)

    self = build_head(head_cfg)

    feats = [torch.rand([2, 64, 32, 32], dtype=torch.float32)]
    input_metas = [
        dict(img_shape=(110, 110), pad_shape=(128, 128)),
        dict(img_shape=(98, 110), pad_shape=(128, 128))
    ]
    cls_score, out_reg = self(feats, input_metas)

    assert cls_score[0].shape == torch.Size([2, 3, 32, 32])
    assert out_reg[0].shape == torch.Size([2, 50, 32, 32])
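As a cross-check of the asserted output shapes: the 3 classification channels come from num_classes, and the 50 regression channels are the sum of group_reg_dims, which matches the slice layout in the coder hunks above (channels 6:26 are the 10 x 2 keypoint offsets):

group_reg_dims = ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, ))
assert sum(sum(group) for group in group_reg_dims) == 50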
# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
import torch
from mmcv.cnn import Scale
from torch import nn as nn
@@ -596,3 +597,69 @@ def test_smoke_bbox_coder():
    locations = torch.tensor([[15., 2., 1.], [15., 2., -1.]])
    orientations = bbox_coder._decode_orientation(ori_vector, locations)
    assert orientations.shape == torch.Size([2, 1])

def test_monoflex_bbox_coder():
    bbox_coder_cfg = dict(
        type='MonoFlexCoder',
        depth_mode='exp',
        base_depth=(26.494627, 16.05988),
        depth_range=[0.1, 100],
        combine_depth=True,
        uncertainty_range=[-10, 10],
        base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367, 0.1022),
                   (0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427),
                   (1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)),
        dims_mode='linear',
        multibin=True,
        num_dir_bins=4,
        bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2],
        bin_margin=np.pi / 6,
        code_size=7)
    bbox_coder = build_bbox_coder(bbox_coder_cfg)
    gt_bboxes_3d = CameraInstance3DBoxes(torch.rand([6, 7]))
    orientation_target = bbox_coder.encode(gt_bboxes_3d)
    assert orientation_target.shape == torch.Size([6, 8])

    regression = torch.rand([100, 50])
    base_centers2d = torch.rand([100, 2])
    labels = torch.ones([100])
    downsample_ratio = 4
    cam2imgs = torch.rand([100, 4, 4])

    preds = bbox_coder.decode(regression, base_centers2d, labels,
                              downsample_ratio, cam2imgs)

    assert preds['bboxes2d'].shape == torch.Size([100, 4])
    assert preds['dimensions'].shape == torch.Size([100, 3])
    assert preds['offsets2d'].shape == torch.Size([100, 2])
    assert preds['keypoints2d'].shape == torch.Size([100, 10, 2])
    assert preds['orientations'].shape == torch.Size([100, 16])
    assert preds['direct_depth'].shape == torch.Size([100])
    assert preds['keypoints_depth'].shape == torch.Size([100, 3])
    assert preds['combined_depth'].shape == torch.Size([100])
    assert preds['direct_depth_uncertainty'].shape == torch.Size([100])
    assert preds['keypoints_depth_uncertainty'].shape == torch.Size([100, 3])

    offsets_2d = torch.randn([100, 2])
    depths = torch.randn([100])
    locations = bbox_coder.decode_location(base_centers2d, offsets_2d, depths,
                                           cam2imgs, downsample_ratio)
    assert locations.shape == torch.Size([100, 3])

    orientations = torch.randn([100, 16])
    yaws, local_yaws = bbox_coder.decode_orientation(orientations, locations)
    assert yaws.shape == torch.Size([100])
    assert local_yaws.shape == torch.Size([100])
@@ -195,11 +195,15 @@ def test_points_img2cam():

def test_generate_edge_indices():
-    img_metas = [dict(img_shape=[300, 400]), dict(img_shape=[500, 450])]
-    edge_indices_list = get_edge_indices(img_metas)
+    input_metas = [
+        dict(img_shape=(110, 110), pad_shape=(128, 128)),
+        dict(img_shape=(98, 110), pad_shape=(128, 128))
+    ]
+    downsample_ratio = 4
+    edge_indices_list = get_edge_indices(input_metas, downsample_ratio)

-    assert edge_indices_list[0].shape[0] == 1396
-    assert edge_indices_list[1].shape[0] == 1896
+    assert edge_indices_list[0].shape[0] == 108
+    assert edge_indices_list[1].shape[0] == 102


def test_truncation_hanlde():
...