"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "e013bab5674e8d35d1998a050e1fa239ac9a747d"
Unverified commit 8538177b authored by ChaimZhu, committed by GitHub

[Feature] Add MonoFlex Head (#1044)

parent 4590418e
@@ -81,16 +81,16 @@ class MonoFlexCoder(BaseBBoxCoder):
            torch.Tensor: Targets of orientations.
        """
        local_yaw = gt_bboxes_3d.local_yaw

        # encode local yaw (-pi ~ pi) to multibin format
-        encode_local_yaw = np.zeros(self.num_dir_bins * 2)
+        encode_local_yaw = local_yaw.new_zeros(
+            [local_yaw.shape[0], self.num_dir_bins * 2])
        bin_size = 2 * np.pi / self.num_dir_bins
        margin_size = bin_size * self.bin_margin

-        bin_centers = self.bin_centers
+        bin_centers = local_yaw.new_tensor(self.bin_centers)
        range_size = bin_size / 2 + margin_size

-        offsets = local_yaw - bin_centers.unsqueeze(0)
+        offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0)
        offsets[offsets > np.pi] = offsets[offsets > np.pi] - 2 * np.pi
        offsets[offsets < -np.pi] = offsets[offsets < -np.pi] + 2 * np.pi

@@ -98,7 +98,7 @@ class MonoFlexCoder(BaseBBoxCoder):
            offset = offsets[:, i]
            inds = abs(offset) < range_size
            encode_local_yaw[inds, i] = 1
-            encode_local_yaw[inds, i + self.num_dir_bins] = offset
+            encode_local_yaw[inds, i + self.num_dir_bins] = offset[inds]

        orientation_target = encode_local_yaw
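# (Illustrative sketch, not part of this commit's diff: the multibin encoding
#  produced by the hunk above, run standalone. A local yaw activates every bin
#  whose centre lies within bin_size / 2 + margin_size, so neighbouring bins
#  can both be hot. num_dir_bins=4 and bin_margin=np.pi / 6 follow the test
#  config added later in this commit.)
import numpy as np
import torch

num_dir_bins, bin_margin = 4, np.pi / 6
bin_centers = torch.tensor([0, np.pi / 2, np.pi, -np.pi / 2])
local_yaw = torch.tensor([0.3, -2.9])  # (N, ) local yaws in (-pi, pi]

bin_size = 2 * np.pi / num_dir_bins
range_size = bin_size / 2 + bin_size * bin_margin
offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0)
offsets[offsets > np.pi] -= 2 * np.pi
offsets[offsets < -np.pi] += 2 * np.pi

encoding = local_yaw.new_zeros([local_yaw.shape[0], num_dir_bins * 2])
for i in range(num_dir_bins):
    inds = offsets[:, i].abs() < range_size
    encoding[inds, i] = 1  # first half: bin confidence targets
    encoding[inds, i + num_dir_bins] = offsets[inds, i]  # second half: offsets
print(encoding)  # shape (N, 8)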
@@ -164,7 +164,7 @@ class MonoFlexCoder(BaseBBoxCoder):
        pred_direct_depth_uncertainty = bbox[:, 49:50].squeeze(-1)

        # 2 dimension of offsets x keypoints (8 corners + top/bottom center)
-        pred_keypoints2d = bbox[:, 6:26]
+        pred_keypoints2d = bbox[:, 6:26].reshape(-1, 10, 2)

        # 1 dimension for depth offsets
        pred_direct_depth_offsets = bbox[:, 48:49].squeeze(-1)
@@ -273,11 +273,11 @@ class MonoFlexCoder(BaseBBoxCoder):
            raise NotImplementedError

        # (N, 3)
        centers2d_img = \
-            torch.cat(centers2d_img, depths.unsqueeze(-1), dim=1)
+            torch.cat((centers2d_img, depths.unsqueeze(-1)), dim=1)
        # (N, 4, 1)
        centers2d_extend = \
            torch.cat((centers2d_img, centers2d_img.new_ones(N, 1)),
-                      dim=1).unqueeze(-1)
+                      dim=1).unsqueeze(-1)

        locations = torch.matmul(cam2imgs_inv, centers2d_extend).squeeze(-1)

        return locations[:, :3]
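# (Illustrative sketch, not part of this commit's diff: the pinhole
#  back-projection that decode_location performs. The projected centre is
#  first scaled by its depth (a step just above the visible hunk), a
#  homogeneous 1 is appended, and the inverse 4x4 intrinsic maps it back to
#  camera coordinates. The intrinsic values below are made up.)
import torch

cam2img = torch.eye(4)
cam2img[0, 0] = cam2img[1, 1] = 721.5  # assumed focal length
cam2img[0, 2], cam2img[1, 2] = 609.5, 172.8  # assumed principal point

centers2d = torch.tensor([[650.0, 180.0]])  # (N, 2) projected 3D centres
depths = torch.tensor([12.0])  # (N, )

centers2d_img = centers2d * depths.unsqueeze(-1)  # scale by depth
centers2d_img = torch.cat((centers2d_img, depths.unsqueeze(-1)), dim=1)
centers2d_extend = torch.cat(
    (centers2d_img, centers2d_img.new_ones(centers2d_img.shape[0], 1)),
    dim=1).unsqueeze(-1)  # (N, 4, 1)
cam2imgs_inv = torch.inverse(cam2img).unsqueeze(0)  # broadcast over N
locations = torch.matmul(cam2imgs_inv, centers2d_extend).squeeze(-1)
print(locations[:, :3])  # (N, 3) xyz in camera frame, z equals the depth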
@@ -450,15 +450,15 @@ class MonoFlexCoder(BaseBBoxCoder):
        local_yaws = orientations
        yaws = local_yaws + rays

-        larger_idx = (yaws > np.pi).nonzero()
-        small_idx = (yaws < -np.pi).nonzero()
+        larger_idx = (yaws > np.pi).nonzero(as_tuple=False)
+        small_idx = (yaws < -np.pi).nonzero(as_tuple=False)
        if len(larger_idx) != 0:
            yaws[larger_idx] -= 2 * np.pi
        if len(small_idx) != 0:
            yaws[small_idx] += 2 * np.pi

-        larger_idx = (local_yaws > np.pi).nonzero()
-        small_idx = (local_yaws < -np.pi).nonzero()
+        larger_idx = (local_yaws > np.pi).nonzero(as_tuple=False)
+        small_idx = (local_yaws < -np.pi).nonzero(as_tuple=False)
        if len(larger_idx) != 0:
            local_yaws[larger_idx] -= 2 * np.pi
        if len(small_idx) != 0:
@@ -491,7 +491,7 @@ class MonoFlexCoder(BaseBBoxCoder):

        return bboxes2d

-    def combine_depths(depth, depth_uncertainty):
+    def combine_depths(self, depth, depth_uncertainty):
        """Combine all the prediced depths with depth uncertainty.

        Args:
        ...
@@ -324,8 +324,11 @@ def yaw2local(yaw, loc):
        torch.Tensor: local yaw (alpha in kitti).
    """
    local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2])
-    while local_yaw > np.pi:
-        local_yaw -= np.pi * 2
-    while local_yaw < -np.pi:
-        local_yaw += np.pi * 2
+    larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False)
+    small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False)
+    if len(larger_idx) != 0:
+        local_yaw[larger_idx] -= 2 * np.pi
+    if len(small_idx) != 0:
+        local_yaw[small_idx] += 2 * np.pi

    return local_yaw
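# (Illustrative sketch, not part of this commit's diff: the yaw -> local yaw
#  (alpha) relation that yaw2local implements, alpha = yaw - atan2(x, z),
#  followed by the same vectorised wrap into (-pi, pi] that replaces the old
#  while loops. Numbers are made up.)
import numpy as np
import torch

yaw = torch.tensor([3.0, -3.0])
loc = torch.tensor([[-5.0, 1.0, 8.0], [5.0, 1.0, 8.0]])  # (x, y, z)

local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2])
larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False)
small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False)
if len(larger_idx) != 0:
    local_yaw[larger_idx] -= 2 * np.pi
if len(small_idx) != 0:
    local_yaw[small_idx] += 2 * np.pi
print(local_yaw)  # both values wrapped back into (-pi, pi]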
# Copyright (c) OpenMMLab. All rights reserved.
from .array_converter import ArrayConverter, array_converter
-from .gaussian import draw_heatmap_gaussian, gaussian_2d, gaussian_radius
+from .gaussian import (draw_heatmap_gaussian, ellip_gaussian2D, gaussian_2d,
+                       gaussian_radius, get_ellip_gaussian_2D)

__all__ = [
    'gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian',
-    'ArrayConverter', 'array_converter'
+    'ArrayConverter', 'array_converter', 'ellip_gaussian2D',
+    'get_ellip_gaussian_2D'
]

@@ -84,3 +84,75 @@ def gaussian_radius(det_size, min_overlap=0.5):
    sq3 = torch.sqrt(b3**2 - 4 * a3 * c3)
    r3 = (b3 + sq3) / 2
    return min(r1, r2, r3)
def get_ellip_gaussian_2D(heatmap, center, radius_x, radius_y, k=1):
"""Generate 2D ellipse gaussian heatmap.
Args:
heatmap (Tensor): Input heatmap, the gaussian kernel will cover on
it and maintain the max value.
center (list[int]): Coord of gaussian kernel's center.
radius_x (int): X-axis radius of gaussian kernel.
radius_y (int): Y-axis radius of gaussian kernel.
k (int, optional): Coefficient of gaussian kernel. Default: 1.
Returns:
out_heatmap (Tensor): Updated heatmap covered by gaussian kernel.
"""
diameter_x, diameter_y = 2 * radius_x + 1, 2 * radius_y + 1
gaussian_kernel = ellip_gaussian2D((radius_x, radius_y),
sigma_x=diameter_x / 6,
sigma_y=diameter_y / 6,
dtype=heatmap.dtype,
device=heatmap.device)
x, y = int(center[0]), int(center[1])
height, width = heatmap.shape[0:2]
left, right = min(x, radius_x), min(width - x, radius_x + 1)
top, bottom = min(y, radius_y), min(height - y, radius_y + 1)
masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
masked_gaussian = gaussian_kernel[radius_y - top:radius_y + bottom,
radius_x - left:radius_x + right]
out_heatmap = heatmap
torch.max(
masked_heatmap,
masked_gaussian * k,
out=out_heatmap[y - top:y + bottom, x - left:x + right])
return out_heatmap
def ellip_gaussian2D(radius,
sigma_x,
sigma_y,
dtype=torch.float32,
device='cpu'):
"""Generate 2D ellipse gaussian kernel.
Args:
radius (tuple(int)): Ellipse radius (radius_x, radius_y) of gaussian
kernel.
sigma_x (int): X-axis sigma of gaussian function.
sigma_y (int): Y-axis sigma of gaussian function.
dtype (torch.dtype, optional): Dtype of gaussian tensor.
Default: torch.float32.
device (str, optional): Device of gaussian tensor.
Default: 'cpu'.
Returns:
h (Tensor): Gaussian kernel with a
``(2 * radius_y + 1) * (2 * radius_x + 1)`` shape.
"""
x = torch.arange(
-radius[0], radius[0] + 1, dtype=dtype, device=device).view(1, -1)
y = torch.arange(
-radius[1], radius[1] + 1, dtype=dtype, device=device).view(-1, 1)
h = (-(x * x) / (2 * sigma_x * sigma_x) - (y * y) /
(2 * sigma_y * sigma_y)).exp()
h[h < torch.finfo(h.dtype).eps * h.max()] = 0
return h
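# (Illustrative sketch, not part of this commit's diff: drawing an elliptical
#  gaussian for a truncated object whose centre sits near the image border,
#  so the x-radius is much smaller than the y-radius. Values are arbitrary;
#  both helpers are exported via mmdet3d.core.utils in this commit.)
import torch
from mmdet3d.core.utils import get_ellip_gaussian_2D

heatmap = torch.zeros((48, 160))  # (H, W) heatmap for a single class
center = [3, 20]  # (x, y), close to the left edge
get_ellip_gaussian_2D(heatmap, center, radius_x=2, radius_y=8)
print(heatmap.max(), heatmap[20, 3])  # peak of 1.0 at the centre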
@@ -7,6 +7,7 @@ from .centerpoint_head import CenterHead
from .fcos_mono3d_head import FCOSMono3DHead
from .free_anchor3d_head import FreeAnchor3DHead
from .groupfree3d_head import GroupFree3DHead
+from .monoflex_head import MonoFlexHead
from .parta2_rpn_head import PartA2RPNHead
from .pgd_head import PGDHead
from .point_rpn_head import PointRPNHead
@@ -19,5 +20,6 @@ __all__ = [
    'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead',
    'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
    'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead',
-    'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead'
+    'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead',
+    'MonoFlexHead'
]
import torch
from mmcv.cnn import xavier_init
from torch import nn as nn
from mmdet3d.core.utils import get_ellip_gaussian_2D
from mmdet3d.models.model_utils import EdgeFusionModule
from mmdet3d.models.utils import (filter_outside_objs, get_edge_indices,
get_keypoints, handle_proj_objs)
from mmdet.core import multi_apply
from mmdet.core.bbox.builder import build_bbox_coder
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.utils import gaussian_radius, gen_gaussian_target
from mmdet.models.utils.gaussian_target import (get_local_maximum,
get_topk_from_heatmap,
transpose_and_gather_feat)
from .anchor_free_mono3d_head import AnchorFreeMono3DHead
@HEADS.register_module()
class MonoFlexHead(AnchorFreeMono3DHead):
r"""MonoFlex head used in `MonoFlex <https://arxiv.org/abs/2104.02323>`_
.. code-block:: none
/ --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> cls
|
| --> 3 x 3 conv --> 1 x 1 conv --> 2d bbox
|
| --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> 2d offsets
|
| --> 3 x 3 conv --> 1 x 1 conv --> keypoints offsets
|
| --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
feature
| --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
|
| --> 3 x 3 conv --> 1 x 1 conv --> 3d dimensions
|
| |--- 1 x 1 conv --> ori cls
| --> 3 x 3 conv --|
| |--- 1 x 1 conv --> ori offsets
|
| --> 3 x 3 conv --> 1 x 1 conv --> depth
|
\ --> 3 x 3 conv --> 1 x 1 conv --> depth uncertainty
Args:
use_edge_fusion (bool): Whether to use edge fusion module while
feature extraction.
edge_fusion_inds (list[tuple]): Indices of feature to use edge fusion.
edge_heatmap_ratio (float): Ratio of generating target heatmap.
filter_outside_objs (bool, optional): Whether to filter the
outside objects. Default: True.
        loss_cls (dict, optional): Config of classification loss.
            Default: loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0).
        loss_bbox (dict, optional): Config of localization loss.
            Default: loss_bbox=dict(type='IoULoss', loss_weight=0.1).
        loss_dir (dict, optional): Config of direction classification loss.
            Default: dict(type='MultiBinLoss', loss_weight=0.1).
        loss_keypoints (dict, optional): Config of keypoints loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_dims (dict, optional): Config of dimensions loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_offsets2d (dict, optional): Config of offsets2d loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_direct_depth (dict, optional): Config of directly regressed depth loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_keypoints_depth (dict, optional): Config of keypoints decoded depth loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
        loss_combined_depth (dict, optional): Config of combined depth loss.
            Default: dict(type='L1Loss', loss_weight=0.1).
loss_attr (dict, optional): Config of attribute classification loss.
In MonoFlex, Default: None.
bbox_coder (dict, optional): Bbox coder for encoding and decoding boxes.
Default: dict(type='MonoFlexCoder', code_size=7).
norm_cfg (dict, optional): Dictionary to construct and config norm layer.
Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
init_cfg (dict): Initialization config dict. Default: None.
""" # noqa: E501
def __init__(self,
num_classes,
in_channels,
use_edge_fusion,
edge_fusion_inds,
edge_heatmap_ratio,
filter_outside_objs=True,
loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0),
loss_bbox=dict(type='IoULoss', loss_weight=0.1),
loss_dir=dict(type='MultiBinLoss', loss_weight=0.1),
loss_keypoints=dict(type='L1Loss', loss_weight=0.1),
loss_dims=dict(type='L1Loss', loss_weight=0.1),
loss_offsets2d=dict(type='L1Loss', loss_weight=0.1),
loss_direct_depth=dict(type='L1Loss', loss_weight=0.1),
loss_keypoints_depth=dict(type='L1Loss', loss_weight=0.1),
loss_combined_depth=dict(type='L1Loss', loss_weight=0.1),
loss_attr=None,
bbox_coder=dict(type='MonoFlexCoder', code_size=7),
norm_cfg=dict(type='BN'),
init_cfg=None,
init_bias=-2.19,
**kwargs):
self.use_edge_fusion = use_edge_fusion
self.edge_fusion_inds = edge_fusion_inds
super().__init__(
num_classes,
in_channels,
loss_cls=loss_cls,
loss_bbox=loss_bbox,
loss_dir=loss_dir,
loss_attr=loss_attr,
norm_cfg=norm_cfg,
init_cfg=init_cfg,
**kwargs)
self.filter_outside_objs = filter_outside_objs
self.edge_heatmap_ratio = edge_heatmap_ratio
self.init_bias = init_bias
self.loss_dir = build_loss(loss_dir)
self.loss_keypoints = build_loss(loss_keypoints)
self.loss_dims = build_loss(loss_dims)
self.loss_offsets2d = build_loss(loss_offsets2d)
self.loss_direct_depth = build_loss(loss_direct_depth)
self.loss_keypoints_depth = build_loss(loss_keypoints_depth)
self.loss_combined_depth = build_loss(loss_combined_depth)
self.bbox_coder = build_bbox_coder(bbox_coder)
def _init_edge_module(self):
"""Initialize edge fusion module for feature extraction."""
self.edge_fuse_cls = EdgeFusionModule(self.num_classes, 256)
for i in range(len(self.edge_fusion_inds)):
reg_inds, out_inds = self.edge_fusion_inds[i]
out_channels = self.group_reg_dims[reg_inds][out_inds]
fusion_layer = EdgeFusionModule(out_channels, 256)
layer_name = f'edge_fuse_reg_{reg_inds}_{out_inds}'
self.add_module(layer_name, fusion_layer)
def init_weights(self):
"""Initialize weights."""
super().init_weights()
self.conv_cls.bias.data.fill_(self.init_bias)
xavier_init(self.conv_regs[4][0], gain=0.01)
xavier_init(self.conv_regs[7][0], gain=0.01)
for m in self.conv_regs.modules():
if isinstance(m, nn.Conv2d):
if m.bias is not None:
nn.init.constant_(m.bias, 0)
def _init_predictor(self):
"""Initialize predictor layers of the head."""
self.conv_cls_prev = self._init_branch(
conv_channels=self.cls_branch,
conv_strides=(1, ) * len(self.cls_branch))
self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels,
1)
# init regression head
self.conv_reg_prevs = nn.ModuleList()
# init output head
self.conv_regs = nn.ModuleList()
# group_reg_dims:
# ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, ))
for i in range(len(self.group_reg_dims)):
reg_dims = self.group_reg_dims[i]
reg_branch_channels = self.reg_branch[i]
out_channel = self.out_channels[i]
reg_list = nn.ModuleList()
if len(reg_branch_channels) > 0:
self.conv_reg_prevs.append(
self._init_branch(
conv_channels=reg_branch_channels,
conv_strides=(1, ) * len(reg_branch_channels)))
for reg_dim in reg_dims:
reg_list.append(nn.Conv2d(out_channel, reg_dim, 1))
self.conv_regs.append(reg_list)
else:
self.conv_reg_prevs.append(None)
for reg_dim in reg_dims:
reg_list.append(nn.Conv2d(self.feat_channels, reg_dim, 1))
self.conv_regs.append(reg_list)
def _init_layers(self):
"""Initialize layers of the head."""
self._init_predictor()
if self.use_edge_fusion:
self._init_edge_module()
def forward_train(self, x, input_metas, gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels,
gt_bboxes_ignore, proposal_cfg, **kwargs):
"""
Args:
x (list[Tensor]): Features from FPN.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes (list[Tensor]): Ground truth bboxes of the image,
shape (num_gts, 4).
gt_labels (list[Tensor]): Ground truth labels of each box,
shape (num_gts,).
gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image,
shape (num_gts, self.bbox_code_size).
gt_labels_3d (list[Tensor]): 3D ground truth labels of each box,
shape (num_gts,).
centers2d (list[Tensor]): Projected 3D center of each box,
shape (num_gts, 2).
depths (list[Tensor]): Depth of projected 3D center of each box,
shape (num_gts,).
attr_labels (list[Tensor]): Attribute labels of each box,
shape (num_gts,).
gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
proposal_cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used
Returns:
tuple:
losses: (dict[str, Tensor]): A dictionary of loss components.
proposal_list (list[Tensor]): Proposals of each image.
"""
outs = self(x, input_metas)
if gt_labels is None:
loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths,
attr_labels, input_metas)
else:
loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths, attr_labels,
input_metas)
losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
if proposal_cfg is None:
return losses
else:
proposal_list = self.get_bboxes(
*outs, input_metas, cfg=proposal_cfg)
return losses, proposal_list
def forward(self, feats, input_metas):
"""Forward features from the upstream network.
Args:
feats (list[Tensor]): Features from the upstream network, each is
a 4D-tensor.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
tuple:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * bbox_code_size.
"""
mlvl_input_metas = [input_metas for i in range(len(feats))]
return multi_apply(self.forward_single, feats, mlvl_input_metas)
def forward_single(self, x, input_metas):
"""Forward features of a single scale level.
Args:
x (Tensor): Feature maps from a specific FPN feature level.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
tuple: Scores for each class, bbox predictions.
"""
img_h, img_w = input_metas[0]['pad_shape'][:2]
batch_size, _, feat_h, feat_w = x.shape
downsample_ratio = img_h / feat_h
for conv_cls_prev_layer in self.conv_cls_prev:
cls_feat = conv_cls_prev_layer(x)
out_cls = self.conv_cls(cls_feat)
if self.use_edge_fusion:
# calculate the edge indices for the batch data
edge_indices_list = get_edge_indices(
input_metas, downsample_ratio, device=x.device)
edge_lens = [
edge_indices.shape[0] for edge_indices in edge_indices_list
]
max_edge_len = max(edge_lens)
edge_indices = x.new_zeros((batch_size, max_edge_len, 2),
dtype=torch.long)
for i in range(batch_size):
edge_indices[i, :edge_lens[i]] = edge_indices_list[i]
# cls feature map edge fusion
out_cls = self.edge_fuse_cls(cls_feat, out_cls, edge_indices,
edge_lens, feat_h, feat_w)
bbox_pred = []
for i in range(len(self.group_reg_dims)):
reg_feat = x.clone()
# feature regression head
if len(self.reg_branch[i]) > 0:
for conv_reg_prev_layer in self.conv_reg_prevs[i]:
reg_feat = conv_reg_prev_layer(reg_feat)
for j, conv_reg in enumerate(self.conv_regs[i]):
out_reg = conv_reg(reg_feat)
# Use Edge Fusion Module
if self.use_edge_fusion and (i, j) in self.edge_fusion_inds:
# reg feature map edge fusion
out_reg = getattr(self, 'edge_fuse_reg_{}_{}'.format(
i, j))(reg_feat, out_reg, edge_indices, edge_lens,
feat_h, feat_w)
bbox_pred.append(out_reg)
bbox_pred = torch.cat(bbox_pred, dim=1)
cls_score = out_cls.sigmoid() # turn to 0-1
cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4)
return cls_score, bbox_pred
def get_bboxes(self, cls_scores, bbox_preds, input_metas):
"""Generate bboxes from bbox head predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level.
bbox_preds (list[Tensor]): Box regression for each scale.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
rescale (bool): If True, return boxes in original image space.
Returns:
list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]:
Each item in result_list is 4-tuple.
"""
assert len(cls_scores) == len(bbox_preds) == 1
cam2imgs = torch.stack([
cls_scores[0].new_tensor(input_meta['cam2img'])
for input_meta in input_metas
])
batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap(
cls_scores[0],
bbox_preds[0],
input_metas,
cam2imgs=cam2imgs,
topk=100,
kernel=3)
result_list = []
for img_id in range(len(input_metas)):
bboxes = batch_bboxes[img_id]
scores = batch_scores[img_id]
labels = batch_topk_labels[img_id]
keep_idx = scores > 0.25
bboxes = bboxes[keep_idx]
scores = scores[keep_idx]
labels = labels[keep_idx]
bboxes = input_metas[img_id]['box_type_3d'](
bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5))
attrs = None
result_list.append((bboxes, scores, labels, attrs))
return result_list
def decode_heatmap(self,
cls_score,
reg_pred,
input_metas,
cam2imgs,
topk=100,
kernel=3):
"""Transform outputs into detections raw bbox predictions.
Args:
            cls_score (Tensor): Center predicted heatmap,
shape (B, num_classes, H, W).
reg_pred (Tensor): Box regression map.
shape (B, channel, H , W).
input_metas (List[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cam2imgs (Tensor): Camera intrinsic matrix.
shape (N, 4, 4)
topk (int, optional): Get top k center keypoints from heatmap.
Default 100.
kernel (int, optional): Max pooling kernel for extract local
maximum pixels. Default 3.
Returns:
            tuple[torch.Tensor]: Decoded output of MonoFlexHead, containing
the following Tensors:
- batch_bboxes (Tensor): Coords of each 3D box.
shape (B, k, 7)
- batch_scores (Tensor): Scores of each 3D box.
shape (B, k)
- batch_topk_labels (Tensor): Categories of each 3D box.
shape (B, k)
"""
img_h, img_w = input_metas[0]['pad_shape'][:2]
batch_size, _, feat_h, feat_w = cls_score.shape
downsample_ratio = img_h / feat_h
center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel)
*batch_dets, topk_ys, topk_xs = get_topk_from_heatmap(
center_heatmap_pred, k=topk)
batch_scores, batch_index, batch_topk_labels = batch_dets
regression = transpose_and_gather_feat(reg_pred, batch_index)
        regression = regression.view(-1, reg_pred.shape[1])
pred_base_centers2d = torch.cat(
[topk_xs.view(-1, 1),
topk_ys.view(-1, 1).float()], dim=1)
preds = self.bbox_coder.decode(regression, batch_topk_labels,
downsample_ratio, cam2imgs)
pred_locations = self.bbox_coder.decode_location(
pred_base_centers2d, preds['offsets2d'], preds['combined_depth'],
cam2imgs, downsample_ratio)
pred_yaws = self.bbox_coder.decode_orientation(
preds['orientations']).unsqueeze(-1)
pred_dims = preds['dimensions']
batch_bboxes = torch.cat((pred_locations, pred_dims, pred_yaws), dim=1)
batch_bboxes = batch_bboxes.view(batch_size, -1, self.bbox_code_size)
return batch_bboxes, batch_scores, batch_topk_labels
def get_predictions(self, pred_reg, labels3d, centers2d, reg_mask,
batch_indices, input_metas, downsample_ratio):
"""Prepare predictions for computing loss.
Args:
pred_reg (Tensor): Box regression map.
shape (B, channel, H , W).
labels3d (Tensor): Labels of each 3D box.
shape (B * max_objs, )
centers2d (Tensor): Coords of each projected 3D box
center on image. shape (N, 2)
reg_mask (Tensor): Indexes of the existence of the 3D box.
shape (B * max_objs, )
            batch_indices (Tensor): Batch indices of the 3D box.
                shape (N, )
input_metas (list[dict]): Meta information of each image,
e.g., image size, scaling factor, etc.
downsample_ratio (int): The stride of feature map.
Returns:
dict: The predictions for computing loss.
"""
batch, channel = pred_reg.shape[0], pred_reg.shape[1]
w = pred_reg.shape[3]
cam2imgs = torch.stack([
centers2d.new_tensor(input_meta['cam2img'])
for input_meta in input_metas
])
# (batch_size, 4, 4) -> (N, 4, 4)
cam2imgs = cam2imgs[batch_indices, :, :]
centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0]
centers2d_inds = centers2d_inds.view(batch, -1)
pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds)
pred_regression_pois = pred_regression.view(-1, channel)[reg_mask]
preds = self.bbox_coder.decode(pred_regression_pois, labels3d,
downsample_ratio, cam2imgs)
return preds
def get_targets(self, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list,
gt_labels_3d_list, centers2d_list, depths_list, feat_shape,
img_shape, input_metas):
"""Get training targets for batch images.
Args:
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each
image, shape (num_gt, 4).
gt_labels_list (list[Tensor]): Ground truth labels of each
box, shape (num_gt,).
gt_bboxes_3d_list (list[:obj:`CameraInstance3DBoxes`]): 3D
Ground truth bboxes of each image,
shape (num_gt, bbox_code_size).
gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of
each box, shape (num_gt,).
centers2d_list (list[Tensor]): Projected 3D centers onto 2D
image, shape (num_gt, 2).
depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
image, each has shape (num_gt, 1).
feat_shape (tuple[int]): Feature map shape with value,
shape (B, _, H, W).
img_shape (tuple[int]): Image shape in [h, w] format.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
tuple[Tensor, dict]: The Tensor value is the targets of
center heatmap, the dict has components below:
- base_centers2d_target (Tensor): Coords of each projected 3D box
center on image. shape (B * max_objs, 2), [dtype: int]
- labels3d (Tensor): Labels of each 3D box.
shape (N, )
- reg_mask (Tensor): Mask of the existence of the 3D box.
shape (B * max_objs, )
- batch_indices (Tensor): Batch id of the 3D box.
shape (N, )
- depth_target (Tensor): Depth target of each 3D box.
shape (N, )
- keypoints2d_target (Tensor): Keypoints of each projected 3D box
on image. shape (N, 10, 2)
- keypoints_mask (Tensor): Keypoints mask of each projected 3D
box on image. shape (N, 10)
- keypoints_depth_mask (Tensor): Depths decoded from keypoints
of each 3D box. shape (N, 3)
- orientations_target (Tensor): Orientation (encoded local yaw)
target of each 3D box. shape (N, )
- offsets2d_target (Tensor): Offsets target of each projected
3D box. shape (N, 2)
- dimensions_target (Tensor): Dimensions target of each 3D box.
shape (N, 3)
- downsample_ratio (int): The stride of feature map.
"""
img_h, img_w = img_shape[:2]
batch_size, _, feat_h, feat_w = feat_shape
width_ratio = float(feat_w / img_w) # 1/4
height_ratio = float(feat_h / img_h) # 1/4
assert width_ratio == height_ratio
# Whether to filter the objects which are not in FOV.
if self.filter_outside_objs:
filter_outside_objs(gt_bboxes_list, gt_labels_list,
gt_bboxes_3d_list, gt_labels_3d_list,
centers2d_list, input_metas)
# transform centers2d to base centers2d for regression and
# heatmap generation.
# centers2d = int(base_centers2d) + offsets2d
base_centers2d_list, offsets2d_list, trunc_mask_list = \
handle_proj_objs(centers2d_list, gt_bboxes_list, input_metas)
keypoints2d_list, keypoints_mask_list, keypoints_depth_mask_list = \
get_keypoints(gt_bboxes_3d_list, centers2d_list, input_metas)
center_heatmap_target = gt_bboxes_list[-1].new_zeros(
[batch_size, self.num_classes, feat_h, feat_w])
for batch_id in range(batch_size):
# project gt_bboxes from input image to feat map
gt_bboxes = gt_bboxes_list[batch_id] * width_ratio
gt_labels = gt_labels_list[batch_id]
# project base centers2d from input image to feat map
gt_base_centers2d = base_centers2d_list[batch_id] * width_ratio
trunc_masks = trunc_mask_list[batch_id]
for j, base_center2d in enumerate(gt_base_centers2d):
if trunc_masks[j]:
# for outside objects, generate ellipse heatmap
base_center2d_x_int, base_center2d_y_int = \
base_center2d.int()
scale_box_w = min(base_center2d_x_int - gt_bboxes[j][0],
gt_bboxes[j][2] - base_center2d_x_int)
scale_box_h = min(base_center2d_y_int - gt_bboxes[j][1],
gt_bboxes[j][3] - base_center2d_y_int)
radius_x = scale_box_w * self.edge_heatmap_ratio
radius_y = scale_box_h * self.edge_heatmap_ratio
radius_x, radius_y = max(0, int(radius_x)), max(
0, int(radius_y))
assert min(radius_x, radius_y) == 0
ind = gt_labels[j]
get_ellip_gaussian_2D(
center_heatmap_target[batch_id, ind],
[base_center2d_x_int, base_center2d_y_int], radius_x,
radius_y)
else:
base_center2d_x_int, base_center2d_y_int = \
base_center2d.int()
scale_box_h = (gt_bboxes[j][3] - gt_bboxes[j][1])
scale_box_w = (gt_bboxes[j][2] - gt_bboxes[j][0])
radius = gaussian_radius([scale_box_h, scale_box_w],
min_overlap=0.7)
radius = max(0, int(radius))
ind = gt_labels[j]
gen_gaussian_target(
center_heatmap_target[batch_id, ind],
[base_center2d_x_int, base_center2d_y_int], radius)
avg_factor = max(1, center_heatmap_target.eq(1).sum())
num_ctrs = [centers2d.shape[0] for centers2d in centers2d_list]
max_objs = max(num_ctrs)
batch_indices = [
centers2d_list[0].new_full((num_ctrs[i], ), i)
for i in range(batch_size)
]
batch_indices = torch.cat(batch_indices, dim=0)
reg_mask = torch.zeros(
(batch_size, max_objs),
dtype=torch.bool).to(base_centers2d_list[0].device)
        gt_bboxes_3d = input_metas[0]['box_type_3d'].cat(gt_bboxes_3d_list)
gt_bboxes_3d = gt_bboxes_3d.to(base_centers2d_list[0].device)
# encode original local yaw to multibin format
        orientations_target = self.bbox_coder.encode(gt_bboxes_3d)
batch_base_centers2d = base_centers2d_list[0].new_zeros(
(batch_size, max_objs, 2))
for i in range(batch_size):
reg_mask[i, :num_ctrs[i]] = 1
batch_base_centers2d[i, :num_ctrs[i]] = base_centers2d_list[i]
flatten_reg_mask = reg_mask.flatten()
# transform base centers2d from input scale to output scale
batch_base_centers2d = batch_base_centers2d.view(-1, 2) * width_ratio
dimensions_target = gt_bboxes_3d.tensor[:, 3:6]
labels_3d = torch.cat(gt_labels_3d_list)
keypoints2d_target = torch.cat(keypoints2d_list)
keypoints_mask = torch.cat(keypoints_mask_list)
keypoints_depth_mask = torch.cat(keypoints_depth_mask_list)
offsets2d_target = torch.cat(offsets2d_list)
bboxes2d = torch.cat(gt_bboxes_list)
# transform FCOS style bbox into [x1, y1, x2, y2] format.
bboxes2d_target = torch.cat([bboxes2d[:, 0:2] * -1, bboxes2d[:, 2:]],
dim=-1)
depths = torch.cat(depths_list)
target_labels = dict(
base_centers2d_target=batch_base_centers2d.int(),
labels3d=labels_3d,
reg_mask=flatten_reg_mask,
batch_indices=batch_indices,
bboxes2d_target=bboxes2d_target,
depth_target=depths,
keypoints2d_target=keypoints2d_target,
keypoints_mask=keypoints_mask,
keypoints_depth_mask=keypoints_depth_mask,
            orientations_target=orientations_target,
offsets2d_target=offsets2d_target,
dimensions_target=dimensions_target,
downsample_ratio=1 / width_ratio)
return center_heatmap_target, avg_factor, target_labels
def loss(self,
cls_scores,
bbox_preds,
gt_bboxes,
gt_labels,
gt_bboxes_3d,
gt_labels_3d,
centers2d,
depths,
attr_labels,
input_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level.
shape (num_gt, 4).
bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel
number is bbox_code_size.
shape (B, 7, H, W).
gt_bboxes (list[Tensor]): Ground truth bboxes for each image.
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): Class indices corresponding to each box.
shape (num_gts, ).
gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D boxes ground
truth. it is the flipped gt_bboxes
gt_labels_3d (list[Tensor]): Same as gt_labels.
centers2d (list[Tensor]): 2D centers on the image.
shape (num_gts, 2).
depths (list[Tensor]): Depth ground truth.
shape (num_gts, ).
attr_labels (list[Tensor]): Attributes indices of each box.
In kitti it's None.
input_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
boxes can be ignored when computing the loss.
Default: None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert len(cls_scores) == len(bbox_preds) == 1
assert attr_labels is None
assert gt_bboxes_ignore is None
center2d_heatmap = cls_scores[0]
pred_reg = bbox_preds[0]
center2d_heatmap_target, avg_factor, target_labels = \
self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d,
gt_labels_3d, centers2d, depths,
center2d_heatmap.shape,
input_metas[0]['pad_shape'],
input_metas)
preds = self.get_predictions(
pred_reg=pred_reg,
labels3d=target_labels['labels3d'],
centers2d=target_labels['base_centers2d_target'],
reg_mask=target_labels['reg_mask'],
batch_indices=target_labels['batch_indices'],
input_metas=input_metas,
downsample_ratio=target_labels['downsample_ratio'])
# heatmap loss
loss_cls = self.loss_cls(
center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor)
# bbox2d regression loss
loss_bbox = self.loss_bbox(preds['bboxes2d'],
target_labels['bboxes2d_target'])
# keypoints loss, the keypoints in predictions and target are all
# local coordinates. Check the mask dtype should be bool, not int
# or float to ensure the indexing is bool index
        keypoints2d_mask = target_labels['keypoints_mask']
loss_keypoints = self.loss_keypoints(
preds['keypoints2d'][keypoints2d_mask],
target_labels['keypoints2d_target'][keypoints2d_mask])
# orientations loss
loss_dir = self.loss_dir(preds['orientations'],
target_labels['orientations_target'])
# dimensions loss
loss_dims = self.loss_dims(preds['dimensions'],
target_labels['dimensions_target'])
# offsets for center heatmap
loss_offsets2d = self.loss_offsets2d(preds['offsets2d'],
target_labels['offsets2d_target'])
# directly regressed depth loss with direct depth uncertainty loss
direct_depth_weights = torch.exp(-preds['direct_depth_uncertainty'])
loss_weight_1 = self.loss_direct_depth.loss_weight
loss_direct_depth = self.loss_direct_depth(
preds['direct_depth'], target_labels['depth_target'],
direct_depth_weights)
loss_uncertainty_1 =\
preds['direct_depth_uncertainty'] * loss_weight_1
loss_direct_depth = loss_direct_depth + loss_uncertainty_1.mean()
# keypoints decoded depth loss with keypoints depth uncertainty loss
depth_mask = target_labels['keypoints_depth_mask']
depth_target = target_labels['depth_target'].unsqueeze(-1).repeat(1, 3)
valid_keypoints_depth_uncertainty = preds[
'keypoints_depth_uncertainty'][depth_mask]
valid_keypoints_depth_weights = torch.exp(
-valid_keypoints_depth_uncertainty)
        loss_keypoints_depth = self.loss_keypoints_depth(
preds['keypoints_depth'][depth_mask], depth_target[depth_mask],
valid_keypoints_depth_weights)
loss_weight_2 = self.loss_keypoints_depth.loss_weight
loss_uncertainty_2 =\
valid_keypoints_depth_uncertainty * loss_weight_2
loss_keypoints_depth = loss_keypoints_depth + loss_uncertainty_2.mean()
        # combined depth loss for optimizing the uncertainty
loss_combined_depth = self.loss_combined_depth(
preds['combined_depth'], target_labels['depth_target'])
loss_dict = dict(
loss_cls=loss_cls,
loss_bbox=loss_bbox,
loss_keypoints=loss_keypoints,
loss_dir=loss_dir,
loss_dims=loss_dims,
loss_offsets2d=loss_offsets2d,
loss_direct_depth=loss_direct_depth,
loss_keypoints_depth=loss_keypoints_depth,
loss_combined_depth=loss_combined_depth)
return loss_dict
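# (Illustrative sketch, not part of this commit's diff: the uncertainty-aware
#  depth term assembled in the loss above. The L1 error is weighted by
#  exp(-sigma) and a penalty proportional to sigma is added, so the network
#  can down-weight hard samples but pays for large predicted uncertainties.
#  Numbers are made up and the averaging is simplified compared with mmdet's
#  L1Loss.)
import torch

pred_depth = torch.tensor([10.0, 30.0])
gt_depth = torch.tensor([12.0, 29.0])
log_sigma = torch.tensor([0.5, -1.0])  # 'direct_depth_uncertainty'
loss_weight = 0.1  # matches the L1Loss default above

weights = torch.exp(-log_sigma)  # passed as the per-sample loss weights
loss_depth = ((pred_depth - gt_depth).abs() * weights).mean() * loss_weight
loss_depth = loss_depth + (log_sigma * loss_weight).mean()
print(loss_depth)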
# Copyright (c) OpenMMLab. All rights reserved.
+from .edge_fusion_module import EdgeFusionModule
from .transformer import GroupFree3DMHA
from .vote_module import VoteModule

-__all__ = ['VoteModule', 'GroupFree3DMHA']
+__all__ = ['VoteModule', 'GroupFree3DMHA', 'EdgeFusionModule']
from mmcv.cnn import ConvModule
from mmcv.runner import BaseModule
from torch import nn as nn
from torch.nn import functional as F
class EdgeFusionModule(BaseModule):
"""Edge Fusion Module for feature map.
Args:
out_channels (int): The number of output channels.
feat_channels (int): The number of channels in feature map
during edge feature fusion.
kernel_size (int, optional): Kernel size of convolution.
Default: 3.
act_cfg (dict, optional): Config of activation.
Default: dict(type='ReLU').
norm_cfg (dict, optional): Config of normalization.
Default: dict(type='BN1d')).
"""
def __init__(self,
out_channels,
feat_channels,
kernel_size=3,
act_cfg=dict(type='ReLU'),
norm_cfg=dict(type='BN1d')):
super().__init__()
self.edge_convs = nn.Sequential(
ConvModule(
feat_channels,
feat_channels,
kernel_size=kernel_size,
padding=kernel_size // 2,
conv_cfg=dict(type='Conv1d'),
norm_cfg=norm_cfg,
act_cfg=act_cfg),
nn.Conv1d(feat_channels, out_channels, kernel_size=1))
self.feat_channels = feat_channels
def forward(self, features, fused_features, edge_indices, edge_lens,
output_h, output_w):
"""Forward pass.
Args:
features (torch.Tensor): Different representative features
for fusion.
fused_features (torch.Tensor): Different representative
features to be fused.
edge_indices (torch.Tensor): Batch image edge indices.
edge_lens (list[int]): List of edge length of each image.
output_h (int): Height of output feature map.
output_w (int): Width of output feature map.
Returns:
torch.Tensor: Fused feature maps.
"""
batch_size = features.shape[0]
# normalize
grid_edge_indices = edge_indices.view(batch_size, -1, 1, 2).float()
grid_edge_indices[..., 0] = \
grid_edge_indices[..., 0] / (output_w - 1) * 2 - 1
grid_edge_indices[..., 1] = \
grid_edge_indices[..., 1] / (output_h - 1) * 2 - 1
# apply edge fusion
edge_features = F.grid_sample(
features, grid_edge_indices, align_corners=True).squeeze(-1)
edge_output = self.edge_convs(edge_features)
for k in range(batch_size):
edge_indice_k = edge_indices[k, :edge_lens[k]]
fused_features[k, :, edge_indice_k[:, 1],
edge_indice_k[:, 0]] += edge_output[
k, :, :edge_lens[k]]
return fused_features
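# (Illustrative sketch, not part of this commit's diff: exercising
#  EdgeFusionModule standalone. Edge pixels of the feature map are sampled
#  with grid_sample, pushed through the 1D conv branch, and added back onto
#  the fused map at the same locations. Shapes mirror the head above; the
#  concrete numbers and the two edge pixels per image are made up.)
import torch
from mmdet3d.models.model_utils import EdgeFusionModule

batch, feat_channels, out_channels, h, w = 2, 64, 3, 32, 32
features = torch.rand(batch, feat_channels, h, w)
fused = torch.rand(batch, out_channels, h, w)  # e.g. the cls logits
edge_indices = torch.tensor([[[0, 0], [1, 0]],
                             [[0, 0], [0, 1]]], dtype=torch.long)  # (B, K, 2) as (x, y)
edge_lens = [2, 2]

module = EdgeFusionModule(out_channels, feat_channels)
out = module(features, fused, edge_indices, edge_lens, h, w)
print(out.shape)  # torch.Size([2, 3, 32, 32])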
@@ -4,6 +4,7 @@ import torch

def get_edge_indices(img_metas,
+                     downsample_ratio,
                     step=1,
                     pad_mode='default',
                     dtype=np.float32,
@@ -17,6 +18,7 @@ def get_edge_indices(img_metas,
    Args:
        img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
+        downsample_ratio (int): Downsample ratio of output feature,
        step (int, optional): Step size used for generateing
            edge indices. Default: 1.
        pad_mode (str, optional): Padding mode during data pipeline.
@@ -32,13 +34,21 @@ def get_edge_indices(img_metas,
    edge_indices_list = []
    for i in range(len(img_metas)):
        img_shape = img_metas[i]['img_shape']
+        pad_shape = img_metas[i]['pad_shape']
        h, w = img_shape[:2]
+        pad_h, pad_w = pad_shape
        edge_indices = []
        if pad_mode == 'default':
            x_min = 0
            y_min = 0
-            x_max, y_max = w - 1, h - 1
+            x_max = (w - 1) // downsample_ratio
+            y_max = (h - 1) // downsample_ratio
+        elif pad_mode == 'center':
+            x_min = np.ceil((pad_w - w) / 2 * downsample_ratio)
+            y_min = np.ceil((pad_h - h) / 2 * downsample_ratio)
+            x_max = x_min + w // downsample_ratio
+            y_max = y_min + h // downsample_ratio
        else:
            raise NotImplementedError
    ...
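# (Illustrative check, not part of this commit's diff: with pad_mode='default'
#  the new downsample_ratio handling traces the feature-map rectangle from
#  (0, 0) to (x_max, y_max), so the number of edge indices is roughly the
#  perimeter. The two counts reproduce the expectations in the updated unit
#  test below.)
downsample_ratio = 4
for h, w in [(110, 110), (98, 110)]:
    x_max = (w - 1) // downsample_ratio
    y_max = (h - 1) // downsample_ratio
    n_edge = 2 * (x_max + 1) + 2 * (y_max + 1) - 4  # corners counted once
    print(n_edge)  # 108, then 102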
@@ -1505,3 +1505,62 @@ def test_pgd_head():
    assert results[0][2].shape == torch.Size([20])
    assert results[0][3] is None
    assert results[0][4].shape == torch.Size([20, 5])
def test_monoflex_head():
head_cfg = dict(
type='MonoFlexHead',
num_classes=3,
in_channels=64,
use_edge_fusion=True,
edge_fusion_inds=[(1, 0)],
edge_heatmap_ratio=1 / 8,
stacked_convs=0,
feat_channels=64,
use_direction_classifier=False,
diff_rad_by_sin=False,
pred_attrs=False,
pred_velo=False,
dir_offset=0,
strides=None,
group_reg_dims=((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ),
(1, )),
cls_branch=(256, ),
reg_branch=((256, ), (256, ), (256, ), (256, ), (256, ), (256, ),
(256, ), (256, )),
num_attrs=0,
bbox_code_size=7,
dir_branch=(),
attr_branch=(),
bbox_coder=dict(
type='MonoFlexCoder',
depth_mode='exp',
base_depth=(26.494627, 16.05988),
depth_range=[0.1, 100],
combine_depth=True,
uncertainty_range=[-10, 10],
base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367, 0.1022),
(0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427),
(1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)),
dims_mode='linear',
multibin=True,
num_dir_bins=4,
bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2],
bin_margin=np.pi / 6,
code_size=7),
conv_bias=True,
dcn_on_last_conv=False)
self = build_head(head_cfg)
feats = [torch.rand([2, 64, 32, 32], dtype=torch.float32)]
input_metas = [
dict(img_shape=(110, 110), pad_shape=(128, 128)),
dict(img_shape=(98, 110), pad_shape=(128, 128))
]
cls_score, out_reg = self(feats, input_metas)
assert cls_score[0].shape == torch.Size([2, 3, 32, 32])
assert out_reg[0].shape == torch.Size([2, 50, 32, 32])
# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
import torch
from mmcv.cnn import Scale
from torch import nn as nn
@@ -596,3 +597,69 @@ def test_smoke_bbox_coder():
    locations = torch.tensor([[15., 2., 1.], [15., 2., -1.]])
    orientations = bbox_coder._decode_orientation(ori_vector, locations)
    assert orientations.shape == torch.Size([2, 1])
def test_monoflex_bbox_coder():
bbox_coder_cfg = dict(
type='MonoFlexCoder',
depth_mode='exp',
base_depth=(26.494627, 16.05988),
depth_range=[0.1, 100],
combine_depth=True,
uncertainty_range=[-10, 10],
base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367,
0.1022), (0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427),
(1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)),
dims_mode='linear',
multibin=True,
num_dir_bins=4,
bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2],
bin_margin=np.pi / 6,
code_size=7)
bbox_coder = build_bbox_coder(bbox_coder_cfg)
gt_bboxes_3d = CameraInstance3DBoxes(torch.rand([6, 7]))
orientation_target = bbox_coder.encode(gt_bboxes_3d)
assert orientation_target.shape == torch.Size([6, 8])
regression = torch.rand([100, 50])
base_centers2d = torch.rand([100, 2])
labels = torch.ones([100])
downsample_ratio = 4
cam2imgs = torch.rand([100, 4, 4])
preds = bbox_coder.decode(regression, base_centers2d, labels,
downsample_ratio, cam2imgs)
assert preds['bboxes2d'].shape == torch.Size([100, 4])
assert preds['dimensions'].shape == torch.Size([100, 3])
assert preds['offsets2d'].shape == torch.Size([100, 2])
assert preds['keypoints2d'].shape == torch.Size([100, 10, 2])
assert preds['orientations'].shape == torch.Size([100, 16])
assert preds['direct_depth'].shape == torch.Size([
100,
])
assert preds['keypoints_depth'].shape == torch.Size([100, 3])
assert preds['combined_depth'].shape == torch.Size([
100,
])
assert preds['direct_depth_uncertainty'].shape == torch.Size([
100,
])
assert preds['keypoints_depth_uncertainty'].shape == torch.Size([100, 3])
offsets_2d = torch.randn([100, 2])
depths = torch.randn([
100,
])
locations = bbox_coder.decode_location(base_centers2d, offsets_2d, depths,
cam2imgs, downsample_ratio)
assert locations.shape == torch.Size([100, 3])
orientations = torch.randn([100, 16])
yaws, local_yaws = bbox_coder.decode_orientation(orientations, locations)
assert yaws.shape == torch.Size([
100,
])
assert local_yaws.shape == torch.Size([
100,
])
@@ -195,11 +195,15 @@ def test_points_img2cam():

def test_generate_edge_indices():
-    img_metas = [dict(img_shape=[300, 400]), dict(img_shape=[500, 450])]
-    edge_indices_list = get_edge_indices(img_metas)
+    input_metas = [
+        dict(img_shape=(110, 110), pad_shape=(128, 128)),
+        dict(img_shape=(98, 110), pad_shape=(128, 128))
+    ]
+    downsample_ratio = 4
+    edge_indices_list = get_edge_indices(input_metas, downsample_ratio)

-    assert edge_indices_list[0].shape[0] == 1396
-    assert edge_indices_list[1].shape[0] == 1896
+    assert edge_indices_list[0].shape[0] == 108
+    assert edge_indices_list[1].shape[0] == 102

def test_truncation_hanlde():
    ...