"vscode:/vscode.git/clone" did not exist on "fd1c54abf2c3c0ea0ad8f16fc278ae62371154f3"
Commit 0a38f8c8 authored by Zhicheng Yan, committed by Facebook GitHub Bot

clamp reference point max to 1.0 to avoid NaN in regressed bbox

Summary:
When training DF-DETR with a Swin-Transformer backbone, which uses a large size_divisibility of 224 (= 32 * 7) and therefore tends to introduce more zero-padding, we find that the regressed boxes can contain NaN values and fail the assertion here (https://fburl.com/code/p27ztcce).

This issue appears to stem from two causes; we address both.
- Fix 1. In the DF-DETR encoder, the reference points prepared by `get_reference_points()` can contain normalized x, y coordinates larger than 1, due to rounding during mask interpolation across feature scales (specific examples available upon request). We therefore clamp the x, y coordinates to a maximum of 1.0; see the sketch after this list.

- Fix 2. The MLP used in the bbox_embed heads contains 3 FC layers, which might be more than necessary. We introduce an argument `BBOX_EMBED_NUM_LAYERS` so users can configure the number of FC layers. This change is backward-compatible.
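
A minimal sketch of Fix 1 (toy values, not from a real run): multiplying normalized reference points by per-level valid ratios can leave a coordinate slightly above 1.0 once rounding has crept in, and the clamp restores the [0, 1] invariant.

```python
import torch

# Toy shapes: N=1 image, K=3 query locations, num_levels=2 feature levels.
# Suppose rounding in the coarsest level's interpolated mask made the
# normalization slightly too aggressive, leaving x = 1.01 for one point.
reference_points = torch.tensor([[[0.50, 0.50],
                                  [0.99, 0.75],
                                  [1.01, 1.00]]])       # (N, K, 2)
valid_ratios = torch.ones(1, 2, 2)                      # (N, num_levels, 2)

# (N, K, 1, 2) * (N, 1, num_levels, 2) -> (N, K, num_levels, 2)
ref = reference_points[:, :, None] * valid_ratios[:, None]
assert ref.max() > 1.0          # out-of-range point that later yields NaN boxes
ref = ref.clamp(max=1.0)        # Fix 1: clamp x, y to at most 1.0
assert ref.max() <= 1.0
```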

Reviewed By: zhanghang1989

Differential Revision: D30661167

fbshipit-source-id: c7e94983bf1ec07426fdf1b9d363e5163637f21a
parent ecbe3e02
@@ -40,6 +40,7 @@ def add_detr_config(cfg):
cfg.MODEL.DETR.DIM_FEEDFORWARD = 2048
cfg.MODEL.DETR.ENC_LAYERS = 6
cfg.MODEL.DETR.DEC_LAYERS = 6
cfg.MODEL.DETR.BBOX_EMBED_NUM_LAYERS = 3
cfg.MODEL.DETR.PRE_NORM = False
cfg.MODEL.DETR.HIDDEN_DIM = 256
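
A hypothetical usage sketch of the new config key (the `get_cfg` import is detectron2's standard entry point; the rest mirrors the function patched above):

```python
from detectron2.config import get_cfg

cfg = get_cfg()
add_detr_config(cfg)                       # registers the MODEL.DETR.* keys above
cfg.MODEL.DETR.BBOX_EMBED_NUM_LAYERS = 2   # e.g., a shallower bbox head; default 3
```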
@@ -54,8 +54,11 @@ class MSDeformAttnFunction(Function):
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
# for debug and test only,
# need to use cuda version instead
# value shape (N, K, num_heads, channels_per_head)
N_, S_, M_, D_ = value.shape
# sampling_locations shape (N, len_q, num_heads, num_levels, num_points, 2)
_, Lq_, M_, L_, P_, _ = sampling_locations.shape
# a list of num_level tensors. Each has shape (N, H_l*W_l, num_heads, channels_per_head)
value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
sampling_grids = 2 * sampling_locations - 1
sampling_value_list = []
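
For context, the remainder of this function in the upstream Deformable-DETR reference implementation (reproduced as a sketch, with the shape comments continued in the same convention) samples each level with `F.grid_sample` and reduces with the attention weights:

```python
import torch
import torch.nn.functional as F

for lid_, (H_, W_) in enumerate(value_spatial_shapes):
    # (N, H_l*W_l, num_heads, channels_per_head) -> (N*num_heads, channels_per_head, H_l, W_l)
    value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_ * M_, D_, H_, W_)
    # (N, len_q, num_heads, num_points, 2) -> (N*num_heads, len_q, num_points, 2)
    sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
    # bilinear sampling -> (N*num_heads, channels_per_head, len_q, num_points)
    sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, mode="bilinear",
                                      padding_mode="zeros", align_corners=False)
    sampling_value_list.append(sampling_value_l_)
# (N, len_q, num_heads, num_levels, num_points) -> (N*num_heads, 1, len_q, num_levels*num_points)
attention_weights = attention_weights.transpose(1, 2).reshape(N_ * M_, 1, Lq_, L_ * P_)
output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1)
return output.view(N_, M_ * D_, Lq_).transpose(1, 2).contiguous()
```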
@@ -128,7 +128,10 @@ class Joiner(nn.Sequential):
for x in out:
pos.append(self[1](x).to(x.tensors.dtype))
# shape a list of tensors, each tensor shape (B, C, H, W)
# out: a list of NestedTensor
# each tensor has shape (B, C, H, W)
# each mask has shape (B, H, W)
# pos: a list of tensors, each has shape (B, C, H, W)
return out, pos
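
A hypothetical consumer of these outputs (`backbone_with_pos` and `images` are assumed names), just to make the documented shapes concrete:

```python
features, pos_embeds = backbone_with_pos(images)   # Joiner on a NestedTensor batch
for feat, pos in zip(features, pos_embeds):
    B, C, H, W = feat.tensors.shape                # (B, C, H, W)
    assert feat.mask.shape == (B, H, W)            # padding mask per level
    assert pos.shape == (B, C, H, W)               # matching position embedding
```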
@@ -56,6 +56,7 @@ class DeformableDETR(nn.Module):
aux_loss=True,
with_box_refine=False,
two_stage=False,
bbox_embed_num_layers=3,
):
"""Initializes the model.
Parameters:
@@ -67,6 +68,7 @@ class DeformableDETR(nn.Module):
aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
with_box_refine: iterative bounding box refinement
two_stage: two-stage Deformable DETR
bbox_embed_num_layers: number of FC layers in bbox_embed MLP
"""
super().__init__()
self.num_queries = num_queries
@@ -74,7 +76,7 @@ class DeformableDETR(nn.Module):
hidden_dim = transformer.d_model
# We will use sigmoid activation and focal loss
self.class_embed = nn.Linear(hidden_dim, num_classes)
self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, bbox_embed_num_layers)
self.num_feature_levels = num_feature_levels
if not two_stage:
self.query_embed = nn.Embedding(num_queries, hidden_dim * 2)
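
For reference, the `MLP` being parameterized here is the simple DETR-style stack of FC layers with ReLU in between; a sketch:

```python
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    """num_layers Linear layers with ReLU between them (DETR-style)."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x
```

With `bbox_embed_num_layers=2`, this yields Linear -> ReLU -> Linear, one FC layer fewer than the default of 3.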
@@ -147,7 +149,7 @@ class DeformableDETR(nn.Module):
for box_embed in self.bbox_embed:
nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)
self.transformer.encoder.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
self.transformer.encoder.bbox_embed = MLP(hidden_dim, hidden_dim, 4, bbox_embed_num_layers)
def forward(self, samples: NestedTensor):
"""The forward expects a NestedTensor, which consists of:
@@ -6,7 +6,6 @@
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
import copy
import math
@@ -18,7 +17,6 @@ from torch.nn.init import xavier_uniform_, constant_, normal_
from ..modules import MSDeformAttn
from ..util.misc import inverse_sigmoid
# we do not use float("-inf") because it can produce NaN during training
NEG_INF = -10000.0
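
The rationale, demonstrated: when an entire attention row is masked, softmax over all float("-inf") logits returns NaN, while a large negative finite value degrades to uniform weights.

```python
import torch

print(torch.softmax(torch.full((4,), float("-inf")), dim=0))  # tensor([nan, nan, nan, nan])
print(torch.softmax(torch.full((4,), -10000.0), dim=0))       # uniform weights, no NaN
```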
@@ -432,8 +430,9 @@ class DeformableTransformerEncoder(nn.Module):
reference_points = torch.cat(reference_points_list, 1)
# reference_points
# shape (N, K, 1, 2) * (N, 1, num_levels, 2) = (N, K, num_levels, 2)
# value should be <1
# ideally, value should be < 1. In practice, value could be >= 1. Thus, clamp max to 1.0
reference_points = reference_points[:, :, None] * valid_ratios[:, None]
reference_points = reference_points.clamp(max=1.0)
return reference_points
def forward(
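
For context, the reference points entering this hunk come from the per-level meshgrid loop earlier in `get_reference_points()` (upstream Deformable-DETR code, sketched below); the division by `valid_ratio * H` (or `* W`) is exactly where rounding in the interpolated masks can push a normalized coordinate above 1:

```python
# sketch of the loop that fills reference_points_list, one entry per level
for lvl, (H_, W_) in enumerate(spatial_shapes):
    ref_y, ref_x = torch.meshgrid(
        torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
        torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
    # if the interpolated mask rounds the valid extent down, these exceed 1
    ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
    ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
    reference_points_list.append(torch.stack((ref_x, ref_y), -1))
```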
@@ -29,7 +29,9 @@ class PositionEmbeddingSine(nn.Module):
self.centered = centered
def forward(self, tensor_list: NestedTensor):
# x shape (B, C, H, W)
x = tensor_list.tensors
# mask shape (B, H, W)
mask = tensor_list.mask
assert mask is not None
not_mask = ~mask
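
Immediately after this, the upstream implementation converts the inverted padding mask into per-pixel coordinates via cumulative sums (sketched), which is why the (B, H, W) mask shape matters:

```python
y_embed = not_mask.cumsum(1, dtype=torch.float32)   # (B, H, W) row coordinate
x_embed = not_mask.cumsum(2, dtype=torch.float32)   # (B, H, W) column coordinate
```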
@@ -65,7 +65,7 @@ class MSDeformAttn(nn.Module):
constant_(self.sampling_offsets.weight.data, 0.)
# shape (num_heads,)
thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
# shape (2 * num_heads)
# shape (num_heads, 2)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
# shape (num_heads, num_levels, num_points, 2)
grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
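
The upstream initialization then scales the i-th sampling point to radius i + 1 and writes the grid into the offset bias (sketch of the lines that follow):

```python
for i in range(self.n_points):
    grid_init[:, :, i, :] *= i + 1          # spread points over increasing radii
with torch.no_grad():
    self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
```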