"vscode:/vscode.git/clone" did not exist on "fd1c54abf2c3c0ea0ad8f16fc278ae62371154f3"
Commit 0a38f8c8 authored by Zhicheng Yan, committed by Facebook GitHub Bot

clamp reference point max to 1.0 to avoid NaN in regressed bbox

Summary:
When training DF-DETR with a Swin-Transformer backbone, which uses a large size_divisibility of 224 (= 32 * 7) and therefore tends to introduce more zero-padding, we find that the regressed boxes can contain NaN values and fail the assertion here (https://fburl.com/code/p27ztcce).

This issue appears to stem from two causes; we address both.
- Fix 1. In the DF-DETR encoder, the reference points prepared by `get_reference_points()` can contain normalized x, y coordinates larger than 1, due to rounding during mask interpolation across feature scales (specific examples available upon request). We therefore clamp the x, y coordinates to a maximum of 1.0; see the sketch after this list.

- Fix 2. The MLP used in the bbox_embed heads contains 3 FC layers, which might be more than necessary. We introduce an argument `BBOX_EMBED_NUM_LAYERS` so users can configure the number of FC layers. This change is backward-compatible.
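
A minimal sketch of Fix 1 (toy values, not from a real run): multiplying normalized reference points by per-level valid ratios can leave a coordinate slightly above 1.0 once rounding has crept in, and the clamp restores the [0, 1] invariant.

```python
import torch

# Toy shapes: N=1 image, K=3 query locations, num_levels=2 feature levels.
# Suppose rounding in the coarsest level's interpolated mask made the
# normalization slightly too aggressive, leaving x = 1.01 for one point.
reference_points = torch.tensor([[[0.50, 0.50],
                                  [0.99, 0.75],
                                  [1.01, 1.00]]])       # (N, K, 2)
valid_ratios = torch.ones(1, 2, 2)                      # (N, num_levels, 2)

# (N, K, 1, 2) * (N, 1, num_levels, 2) -> (N, K, num_levels, 2)
ref = reference_points[:, :, None] * valid_ratios[:, None]
assert ref.max() > 1.0          # out-of-range point that later yields NaN boxes
ref = ref.clamp(max=1.0)        # Fix 1: clamp x, y to at most 1.0
assert ref.max() <= 1.0
```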

Reviewed By: zhanghang1989

Differential Revision: D30661167

fbshipit-source-id: c7e94983bf1ec07426fdf1b9d363e5163637f21a
parent ecbe3e02
@@ -40,6 +40,7 @@ def add_detr_config(cfg):
cfg.MODEL.DETR.DIM_FEEDFORWARD = 2048
cfg.MODEL.DETR.ENC_LAYERS = 6
cfg.MODEL.DETR.DEC_LAYERS = 6
cfg.MODEL.DETR.BBOX_EMBED_NUM_LAYERS = 3
cfg.MODEL.DETR.PRE_NORM = False
cfg.MODEL.DETR.HIDDEN_DIM = 256
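
A hypothetical usage sketch of the new config key (the `get_cfg` import is detectron2's standard entry point; the rest mirrors the function patched above):

```python
from detectron2.config import get_cfg

cfg = get_cfg()
add_detr_config(cfg)                       # registers the MODEL.DETR.* keys above
cfg.MODEL.DETR.BBOX_EMBED_NUM_LAYERS = 2   # e.g., a shallower bbox head; default 3
```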
@@ -54,8 +54,11 @@ class MSDeformAttnFunction(Function):
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
# for debug and test only,
# need to use cuda version instead
# value shape (N, K, num_heads, channels_per_head)
N_, S_, M_, D_ = value.shape
# sampling_locations shape (N, len_q, num_heads, num_levels, num_points, 2)
_, Lq_, M_, L_, P_, _ = sampling_locations.shape
# a list of num_level tensors. Each has shape (N, H_l*W_l, num_heads, channels_per_head)
value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
sampling_grids = 2 * sampling_locations - 1
sampling_value_list = []
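
For context, the remainder of this function in the upstream Deformable-DETR reference implementation (reproduced as a sketch, with the shape comments continued in the same convention) samples each level with `F.grid_sample` and reduces with the attention weights:

```python
import torch
import torch.nn.functional as F

for lid_, (H_, W_) in enumerate(value_spatial_shapes):
    # (N, H_l*W_l, num_heads, channels_per_head) -> (N*num_heads, channels_per_head, H_l, W_l)
    value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_ * M_, D_, H_, W_)
    # (N, len_q, num_heads, num_points, 2) -> (N*num_heads, len_q, num_points, 2)
    sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
    # bilinear sampling -> (N*num_heads, channels_per_head, len_q, num_points)
    sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, mode="bilinear",
                                      padding_mode="zeros", align_corners=False)
    sampling_value_list.append(sampling_value_l_)
# (N, len_q, num_heads, num_levels, num_points) -> (N*num_heads, 1, len_q, num_levels*num_points)
attention_weights = attention_weights.transpose(1, 2).reshape(N_ * M_, 1, Lq_, L_ * P_)
output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1)
return output.view(N_, M_ * D_, Lq_).transpose(1, 2).contiguous()
```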
@@ -128,7 +128,10 @@ class Joiner(nn.Sequential):
for x in out:
pos.append(self[1](x).to(x.tensors.dtype))
# shape a list of tensors, each tensor shape (B, C, H, W)
# out: a list of NestedTensor
# each tensor has shape (B, C, H, W)
# each mask has shape (B, H, W)
# pos: a list of tensors, each has shape (B, C, H, W)
return out, pos
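
A hypothetical consumer of these outputs (`backbone_with_pos` and `images` are assumed names), just to make the documented shapes concrete:

```python
features, pos_embeds = backbone_with_pos(images)   # Joiner on a NestedTensor batch
for feat, pos in zip(features, pos_embeds):
    B, C, H, W = feat.tensors.shape                # (B, C, H, W)
    assert feat.mask.shape == (B, H, W)            # padding mask per level
    assert pos.shape == (B, C, H, W)               # matching position embedding
```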
@@ -56,6 +56,7 @@ class DeformableDETR(nn.Module):
aux_loss=True,
with_box_refine=False,
two_stage=False,
bbox_embed_num_layers=3,
):
"""Initializes the model.
Parameters:
@@ -67,6 +68,7 @@ class DeformableDETR(nn.Module):
aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
with_box_refine: iterative bounding box refinement
two_stage: two-stage Deformable DETR
bbox_embed_num_layers: number of FC layers in bbox_embed MLP
"""
super().__init__()
self.num_queries = num_queries
@@ -74,7 +76,7 @@ class DeformableDETR(nn.Module):
hidden_dim = transformer.d_model
# We will use sigmoid activation and focal loss
self.class_embed = nn.Linear(hidden_dim, num_classes)
self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, bbox_embed_num_layers)
self.num_feature_levels = num_feature_levels
if not two_stage:
self.query_embed = nn.Embedding(num_queries, hidden_dim * 2)
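
For reference, the `MLP` being parameterized here is the simple DETR-style stack of FC layers with ReLU in between; a sketch:

```python
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    """num_layers Linear layers with ReLU between them (DETR-style)."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x
```

With `bbox_embed_num_layers=2`, this yields Linear -> ReLU -> Linear, one FC layer fewer than the default of 3.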
@@ -147,7 +149,7 @@ class DeformableDETR(nn.Module):
for box_embed in self.bbox_embed:
nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)
self.transformer.encoder.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
self.transformer.encoder.bbox_embed = MLP(hidden_dim, hidden_dim, 4, bbox_embed_num_layers)
def forward(self, samples: NestedTensor):
"""The forward expects a NestedTensor, which consists of:
@@ -6,7 +6,6 @@
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
import copy
import math
@@ -18,7 +17,6 @@ from torch.nn.init import xavier_uniform_, constant_, normal_
from ..modules import MSDeformAttn
from ..util.misc import inverse_sigmoid
# we do not use float("-inf") because it can produce NaN during training
NEG_INF = -10000.0
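
The rationale, demonstrated: when an entire attention row is masked, softmax over all float("-inf") logits returns NaN, while a large negative finite value degrades to uniform weights.

```python
import torch

print(torch.softmax(torch.full((4,), float("-inf")), dim=0))  # tensor([nan, nan, nan, nan])
print(torch.softmax(torch.full((4,), -10000.0), dim=0))       # uniform weights, no NaN
```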
@@ -432,8 +430,9 @@ class DeformableTransformerEncoder(nn.Module):
reference_points = torch.cat(reference_points_list, 1)
# reference_points
# shape (N, K, 1, 2) * (N, 1, num_levels, 2) = (N, K, num_levels, 2)
# value should be <1
# ideally, value should be < 1. In practice, value could be >= 1. Thus, clamp max to 1.0
reference_points = reference_points[:, :, None] * valid_ratios[:, None]
reference_points = reference_points.clamp(max=1.0)
return reference_points
def forward(
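
For context, the reference points entering this hunk come from the per-level meshgrid loop earlier in `get_reference_points()` (upstream Deformable-DETR code, sketched below); the division by `valid_ratio * H` (or `* W`) is exactly where rounding in the interpolated masks can push a normalized coordinate above 1:

```python
# sketch of the loop that fills reference_points_list, one entry per level
for lvl, (H_, W_) in enumerate(spatial_shapes):
    ref_y, ref_x = torch.meshgrid(
        torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
        torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
    # if the interpolated mask rounds the valid extent down, these exceed 1
    ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
    ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
    reference_points_list.append(torch.stack((ref_x, ref_y), -1))
```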
@@ -29,7 +29,9 @@ class PositionEmbeddingSine(nn.Module):
self.centered = centered
def forward(self, tensor_list: NestedTensor):
# x shape (B, C, H, W)
x = tensor_list.tensors
# mask shape (B, H, W)
mask = tensor_list.mask
assert mask is not None
not_mask = ~mask
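
Immediately after this, the upstream implementation converts the inverted padding mask into per-pixel coordinates via cumulative sums (sketched), which is why the (B, H, W) mask shape matters:

```python
y_embed = not_mask.cumsum(1, dtype=torch.float32)   # (B, H, W) row coordinate
x_embed = not_mask.cumsum(2, dtype=torch.float32)   # (B, H, W) column coordinate
```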
@@ -65,7 +65,7 @@ class MSDeformAttn(nn.Module):
constant_(self.sampling_offsets.weight.data, 0.)
# shape (num_heads,)
thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
# shape (2 * num_heads)
# shape (num_heads, 2)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
# shape (num_heads, num_levels, num_points, 2)
grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
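
The upstream initialization then scales the i-th sampling point to radius i + 1 and writes the grid into the offset bias (sketch of the lines that follow):

```python
for i in range(self.n_points):
    grid_init[:, :, i, :] *= i + 1          # spread points over increasing radii
with torch.no_grad():
    self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
```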