Commit 0a38f8c8 authored by Zhicheng Yan's avatar Zhicheng Yan Committed by Facebook GitHub Bot
Browse files

clamp reference point max to 1.0 to avoid NaN in regressed bbox

Summary:
For training DF-DETR with swin-transformer backbone which uses large size_divisibility 224 (=32 * 7) and potentially has more zero-padding, we find the regressed box can contain NaN values and fail the assertion here (https://fburl.com/code/p27ztcce).

This issue might be caused by two potential reasons.
- Fix 1. In the DF-DETR encoder, the reference points prepared by `get_reference_points()` can contain normalized x,y coordinates larger than 1 due to rounding issues during mask interpolation across feature scales (specific examples can be provided upon request). Thus, we clamp the max of the x,y coordinates to 1.0.

- Fix 2. The MLP used in the bbox_embed heads contains 3 FC layers, which might be too many. We introduce an argument `BBOX_EMBED_NUM_LAYERS` to allow users to configure the number of FC layers. This change is backward-compatible.

Reviewed By: zhanghang1989

Differential Revision: D30661167

fbshipit-source-id: c7e94983bf1ec07426fdf1b9d363e5163637f21a
parent ecbe3e02
...@@ -40,6 +40,7 @@ def add_detr_config(cfg): ...@@ -40,6 +40,7 @@ def add_detr_config(cfg):
cfg.MODEL.DETR.DIM_FEEDFORWARD = 2048 cfg.MODEL.DETR.DIM_FEEDFORWARD = 2048
cfg.MODEL.DETR.ENC_LAYERS = 6 cfg.MODEL.DETR.ENC_LAYERS = 6
cfg.MODEL.DETR.DEC_LAYERS = 6 cfg.MODEL.DETR.DEC_LAYERS = 6
cfg.MODEL.DETR.BBOX_EMBED_NUM_LAYERS = 3
cfg.MODEL.DETR.PRE_NORM = False cfg.MODEL.DETR.PRE_NORM = False
cfg.MODEL.DETR.HIDDEN_DIM = 256 cfg.MODEL.DETR.HIDDEN_DIM = 256
......
...@@ -54,8 +54,11 @@ class MSDeformAttnFunction(Function): ...@@ -54,8 +54,11 @@ class MSDeformAttnFunction(Function):
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
# for debug and test only, # for debug and test only,
# need to use cuda version instead # need to use cuda version instead
# value shape (N, K, num_heads, channels_per_head)
N_, S_, M_, D_ = value.shape N_, S_, M_, D_ = value.shape
# sampling_locations shape (N, len_q, num_heads, num_levels, num_points, 2)
_, Lq_, M_, L_, P_, _ = sampling_locations.shape _, Lq_, M_, L_, P_, _ = sampling_locations.shape
# a list of num_level tensors. Each has shape (N, H_l*W_l, num_heads, channels_per_head)
value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
sampling_grids = 2 * sampling_locations - 1 sampling_grids = 2 * sampling_locations - 1
sampling_value_list = [] sampling_value_list = []
......
...@@ -128,7 +128,10 @@ class Joiner(nn.Sequential): ...@@ -128,7 +128,10 @@ class Joiner(nn.Sequential):
for x in out: for x in out:
pos.append(self[1](x).to(x.tensors.dtype)) pos.append(self[1](x).to(x.tensors.dtype))
# shape a list of tensors, each tensor shape (B, C, H, W) # out: a list of NestedTensor
# each tensor has shape (B, C, H, W)
# each mask has shape (B, H, W)
# pos: a list of tensors, each has shape (B, C, H, W)
return out, pos return out, pos
......
...@@ -56,6 +56,7 @@ class DeformableDETR(nn.Module): ...@@ -56,6 +56,7 @@ class DeformableDETR(nn.Module):
aux_loss=True, aux_loss=True,
with_box_refine=False, with_box_refine=False,
two_stage=False, two_stage=False,
bbox_embed_num_layers=3,
): ):
"""Initializes the model. """Initializes the model.
Parameters: Parameters:
...@@ -67,6 +68,7 @@ class DeformableDETR(nn.Module): ...@@ -67,6 +68,7 @@ class DeformableDETR(nn.Module):
aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
with_box_refine: iterative bounding box refinement with_box_refine: iterative bounding box refinement
two_stage: two-stage Deformable DETR two_stage: two-stage Deformable DETR
bbox_embed_num_layers: number of FC layers in bbox_embed MLP
""" """
super().__init__() super().__init__()
self.num_queries = num_queries self.num_queries = num_queries
...@@ -74,7 +76,7 @@ class DeformableDETR(nn.Module): ...@@ -74,7 +76,7 @@ class DeformableDETR(nn.Module):
hidden_dim = transformer.d_model hidden_dim = transformer.d_model
# We will use sigmoid activation and focal loss # We will use sigmoid activation and focal loss
self.class_embed = nn.Linear(hidden_dim, num_classes) self.class_embed = nn.Linear(hidden_dim, num_classes)
self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, bbox_embed_num_layers)
self.num_feature_levels = num_feature_levels self.num_feature_levels = num_feature_levels
if not two_stage: if not two_stage:
self.query_embed = nn.Embedding(num_queries, hidden_dim * 2) self.query_embed = nn.Embedding(num_queries, hidden_dim * 2)
...@@ -147,7 +149,7 @@ class DeformableDETR(nn.Module): ...@@ -147,7 +149,7 @@ class DeformableDETR(nn.Module):
for box_embed in self.bbox_embed: for box_embed in self.bbox_embed:
nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)
self.transformer.encoder.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) self.transformer.encoder.bbox_embed = MLP(hidden_dim, hidden_dim, 4, bbox_embed_num_layers)
def forward(self, samples: NestedTensor): def forward(self, samples: NestedTensor):
"""The forward expects a NestedTensor, which consists of: """The forward expects a NestedTensor, which consists of:
......
...@@ -6,7 +6,6 @@ ...@@ -6,7 +6,6 @@
# Modified from DETR (https://github.com/facebookresearch/detr) # Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------ # ------------------------------------------------------------------------
import copy import copy
import math import math
...@@ -18,7 +17,6 @@ from torch.nn.init import xavier_uniform_, constant_, normal_ ...@@ -18,7 +17,6 @@ from torch.nn.init import xavier_uniform_, constant_, normal_
from ..modules import MSDeformAttn from ..modules import MSDeformAttn
from ..util.misc import inverse_sigmoid from ..util.misc import inverse_sigmoid
# we do not use float("-inf") to avoid potential NaN during training # we do not use float("-inf") to avoid potential NaN during training
NEG_INF = -10000.0 NEG_INF = -10000.0
...@@ -432,8 +430,9 @@ class DeformableTransformerEncoder(nn.Module): ...@@ -432,8 +430,9 @@ class DeformableTransformerEncoder(nn.Module):
reference_points = torch.cat(reference_points_list, 1) reference_points = torch.cat(reference_points_list, 1)
# reference_points # reference_points
# shape (N, K, 1, 2) * (N, 1, num_levels, 2) = (N, K, num_levels, 2) # shape (N, K, 1, 2) * (N, 1, num_levels, 2) = (N, K, num_levels, 2)
    # value should be <1 # ideally, value should be <1. In practice, value could be >= 1. Thus, clamp max to 1.0
reference_points = reference_points[:, :, None] * valid_ratios[:, None] reference_points = reference_points[:, :, None] * valid_ratios[:, None]
reference_points = reference_points.clamp(max=1.0)
return reference_points return reference_points
def forward( def forward(
......
...@@ -29,7 +29,9 @@ class PositionEmbeddingSine(nn.Module): ...@@ -29,7 +29,9 @@ class PositionEmbeddingSine(nn.Module):
self.centered = centered self.centered = centered
def forward(self, tensor_list: NestedTensor): def forward(self, tensor_list: NestedTensor):
# x shape (B, C, H, W)
x = tensor_list.tensors x = tensor_list.tensors
# mask shape (B, H, W)
mask = tensor_list.mask mask = tensor_list.mask
assert mask is not None assert mask is not None
not_mask = ~mask not_mask = ~mask
......
...@@ -65,7 +65,7 @@ class MSDeformAttn(nn.Module): ...@@ -65,7 +65,7 @@ class MSDeformAttn(nn.Module):
constant_(self.sampling_offsets.weight.data, 0.) constant_(self.sampling_offsets.weight.data, 0.)
# shape (num_heads,) # shape (num_heads,)
thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
# shape (2 * num_heads) # shape (num_heads, 2)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
# shape (num_heads, num_levels, num_points, 2) # shape (num_heads, num_levels, num_points, 2)
grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment