Commit 82295dbf authored by Yanghan Wang's avatar Yanghan Wang Committed by Facebook GitHub Bot
Browse files

enable black for mobile-vision

Summary:
https://fb.workplace.com/groups/pythonfoundation/posts/2990917737888352

Remove `mobile-vision` from opt-out list; leaving `mobile-vision/SNPE` opted out because of 3rd-party code.

arc lint --take BLACK --apply-patches --paths-cmd 'hg files mobile-vision'

allow-large-files

Reviewed By: sstsai-adl

Differential Revision: D30721093

fbshipit-source-id: 9e5c16d988b315b93a28038443ecfb92efd18ef8
parent a56c7e15
......@@ -5,11 +5,10 @@
Modules to compute the matching cost and solve the corresponding LSAP.
"""
import torch
from detr.util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
from scipy.optimize import linear_sum_assignment
from torch import nn
from detr.util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
class HungarianMatcher(nn.Module):
"""This class computes an assignment between the targets and the predictions of the network
......@@ -19,7 +18,13 @@ class HungarianMatcher(nn.Module):
while the others are un-matched (and thus treated as non-objects).
"""
def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1, use_focal_loss=False):
def __init__(
self,
cost_class: float = 1,
cost_bbox: float = 1,
cost_giou: float = 1,
use_focal_loss=False,
):
"""Creates the matcher
Params:
......@@ -31,12 +36,14 @@ class HungarianMatcher(nn.Module):
self.cost_class = cost_class
self.cost_bbox = cost_bbox
self.cost_giou = cost_giou
assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"
assert (
cost_class != 0 or cost_bbox != 0 or cost_giou != 0
), "all costs cant be 0"
self.use_focal_loss = use_focal_loss
@torch.no_grad()
def forward(self, outputs, targets):
""" Performs the matching
"""Performs the matching
Params:
outputs: This is a dict that contains at least these entries:
......@@ -61,7 +68,9 @@ class HungarianMatcher(nn.Module):
if self.use_focal_loss:
out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()
else:
out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes]
out_prob = (
outputs["pred_logits"].flatten(0, 1).softmax(-1)
) # [batch_size * num_queries, num_classes]
out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4]
# Also concat the target labels and boxes
......@@ -74,29 +83,57 @@ class HungarianMatcher(nn.Module):
if self.use_focal_loss:
alpha = 0.25
gamma = 2.0
neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
neg_cost_class = (
(1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
)
pos_cost_class = (
alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
)
cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
else:
cost_class = -out_prob[:, tgt_ids] # shape [batch_size * num_queries, \sum_b NUM-BOX_b]
cost_class = -out_prob[
:, tgt_ids
] # shape [batch_size * num_queries, \sum_b NUM-BOX_b]
# Compute the L1 cost between boxes
cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) # shape [batch_size * num_queries,\sum_b NUM-BOX_b]
cost_bbox = torch.cdist(
out_bbox, tgt_bbox, p=1
) # shape [batch_size * num_queries,\sum_b NUM-BOX_b]
# Compute the giou cost betwen boxes
# shape [batch_size * num_queries, \sum_b NUM-BOX_b]
cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
cost_giou = -generalized_box_iou(
box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)
)
# Final cost matrix
C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
C = C.view(bs, num_queries, -1).cpu() # shape [batch_size, num_queries, \sum_b NUM-BOX_b]
C = (
self.cost_bbox * cost_bbox
+ self.cost_class * cost_class
+ self.cost_giou * cost_giou
)
C = C.view(
bs, num_queries, -1
).cpu() # shape [batch_size, num_queries, \sum_b NUM-BOX_b]
sizes = [len(v["boxes"]) for v in targets] # shape [batch_size,]
# each split c shape [batch_size, num_queries, NUM-BOX_b]
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
indices = [
linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))
]
# A list where each item is [row_indices, col_indices]
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
return [
(
torch.as_tensor(i, dtype=torch.int64),
torch.as_tensor(j, dtype=torch.int64),
)
for i, j in indices
]
def build_matcher(args):
return HungarianMatcher(cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou)
return HungarianMatcher(
cost_class=args.set_cost_class,
cost_bbox=args.set_cost_bbox,
cost_giou=args.set_cost_giou,
)
......@@ -5,10 +5,10 @@
Various positional encodings for the transformer.
"""
import math
import torch
from torch import nn
import torch
from detr.util.misc import NestedTensor
from torch import nn
class PositionEmbeddingSine(nn.Module):
......@@ -16,7 +16,15 @@ class PositionEmbeddingSine(nn.Module):
This is a more standard version of the position embedding, very similar to the one
used by the Attention is all you need paper, generalized to work on images.
"""
def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None, centered=False):
def __init__(
self,
num_pos_feats=64,
temperature=10000,
normalize=False,
scale=None,
centered=False,
):
super().__init__()
self.num_pos_feats = num_pos_feats
self.temperature = temperature
......@@ -47,13 +55,25 @@ class PositionEmbeddingSine(nn.Module):
x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) # shape (N, )
dim_t = self.temperature ** (
2 * (dim_t // 2) / self.num_pos_feats
) # shape (N, )
pos_x = x_embed[:, :, :, None] / dim_t # shape (B, H, W, N)
pos_y = y_embed[:, :, :, None] / dim_t
pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) # shape (B, H, W, N)
pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) # shape (B, H, W, N)
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) # shape (B, 2*N, H, W)
pos_x = torch.stack(
(pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
).flatten(
3
) # shape (B, H, W, N)
pos_y = torch.stack(
(pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
).flatten(
3
) # shape (B, H, W, N)
pos = torch.cat((pos_y, pos_x), dim=3).permute(
0, 3, 1, 2
) # shape (B, 2*N, H, W)
return pos
......@@ -61,6 +81,7 @@ class PositionEmbeddingLearned(nn.Module):
"""
Absolute pos embedding, learned.
"""
def __init__(self, num_pos_feats=256):
super().__init__()
self.row_embed = nn.Embedding(50, num_pos_feats)
......@@ -78,19 +99,27 @@ class PositionEmbeddingLearned(nn.Module):
j = torch.arange(h, device=x.device)
x_emb = self.col_embed(i)
y_emb = self.row_embed(j)
pos = torch.cat([
pos = (
torch.cat(
[
x_emb.unsqueeze(0).repeat(h, 1, 1),
y_emb.unsqueeze(1).repeat(1, w, 1),
], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
],
dim=-1,
)
.permute(2, 0, 1)
.unsqueeze(0)
.repeat(x.shape[0], 1, 1, 1)
)
return pos
def build_position_encoding(args):
N_steps = args.hidden_dim // 2
if args.position_embedding in ('v2', 'sine'):
if args.position_embedding in ("v2", "sine"):
# TODO find a better way of exposing other arguments
position_embedding = PositionEmbeddingSine(N_steps, normalize=True)
elif args.position_embedding in ('v3', 'learned'):
elif args.position_embedding in ("v3", "learned"):
position_embedding = PositionEmbeddingLearned(N_steps)
else:
raise ValueError(f"not supported {args.position_embedding}")
......
......@@ -8,14 +8,13 @@ import io
from collections import defaultdict
from typing import List, Optional
import detr.util.box_ops as box_ops
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from PIL import Image
import detr.util.box_ops as box_ops
from detr.util.misc import NestedTensor, interpolate, nested_tensor_from_tensor_list
from PIL import Image
from torch import Tensor
try:
from panopticapi.utils import id2rgb, rgb2id
......@@ -33,8 +32,12 @@ class DETRsegm(nn.Module):
p.requires_grad_(False)
hidden_dim, nheads = detr.transformer.d_model, detr.transformer.nhead
self.bbox_attention = MHAttentionMap(hidden_dim, hidden_dim, nheads, dropout=0.0)
self.mask_head = MaskHeadSmallConv(hidden_dim + nheads, [1024, 512, 256], hidden_dim)
self.bbox_attention = MHAttentionMap(
hidden_dim, hidden_dim, nheads, dropout=0.0
)
self.mask_head = MaskHeadSmallConv(
hidden_dim + nheads, [1024, 512, 256], hidden_dim
)
def forward(self, samples: NestedTensor):
if isinstance(samples, (list, torch.Tensor)):
......@@ -46,19 +49,27 @@ class DETRsegm(nn.Module):
src, mask = features[-1].decompose()
assert mask is not None
src_proj = self.detr.input_proj(src)
hs, memory = self.detr.transformer(src_proj, mask, self.detr.query_embed.weight, pos[-1])
hs, memory = self.detr.transformer(
src_proj, mask, self.detr.query_embed.weight, pos[-1]
)
outputs_class = self.detr.class_embed(hs)
outputs_coord = self.detr.bbox_embed(hs).sigmoid()
out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]}
if self.detr.aux_loss:
out['aux_outputs'] = self.detr._set_aux_loss(outputs_class, outputs_coord)
out["aux_outputs"] = self.detr._set_aux_loss(outputs_class, outputs_coord)
# FIXME h_boxes takes the last one computed, keep this in mind
bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask)
seg_masks = self.mask_head(src_proj, bbox_mask, [features[2].tensors, features[1].tensors, features[0].tensors])
outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1])
seg_masks = self.mask_head(
src_proj,
bbox_mask,
[features[2].tensors, features[1].tensors, features[0].tensors],
)
outputs_seg_masks = seg_masks.view(
bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]
)
out["pred_masks"] = outputs_seg_masks
return out
......@@ -77,7 +88,14 @@ class MaskHeadSmallConv(nn.Module):
def __init__(self, dim, fpn_dims, context_dim):
super().__init__()
inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64]
inter_dims = [
dim,
context_dim // 2,
context_dim // 4,
context_dim // 8,
context_dim // 16,
context_dim // 64,
]
self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1)
self.gn1 = torch.nn.GroupNorm(8, dim)
self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1)
......@@ -159,9 +177,19 @@ class MHAttentionMap(nn.Module):
def forward(self, q, k, mask: Optional[Tensor] = None):
q = self.q_linear(q)
k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias)
qh = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads)
kh = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1])
k = F.conv2d(
k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias
)
qh = q.view(
q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads
)
kh = k.view(
k.shape[0],
self.num_heads,
self.hidden_dim // self.num_heads,
k.shape[-2],
k.shape[-1],
)
weights = torch.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)
if mask is not None:
......@@ -189,7 +217,9 @@ def dice_loss(inputs, targets, num_boxes):
return loss.sum() / num_boxes
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
def sigmoid_focal_loss(
inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2
):
"""
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
Args:
......@@ -227,10 +257,14 @@ class PostProcessSegm(nn.Module):
assert len(orig_target_sizes) == len(max_target_sizes)
max_h, max_w = max_target_sizes.max(0)[0].tolist()
outputs_masks = outputs["pred_masks"].squeeze(2)
outputs_masks = F.interpolate(outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False)
outputs_masks = F.interpolate(
outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False
)
outputs_masks = (outputs_masks.sigmoid() > self.threshold).cpu()
for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)):
for i, (cur_mask, t, tt) in enumerate(
zip(outputs_masks, max_target_sizes, orig_target_sizes)
):
img_h, img_w = t[0], t[1]
results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1)
results[i]["masks"] = F.interpolate(
......@@ -242,7 +276,7 @@ class PostProcessSegm(nn.Module):
class PostProcessPanoptic(nn.Module):
"""This class converts the output of the model to the final panoptic result, in the format expected by the
coco panoptic API """
coco panoptic API"""
def __init__(self, is_thing_map, threshold=0.85):
"""
......@@ -255,8 +289,8 @@ class PostProcessPanoptic(nn.Module):
self.threshold = threshold
self.is_thing_map = is_thing_map
def forward(self, outputs, processed_sizes, target_sizes=None): #noqa: C901
""" This function computes the panoptic prediction from the model's predictions.
def forward(self, outputs, processed_sizes, target_sizes=None): # noqa: C901
"""This function computes the panoptic prediction from the model's predictions.
Parameters:
outputs: This is a dict coming directly from the model. See the model doc for the content.
processed_sizes: This is a list of tuples (or torch tensors) of sizes of the images that were passed to the
......@@ -267,7 +301,11 @@ class PostProcessPanoptic(nn.Module):
if target_sizes is None:
target_sizes = processed_sizes
assert len(processed_sizes) == len(target_sizes)
out_logits, raw_masks, raw_boxes = outputs["pred_logits"], outputs["pred_masks"], outputs["pred_boxes"]
out_logits, raw_masks, raw_boxes = (
outputs["pred_logits"],
outputs["pred_masks"],
outputs["pred_boxes"],
)
assert len(out_logits) == len(raw_masks) == len(target_sizes)
preds = []
......@@ -281,12 +319,16 @@ class PostProcessPanoptic(nn.Module):
):
# we filter empty queries and detection below threshold
scores, labels = cur_logits.softmax(-1).max(-1)
keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & (scores > self.threshold)
keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & (
scores > self.threshold
)
cur_scores, cur_classes = cur_logits.softmax(-1).max(-1)
cur_scores = cur_scores[keep]
cur_classes = cur_classes[keep]
cur_masks = cur_masks[keep]
cur_masks = interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
cur_masks = interpolate(
cur_masks[:, None], to_tuple(size), mode="bilinear"
).squeeze(1)
cur_boxes = box_ops.box_cxcywh_to_xyxy(cur_boxes[keep])
h, w = cur_masks.shape[-2:]
......@@ -322,10 +364,14 @@ class PostProcessPanoptic(nn.Module):
final_h, final_w = to_tuple(target_size)
seg_img = Image.fromarray(id2rgb(m_id.view(h, w).cpu().numpy()))
seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST)
seg_img = seg_img.resize(
size=(final_w, final_h), resample=Image.NEAREST
)
np_seg_img = (
torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())).view(final_h, final_w, 3).numpy()
torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes()))
.view(final_h, final_w, 3)
.numpy()
)
m_id = torch.from_numpy(rgb2id(np_seg_img))
......@@ -339,7 +385,9 @@ class PostProcessPanoptic(nn.Module):
# We know filter empty masks as long as we find some
while True:
filtered_small = torch.as_tensor(
[area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device
[area[i] <= 4 for i, c in enumerate(cur_classes)],
dtype=torch.bool,
device=keep.device,
)
if filtered_small.any().item():
cur_scores = cur_scores[~filtered_small]
......@@ -355,11 +403,21 @@ class PostProcessPanoptic(nn.Module):
segments_info = []
for i, a in enumerate(area):
cat = cur_classes[i].item()
segments_info.append({"id": i, "isthing": self.is_thing_map[cat], "category_id": cat, "area": a})
segments_info.append(
{
"id": i,
"isthing": self.is_thing_map[cat],
"category_id": cat,
"area": a,
}
)
del cur_classes
with io.BytesIO() as out:
seg_img.save(out, format="PNG")
predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
predictions = {
"png_string": out.getvalue(),
"segments_info": segments_info,
}
preds.append(predictions)
return preds
......@@ -18,23 +18,38 @@ from torch import nn, Tensor
class Transformer(nn.Module):
def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
activation="relu", normalize_before=False,
return_intermediate_dec=False):
def __init__(
self,
d_model=512,
nhead=8,
num_encoder_layers=6,
num_decoder_layers=6,
dim_feedforward=2048,
dropout=0.1,
activation="relu",
normalize_before=False,
return_intermediate_dec=False,
):
super().__init__()
encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
dropout, activation, normalize_before)
encoder_layer = TransformerEncoderLayer(
d_model, nhead, dim_feedforward, dropout, activation, normalize_before
)
encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
self.encoder = TransformerEncoder(
encoder_layer, num_encoder_layers, encoder_norm
)
decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
dropout, activation, normalize_before)
decoder_layer = TransformerDecoderLayer(
d_model, nhead, dim_feedforward, dropout, activation, normalize_before
)
decoder_norm = nn.LayerNorm(d_model)
self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
return_intermediate=return_intermediate_dec)
self.decoder = TransformerDecoder(
decoder_layer,
num_decoder_layers,
decoder_norm,
return_intermediate=return_intermediate_dec,
)
self._reset_parameters()
......@@ -63,30 +78,41 @@ class Transformer(nn.Module):
# memory shape (L, B, C)
memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
# hs shape (NUM_LEVEL, S, B, C)
hs = self.decoder(tgt, memory, memory_key_padding_mask=mask,
pos=pos_embed, query_pos=query_embed)
hs = self.decoder(
tgt,
memory,
memory_key_padding_mask=mask,
pos=pos_embed,
query_pos=query_embed,
)
# return shape (NUM_LEVEL, B, S, C) and (B, C, H, W)
return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)
class TransformerEncoder(nn.Module):
def __init__(self, encoder_layer, num_layers, norm=None):
super().__init__()
self.layers = _get_clones(encoder_layer, num_layers)
self.num_layers = num_layers
self.norm = norm
def forward(self, src,
def forward(
self,
src,
mask: Optional[Tensor] = None,
src_key_padding_mask: Optional[Tensor] = None,
pos: Optional[Tensor] = None):
pos: Optional[Tensor] = None,
):
output = src
# mask, shape (L, L)
# src_key_padding_mask, shape (B, L)
for layer in self.layers:
output = layer(output, src_mask=mask,
src_key_padding_mask=src_key_padding_mask, pos=pos)
output = layer(
output,
src_mask=mask,
src_key_padding_mask=src_key_padding_mask,
pos=pos,
)
if self.norm is not None:
output = self.norm(output)
......@@ -95,7 +121,6 @@ class TransformerEncoder(nn.Module):
class TransformerDecoder(nn.Module):
def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
super().__init__()
self.layers = _get_clones(decoder_layer, num_layers)
......@@ -103,13 +128,17 @@ class TransformerDecoder(nn.Module):
self.norm = norm
self.return_intermediate = return_intermediate
def forward(self, tgt, memory,
def forward(
self,
tgt,
memory,
tgt_mask: Optional[Tensor] = None,
memory_mask: Optional[Tensor] = None,
tgt_key_padding_mask: Optional[Tensor] = None,
memory_key_padding_mask: Optional[Tensor] = None,
pos: Optional[Tensor] = None,
query_pos: Optional[Tensor] = None):
query_pos: Optional[Tensor] = None,
):
output = tgt
intermediate = []
......@@ -119,11 +148,16 @@ class TransformerDecoder(nn.Module):
# memory_mask shape (L, S)
# memory_key_padding_mask shape (B, S)
for layer in self.layers:
output = layer(output, memory, tgt_mask=tgt_mask,
output = layer(
output,
memory,
tgt_mask=tgt_mask,
memory_mask=memory_mask,
tgt_key_padding_mask=tgt_key_padding_mask,
memory_key_padding_mask=memory_key_padding_mask,
pos=pos, query_pos=query_pos)
pos=pos,
query_pos=query_pos,
)
if self.return_intermediate:
intermediate.append(self.norm(output))
......@@ -140,9 +174,15 @@ class TransformerDecoder(nn.Module):
class TransformerEncoderLayer(nn.Module):
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
activation="relu", normalize_before=False):
def __init__(
self,
d_model,
nhead,
dim_feedforward=2048,
dropout=0.1,
activation="relu",
normalize_before=False,
):
super().__init__()
self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
# Implementation of Feedforward model
......@@ -161,16 +201,19 @@ class TransformerEncoderLayer(nn.Module):
def with_pos_embed(self, tensor, pos: Optional[Tensor]):
return tensor if pos is None else tensor + pos
def forward_post(self,
def forward_post(
self,
src,
src_mask: Optional[Tensor] = None,
src_key_padding_mask: Optional[Tensor] = None,
pos: Optional[Tensor] = None):
pos: Optional[Tensor] = None,
):
q = k = self.with_pos_embed(src, pos) # shape (L, B, D)
# src mask, shape (L, L)
# src_key_padding_mask: shape (B, L)
src2 = self.self_attn(q, k, src, attn_mask=src_mask,
key_padding_mask=src_key_padding_mask)[0]
src2 = self.self_attn(
q, k, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
)[0]
src = src + self.dropout1(src2)
src = self.norm1(src)
src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
......@@ -178,33 +221,46 @@ class TransformerEncoderLayer(nn.Module):
src = self.norm2(src)
return src
def forward_pre(self, src,
def forward_pre(
self,
src,
src_mask: Optional[Tensor] = None,
src_key_padding_mask: Optional[Tensor] = None,
pos: Optional[Tensor] = None):
pos: Optional[Tensor] = None,
):
src2 = self.norm1(src)
q = k = self.with_pos_embed(src2, pos)
src2 = self.self_attn(q, k, src2, attn_mask=src_mask,
key_padding_mask=src_key_padding_mask)[0]
src2 = self.self_attn(
q, k, src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
)[0]
src = src + self.dropout1(src2)
src2 = self.norm2(src)
src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
src = src + self.dropout2(src2)
return src
def forward(self, src,
def forward(
self,
src,
src_mask: Optional[Tensor] = None,
src_key_padding_mask: Optional[Tensor] = None,
pos: Optional[Tensor] = None):
pos: Optional[Tensor] = None,
):
if self.normalize_before:
return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
return self.forward_post(src, src_mask, src_key_padding_mask, pos)
class TransformerDecoderLayer(nn.Module):
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
activation="relu", normalize_before=False):
def __init__(
self,
d_model,
nhead,
dim_feedforward=2048,
dropout=0.1,
activation="relu",
normalize_before=False,
):
super().__init__()
self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
......@@ -226,28 +282,36 @@ class TransformerDecoderLayer(nn.Module):
def with_pos_embed(self, tensor, pos: Optional[Tensor]):
return tensor if pos is None else tensor + pos
def forward_post(self, tgt, memory,
def forward_post(
self,
tgt,
memory,
tgt_mask: Optional[Tensor] = None,
memory_mask: Optional[Tensor] = None,
tgt_key_padding_mask: Optional[Tensor] = None,
memory_key_padding_mask: Optional[Tensor] = None,
pos: Optional[Tensor] = None,
query_pos: Optional[Tensor] = None):
query_pos: Optional[Tensor] = None,
):
# tgt shape (L, B, C)
# tgt_mask shape (L, L)
# tgt_key_padding_mask shape (B, L)
q = k = self.with_pos_embed(tgt, query_pos)
tgt2 = self.self_attn(q, k, tgt, attn_mask=tgt_mask,
key_padding_mask=tgt_key_padding_mask)[0]
tgt2 = self.self_attn(
q, k, tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
)[0]
tgt = tgt + self.dropout1(tgt2)
tgt = self.norm1(tgt)
# memory_mask shape (L, S)
# memory_key_padding_mask shape (B, S)
# query_pos shape (L, B, C)
tgt2 = self.multihead_attn(self.with_pos_embed(tgt, query_pos),
tgt2 = self.multihead_attn(
self.with_pos_embed(tgt, query_pos),
self.with_pos_embed(memory, pos),
memory, attn_mask=memory_mask,
key_padding_mask=memory_key_padding_mask)[0]
memory,
attn_mask=memory_mask,
key_padding_mask=memory_key_padding_mask,
)[0]
tgt = tgt + self.dropout2(tgt2)
tgt = self.norm2(tgt)
tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
......@@ -256,41 +320,69 @@ class TransformerDecoderLayer(nn.Module):
# return tgt shape (L, B, C)
return tgt
def forward_pre(self, tgt, memory,
def forward_pre(
self,
tgt,
memory,
tgt_mask: Optional[Tensor] = None,
memory_mask: Optional[Tensor] = None,
tgt_key_padding_mask: Optional[Tensor] = None,
memory_key_padding_mask: Optional[Tensor] = None,
pos: Optional[Tensor] = None,
query_pos: Optional[Tensor] = None):
query_pos: Optional[Tensor] = None,
):
tgt2 = self.norm1(tgt)
q = k = self.with_pos_embed(tgt2, query_pos)
tgt2 = self.self_attn(q, k, tgt2, attn_mask=tgt_mask,
key_padding_mask=tgt_key_padding_mask)[0]
tgt2 = self.self_attn(
q, k, tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
)[0]
tgt = tgt + self.dropout1(tgt2)
tgt2 = self.norm2(tgt)
tgt2 = self.multihead_attn(self.with_pos_embed(tgt2, query_pos),
tgt2 = self.multihead_attn(
self.with_pos_embed(tgt2, query_pos),
self.with_pos_embed(memory, pos),
memory, attn_mask=memory_mask,
key_padding_mask=memory_key_padding_mask)[0]
memory,
attn_mask=memory_mask,
key_padding_mask=memory_key_padding_mask,
)[0]
tgt = tgt + self.dropout2(tgt2)
tgt2 = self.norm3(tgt)
tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
tgt = tgt + self.dropout3(tgt2)
return tgt
def forward(self, tgt, memory,
def forward(
self,
tgt,
memory,
tgt_mask: Optional[Tensor] = None,
memory_mask: Optional[Tensor] = None,
tgt_key_padding_mask: Optional[Tensor] = None,
memory_key_padding_mask: Optional[Tensor] = None,
pos: Optional[Tensor] = None,
query_pos: Optional[Tensor] = None):
query_pos: Optional[Tensor] = None,
):
if self.normalize_before:
return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
return self.forward_post(tgt, memory, tgt_mask, memory_mask,
tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
return self.forward_pre(
tgt,
memory,
tgt_mask,
memory_mask,
tgt_key_padding_mask,
memory_key_padding_mask,
pos,
query_pos,
)
return self.forward_post(
tgt,
memory,
tgt_mask,
memory_mask,
tgt_key_padding_mask,
memory_key_padding_mask,
pos,
query_pos,
)
def _get_clones(module, N):
......@@ -318,4 +410,4 @@ def _get_activation_fn(activation):
return F.gelu
if activation == "glu":
return F.glu
raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
raise RuntimeError(f"activation should be relu/gelu, not {activation}.")
......@@ -9,15 +9,15 @@
# ------------------------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from __future__ import print_function
import warnings
import math
import warnings
import torch
from torch import nn
import torch.nn.functional as F
from torch import nn
from torch.nn.init import xavier_uniform_, constant_
from ..functions import MSDeformAttnFunction
......@@ -25,8 +25,10 @@ from ..functions import MSDeformAttnFunction
def _is_power_of_2(n):
if (not isinstance(n, int)) or (n < 0):
raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
return (n & (n-1) == 0) and n != 0
raise ValueError(
"invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))
)
return (n & (n - 1) == 0) and n != 0
class MSDeformAttn(nn.Module):
......@@ -40,12 +42,18 @@ class MSDeformAttn(nn.Module):
"""
super().__init__()
if d_model % n_heads != 0:
raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
raise ValueError(
"d_model must be divisible by n_heads, but got {} and {}".format(
d_model, n_heads
)
)
_d_per_head = d_model // n_heads
# you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
if not _is_power_of_2(_d_per_head):
warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
"which is more efficient in our CUDA implementation.")
warnings.warn(
"You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
"which is more efficient in our CUDA implementation."
)
self.im2col_step = 64
......@@ -62,25 +70,39 @@ class MSDeformAttn(nn.Module):
self._reset_parameters()
def _reset_parameters(self):
constant_(self.sampling_offsets.weight.data, 0.)
constant_(self.sampling_offsets.weight.data, 0.0)
# shape (num_heads,)
thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
thetas = torch.arange(self.n_heads, dtype=torch.float32) * (
2.0 * math.pi / self.n_heads
)
# shape (num_heads, 2)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
# shape (num_heads, num_levels, num_points, 2)
grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
grid_init = (
(grid_init / grid_init.abs().max(-1, keepdim=True)[0])
.view(self.n_heads, 1, 1, 2)
.repeat(1, self.n_levels, self.n_points, 1)
)
for i in range(self.n_points):
grid_init[:, :, i, :] *= i + 1
with torch.no_grad():
self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
constant_(self.attention_weights.weight.data, 0.)
constant_(self.attention_weights.bias.data, 0.)
constant_(self.attention_weights.weight.data, 0.0)
constant_(self.attention_weights.bias.data, 0.0)
xavier_uniform_(self.value_proj.weight.data)
constant_(self.value_proj.bias.data, 0.)
constant_(self.value_proj.bias.data, 0.0)
xavier_uniform_(self.output_proj.weight.data)
constant_(self.output_proj.bias.data, 0.)
def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
constant_(self.output_proj.bias.data, 0.0)
def forward(
self,
query,
reference_points,
input_flatten,
input_spatial_shapes,
input_level_start_index,
input_padding_mask=None,
):
"""
:param query (N, Length_{query}, C)
:param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
......@@ -100,21 +122,45 @@ class MSDeformAttn(nn.Module):
if input_padding_mask is not None:
value = value.masked_fill(input_padding_mask[..., None], float(0))
value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
sampling_offsets = self.sampling_offsets(query).view(
N, Len_q, self.n_heads, self.n_levels, self.n_points, 2
)
attention_weights = self.attention_weights(query).view(
N, Len_q, self.n_heads, self.n_levels * self.n_points
)
attention_weights = F.softmax(attention_weights, -1).view(
N, Len_q, self.n_heads, self.n_levels, self.n_points
)
# N, Len_q, n_heads, n_levels, n_points, 2
if reference_points.shape[-1] == 2:
offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
sampling_locations = reference_points[:, :, None, :, None, :] \
offset_normalizer = torch.stack(
[input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1
)
sampling_locations = (
reference_points[:, :, None, :, None, :]
+ sampling_offsets / offset_normalizer[None, None, None, :, None, :]
)
elif reference_points.shape[-1] == 4:
sampling_locations = reference_points[:, :, None, :, None, :2] \
+ sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
sampling_locations = (
reference_points[:, :, None, :, None, :2]
+ sampling_offsets
/ self.n_points
* reference_points[:, :, None, :, None, 2:]
* 0.5
)
else:
raise ValueError(
'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
"Last dim of reference_points must be 2 or 4, but get {} instead.".format(
reference_points.shape[-1]
)
)
output = MSDeformAttnFunction.apply(
value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
value,
input_spatial_shapes,
input_level_start_index,
sampling_locations,
attention_weights,
self.im2col_step,
)
output = self.output_proj(output)
return output
......@@ -4,9 +4,9 @@ from d2go.config import CfgNode as CN
from d2go.data.dataset_mappers.build import D2GO_DATA_MAPPER_REGISTRY
from d2go.data.dataset_mappers.d2go_dataset_mapper import D2GoDatasetMapper
from d2go.runner import GeneralizedRCNNRunner
from detr.d2 import DetrDatasetMapper, add_detr_config
from detr.backbone.deit import add_deit_backbone_config
from detr.backbone.pit import add_pit_backbone_config
from detr.d2 import DetrDatasetMapper, add_detr_config
@D2GO_DATA_MAPPER_REGISTRY.register()
......
......@@ -10,15 +10,13 @@ from torchvision.ops.boxes import box_area
def box_cxcywh_to_xyxy(x):
x_c, y_c, w, h = x.unbind(-1)
b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
(x_c + 0.5 * w), (y_c + 0.5 * h)]
b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
return torch.stack(b, dim=-1)
def box_xyxy_to_cxcywh(x):
x0, y0, x1, y1 = x.unbind(-1)
b = [(x0 + x1) / 2, (y0 + y1) / 2,
(x1 - x0), (y1 - y0)]
b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)]
return torch.stack(b, dim=-1)
......@@ -79,11 +77,11 @@ def masks_to_boxes(masks):
x = torch.arange(0, w, dtype=torch.float)
y, x = torch.meshgrid(y, x)
x_mask = (masks * x.unsqueeze(0))
x_mask = masks * x.unsqueeze(0)
x_max = x_mask.flatten(1).max(-1)[0]
x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
y_mask = (masks * y.unsqueeze(0))
y_mask = masks * y.unsqueeze(0)
y_max = y_mask.flatten(1).max(-1)[0]
y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
......
......@@ -6,21 +6,22 @@ Misc functions, including distributed helpers.
Mostly copy-paste from torchvision references.
"""
import datetime
import os
import pickle
import subprocess
import time
from collections import defaultdict, deque
import datetime
import pickle
from distutils.version import LooseVersion
from typing import Optional, List
import torch
import torch.distributed as dist
from torch import Tensor
# needed due to empty tensor bug in pytorch and torchvision 0.5
import torchvision
from distutils.version import LooseVersion
from torch import Tensor
if LooseVersion(torchvision.__version__) < LooseVersion("0.7.0"):
from torchvision.ops import _new_empty_tensor
from torchvision.ops.misc import _output_size
......@@ -50,7 +51,7 @@ class SmoothedValue(object):
"""
if not is_dist_avail_and_initialized():
return
t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
dist.barrier()
dist.all_reduce(t)
t = t.tolist()
......@@ -85,7 +86,8 @@ class SmoothedValue(object):
avg=self.avg,
global_avg=self.global_avg,
max=self.max,
value=self.value)
value=self.value,
)
def all_gather(data):
......@@ -119,14 +121,16 @@ def all_gather(data):
for _ in size_list:
tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
if local_size != max_size:
padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
padding = torch.empty(
size=(max_size - local_size,), dtype=torch.uint8, device="cuda"
)
tensor = torch.cat((tensor, padding), dim=0)
dist.all_gather(tensor_list, tensor)
data_list = []
for size, tensor in zip(size_list, tensor_list):
buffer = tensor.cpu().numpy().tobytes()[:size]
data_list.append(pickle.loads(buffer)) #noqa
data_list.append(pickle.loads(buffer)) # noqa
return data_list
......@@ -175,15 +179,14 @@ class MetricLogger(object):
return self.meters[attr]
if attr in self.__dict__:
return self.__dict__[attr]
raise AttributeError("'{}' object has no attribute '{}'".format(
type(self).__name__, attr))
raise AttributeError(
"'{}' object has no attribute '{}'".format(type(self).__name__, attr)
)
def __str__(self):
loss_str = []
for name, meter in self.meters.items():
loss_str.append(
"{}: {}".format(name, str(meter))
)
loss_str.append("{}: {}".format(name, str(meter)))
return self.delimiter.join(loss_str)
def synchronize_between_processes(self):
......@@ -196,31 +199,35 @@ class MetricLogger(object):
def log_every(self, iterable, print_freq, header=None):
i = 0
if not header:
header = ''
header = ""
start_time = time.time()
end = time.time()
iter_time = SmoothedValue(fmt='{avg:.4f}')
data_time = SmoothedValue(fmt='{avg:.4f}')
space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
iter_time = SmoothedValue(fmt="{avg:.4f}")
data_time = SmoothedValue(fmt="{avg:.4f}")
space_fmt = ":" + str(len(str(len(iterable)))) + "d"
if torch.cuda.is_available():
log_msg = self.delimiter.join([
log_msg = self.delimiter.join(
[
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}',
'max mem: {memory:.0f}'
])
"[{0" + space_fmt + "}/{1}]",
"eta: {eta}",
"{meters}",
"time: {time}",
"data: {data}",
"max mem: {memory:.0f}",
]
)
else:
log_msg = self.delimiter.join([
log_msg = self.delimiter.join(
[
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}'
])
"[{0" + space_fmt + "}/{1}]",
"eta: {eta}",
"{meters}",
"time: {time}",
"data: {data}",
]
)
MB = 1024.0 * 1024.0
for obj in iterable:
data_time.update(time.time() - end)
......@@ -230,38 +237,54 @@ class MetricLogger(object):
eta_seconds = iter_time.global_avg * (len(iterable) - i)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if torch.cuda.is_available():
print(log_msg.format(
i, len(iterable), eta=eta_string,
print(
log_msg.format(
i,
len(iterable),
eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB))
time=str(iter_time),
data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB,
)
)
else:
print(log_msg.format(
i, len(iterable), eta=eta_string,
print(
log_msg.format(
i,
len(iterable),
eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time)))
time=str(iter_time),
data=str(data_time),
)
)
i += 1
end = time.time()
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('{} Total time: {} ({:.4f} s / it)'.format(
header, total_time_str, total_time / len(iterable)))
print(
"{} Total time: {} ({:.4f} s / it)".format(
header, total_time_str, total_time / len(iterable)
)
)
def get_sha():
cwd = os.path.dirname(os.path.abspath(__file__))
def _run(command):
return subprocess.check_output(command, cwd=cwd).decode('ascii').strip()
sha = 'N/A'
return subprocess.check_output(command, cwd=cwd).decode("ascii").strip()
sha = "N/A"
diff = "clean"
branch = 'N/A'
branch = "N/A"
try:
sha = _run(['git', 'rev-parse', 'HEAD'])
subprocess.check_output(['git', 'diff'], cwd=cwd)
diff = _run(['git', 'diff-index', 'HEAD'])
sha = _run(["git", "rev-parse", "HEAD"])
subprocess.check_output(["git", "diff"], cwd=cwd)
diff = _run(["git", "diff-index", "HEAD"])
diff = "has uncommited changes" if diff else "clean"
branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])
branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"])
except Exception:
pass
message = f"sha: {sha}, status: {diff}, branch: {branch}"
......@@ -325,9 +348,9 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
for img, pad_img, m in zip(tensor_list, tensor, mask):
pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
m[: img.shape[1], :img.shape[2]] = False
m[: img.shape[1], : img.shape[2]] = False
else:
raise ValueError('not supported')
raise ValueError("not supported")
return NestedTensor(tensor, mask)
......@@ -337,7 +360,9 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
max_size = []
for i in range(tensor_list[0].dim()):
max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64)
max_size_i = torch.max(
torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
).to(torch.int64)
max_size.append(max_size_i)
max_size = tuple(max_size)
......@@ -349,11 +374,15 @@ def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTen
padded_masks = []
for img in tensor_list:
padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
padded_img = torch.nn.functional.pad(
img, (0, padding[2], 0, padding[1], 0, padding[0])
)
padded_imgs.append(padded_img)
m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
padded_mask = torch.nn.functional.pad(
m, (0, padding[2], 0, padding[1]), "constant", 1
)
padded_masks.append(padded_mask.to(torch.bool))
tensor = torch.stack(padded_imgs)
......@@ -367,10 +396,11 @@ def setup_for_distributed(is_master):
This function disables printing when not in master process
"""
import builtins as __builtin__
builtin_print = __builtin__.print
def print(*args, **kwargs):
force = kwargs.pop('force', False)
force = kwargs.pop("force", False)
if is_master or force:
builtin_print(*args, **kwargs)
......@@ -407,26 +437,31 @@ def save_on_master(*args, **kwargs):
def init_distributed_mode(args):
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ['WORLD_SIZE'])
args.gpu = int(os.environ['LOCAL_RANK'])
elif 'SLURM_PROCID' in os.environ:
args.rank = int(os.environ['SLURM_PROCID'])
args.world_size = int(os.environ["WORLD_SIZE"])
args.gpu = int(os.environ["LOCAL_RANK"])
elif "SLURM_PROCID" in os.environ:
args.rank = int(os.environ["SLURM_PROCID"])
args.gpu = args.rank % torch.cuda.device_count()
else:
print('Not using distributed mode')
print("Not using distributed mode")
args.distributed = False
return
args.distributed = True
torch.cuda.set_device(args.gpu)
args.dist_backend = 'nccl'
print('| distributed init (rank {}): {}'.format(
args.rank, args.dist_url), flush=True)
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
args.dist_backend = "nccl"
print(
"| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True
)
torch.distributed.init_process_group(
backend=args.dist_backend,
init_method=args.dist_url,
world_size=args.world_size,
rank=args.rank,
)
torch.distributed.barrier()
setup_for_distributed(args.rank == 0)
......@@ -450,14 +485,16 @@ def accuracy(output, target, topk=(1,)):
return res
def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
def interpolate(
input, size=None, scale_factor=None, mode="nearest", align_corners=None
):
# type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
"""
Equivalent to nn.functional.interpolate, but with support for empty batch sizes.
This will eventually be supported natively by PyTorch, and this
class can go away.
"""
#if float(torchvision.__version__[:3]) < 0.7:
# if float(torchvision.__version__[:3]) < 0.7:
if LooseVersion(torchvision.__version__) < LooseVersion("0.7.0"):
if input.numel() > 0:
return torch.nn.functional.interpolate(
......@@ -468,10 +505,13 @@ def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corne
output_shape = list(input.shape[:-2]) + list(output_shape)
return _new_empty_tensor(input, output_shape)
else:
return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners)
return torchvision.ops.misc.interpolate(
input, size, scale_factor, mode, align_corners
)
def inverse_sigmoid(x, eps=1e-5):
x = x.clamp(min=0, max=1)
x1 = x.clamp(min=eps)
x2 = (1 - x).clamp(min=eps)
return torch.log(x1/x2)
return torch.log(x1 / x2)
This diff is collapsed.
This diff is collapsed.
......@@ -6,20 +6,19 @@
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
import os
import glob
import os
import torch
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
from setuptools import find_packages
from setuptools import setup
from torch.utils.cpp_extension import CUDAExtension
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
requirements = ["torch", "torchvision"]
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, "detr/src")
......@@ -49,7 +48,7 @@ def get_extensions():
"-D__CUDA_NO_HALF2_OPERATORS__",
]
else:
raise NotImplementedError('Cuda is not availabel')
raise NotImplementedError("Cuda is not availabel")
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [extensions_dir]
......@@ -64,13 +63,14 @@ def get_extensions():
]
return ext_modules
if __name__ == '__main__':
if __name__ == "__main__":
setup(
name="detr",
url="https://github.com/facebookresearch/d2go/detr",
license='Apache-2.0',
license="Apache-2.0",
packages=find_packages(exclude=["test_all.py"]),
package_data={ 'detr': ['LICENSE']},
package_data={"detr": ["LICENSE"]},
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
This diff is collapsed.
This diff is collapsed.
......@@ -13,6 +13,7 @@ from d2go.utils.testing.data_loader_helper import create_local_dataset
# RUN:
# buck test mobile-vision/d2go/projects_oss/detr:test_detr_runner
def _get_cfg(runner, output_dir, dataset_name):
cfg = runner.get_default_cfg()
cfg.MODEL.DEVICE = "cpu"
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -126,7 +126,5 @@ class TestDataTransformsBoxUtils(unittest.TestCase):
boxes = np.array([[91, 46, 144, 111]])
transformed_bboxs = enlarge_box_tfm[0].apply_coords(boxes)
err_msg = "transformed_bbox = {}, expected {}".format(
transformed_bboxs, boxes
)
err_msg = "transformed_bbox = {}, expected {}".format(transformed_bboxs, boxes)
self.assertTrue(np.allclose(transformed_bboxs, boxes), err_msg)
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment