enable black for mobile-vision

Summary:
https://fb.workplace.com/groups/pythonfoundation/posts/2990917737888352

Remove `mobile-vision` from the Black opt-out list; `mobile-vision/SNPE` stays opted out because it contains 3rd-party code.

arc lint --take BLACK --apply-patches --paths-cmd 'hg files mobile-vision'

allow-large-files

Reviewed By: sstsai-adl

Differential Revision: D30721093

fbshipit-source-id: 9e5c16d988b315b93a28038443ecfb92efd18ef8

enable black for mobile-vision
Summary:
https://fb.workplace.com/groups/pythonfoundation/posts/2990917737888352

Remove `mobile-vision` from the Black opt-out list; `mobile-vision/SNPE` stays opted out because it contains 3rd-party code.

arc lint --take BLACK --apply-patches --paths-cmd 'hg files mobile-vision'

allow-large-files

Reviewed By: sstsai-adl

Differential Revision: D30721093

fbshipit-source-id: 9e5c16d988b315b93a28038443ecfb92efd18ef8
82295dbf · Yanghan Wang · Facebook GitHub Bot · a56c7e15 · 82295dbf · 82295dbf
Commit 82295dbf authored Sep 08, 2021 by Yanghan Wang Committed by Facebook GitHub Bot Sep 08, 2021
20 changed files
--- a/projects_oss/detr/detr/models/matcher.py
+++ b/projects_oss/detr/detr/models/matcher.py
@@ -5,11 +5,10 @@
 Modules to compute the matching cost and solve the corresponding LSAP.
 """
 import torch
+from detr.util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
 from scipy.optimize import linear_sum_assignment
 from torch import nn

-from detr.util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
-

 class HungarianMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network
@@ -19,7 +18,13 @@ class HungarianMatcher(nn.Module):
    while the others are un-matched (and thus treated as non-objects).
    """

-    def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1, use_focal_loss=False):
+    def __init__(
+        self,
+        cost_class: float = 1,
+        cost_bbox: float = 1,
+        cost_giou: float = 1,
+        use_focal_loss=False,
+    ):
        """Creates the matcher

        Params:
@@ -31,12 +36,14 @@ class HungarianMatcher(nn.Module):
        self.cost_class = cost_class
        self.cost_bbox = cost_bbox
        self.cost_giou = cost_giou
-        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"
+        assert (
+            cost_class != 0 or cost_bbox != 0 or cost_giou != 0
+        ), "all costs cant be 0"
        self.use_focal_loss = use_focal_loss

    @torch.no_grad()
    def forward(self, outputs, targets):
-        """ Performs the matching
+        """Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
@@ -61,7 +68,9 @@ class HungarianMatcher(nn.Module):
        if self.use_focal_loss:
            out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()
        else:
-            out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
+            out_prob = (
+                outputs["pred_logits"].flatten(0, 1).softmax(-1)
+            )  # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
@@ -74,29 +83,57 @@ class HungarianMatcher(nn.Module):
        if self.use_focal_loss:
            alpha = 0.25
            gamma = 2.0
-            neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
-            pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
+            neg_cost_class = (
+                (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
+            )
+            pos_cost_class = (
+                alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
+            )
            cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
        else:
-            cost_class = -out_prob[:, tgt_ids]  # shape [batch_size * num_queries, \sum_b NUM-BOX_b]
+            cost_class = -out_prob[
+                :, tgt_ids
+            ]  # shape [batch_size * num_queries, \sum_b NUM-BOX_b]

        # Compute the L1 cost between boxes
-        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)  # shape [batch_size * num_queries,\sum_b NUM-BOX_b]
+        cost_bbox = torch.cdist(
+            out_bbox, tgt_bbox, p=1
+        )  # shape [batch_size * num_queries,\sum_b NUM-BOX_b]

        # Compute the giou cost betwen boxes
        # shape [batch_size * num_queries, \sum_b NUM-BOX_b]
-        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
+        cost_giou = -generalized_box_iou(
+            box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)
+        )

        # Final cost matrix
-        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
-        C = C.view(bs, num_queries, -1).cpu()  # shape [batch_size, num_queries, \sum_b NUM-BOX_b]
+        C = (
+            self.cost_bbox * cost_bbox
+            + self.cost_class * cost_class
+            + self.cost_giou * cost_giou
+        )
+        C = C.view(
+            bs, num_queries, -1
+        ).cpu()  # shape [batch_size, num_queries, \sum_b NUM-BOX_b]

        sizes = [len(v["boxes"]) for v in targets]  # shape [batch_size,]
        # each split c shape [batch_size, num_queries, NUM-BOX_b]
-        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
+        indices = [
+            linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))
+        ]
        # A list where each item is [row_indices, col_indices]
-        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
+        return [
+            (
+                torch.as_tensor(i, dtype=torch.int64),
+                torch.as_tensor(j, dtype=torch.int64),
+            )
+            for i, j in indices
+        ]


 def build_matcher(args):
-    return HungarianMatcher(cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou)
+    return HungarianMatcher(
+        cost_class=args.set_cost_class,
+        cost_bbox=args.set_cost_bbox,
+        cost_giou=args.set_cost_giou,
+    )
--- a/projects_oss/detr/detr/models/position_encoding.py
+++ b/projects_oss/detr/detr/models/position_encoding.py
@@ -5,10 +5,10 @@
 Various positional encodings for the transformer.
 """
 import math
-import torch
-from torch import nn

+import torch
 from detr.util.misc import NestedTensor
+from torch import nn


 class PositionEmbeddingSine(nn.Module):
@@ -16,7 +16,15 @@ class PositionEmbeddingSine(nn.Module):
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """
-    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None, centered=False):
+
+    def __init__(
+        self,
+        num_pos_feats=64,
+        temperature=10000,
+        normalize=False,
+        scale=None,
+        centered=False,
+    ):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
@@ -47,13 +55,25 @@ class PositionEmbeddingSine(nn.Module):
                x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
-        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)  # shape (N, )
+        dim_t = self.temperature ** (
+            2 * (dim_t // 2) / self.num_pos_feats
+        )  # shape (N, )

        pos_x = x_embed[:, :, :, None] / dim_t  # shape (B, H, W, N)
        pos_y = y_embed[:, :, :, None] / dim_t
-        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)  # shape (B, H, W, N)
-        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)  # shape (B, H, W, N)
-        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)  # shape (B, 2*N, H, W)
+        pos_x = torch.stack(
+            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
+        ).flatten(
+            3
+        )  # shape (B, H, W, N)
+        pos_y = torch.stack(
+            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
+        ).flatten(
+            3
+        )  # shape (B, H, W, N)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(
+            0, 3, 1, 2
+        )  # shape (B, 2*N, H, W)
        return pos


@@ -61,6 +81,7 @@ class PositionEmbeddingLearned(nn.Module):
    """
    Absolute pos embedding, learned.
    """
+
    def __init__(self, num_pos_feats=256):
        super().__init__()
        self.row_embed = nn.Embedding(50, num_pos_feats)
@@ -78,19 +99,27 @@ class PositionEmbeddingLearned(nn.Module):
        j = torch.arange(h, device=x.device)
        x_emb = self.col_embed(i)
        y_emb = self.row_embed(j)
-        pos = torch.cat([
+        pos = (
+            torch.cat(
+                [
                    x_emb.unsqueeze(0).repeat(h, 1, 1),
                    y_emb.unsqueeze(1).repeat(1, w, 1),
-        ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
+                ],
+                dim=-1,
+            )
+            .permute(2, 0, 1)
+            .unsqueeze(0)
+            .repeat(x.shape[0], 1, 1, 1)
+        )
        return pos


 def build_position_encoding(args):
    N_steps = args.hidden_dim // 2
-    if args.position_embedding in ('v2', 'sine'):
+    if args.position_embedding in ("v2", "sine"):
        # TODO find a better way of exposing other arguments
        position_embedding = PositionEmbeddingSine(N_steps, normalize=True)
-    elif args.position_embedding in ('v3', 'learned'):
+    elif args.position_embedding in ("v3", "learned"):
        position_embedding = PositionEmbeddingLearned(N_steps)
    else:
        raise ValueError(f"not supported {args.position_embedding}")

--- a/projects_oss/detr/detr/models/segmentation.py
+++ b/projects_oss/detr/detr/models/segmentation.py
@@ -8,14 +8,13 @@ import io
 from collections import defaultdict
 from typing import List, Optional

+import detr.util.box_ops as box_ops
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch import Tensor
-from PIL import Image
-
-import detr.util.box_ops as box_ops
 from detr.util.misc import NestedTensor, interpolate, nested_tensor_from_tensor_list
+from PIL import Image
+from torch import Tensor

 try:
    from panopticapi.utils import id2rgb, rgb2id
@@ -33,8 +32,12 @@ class DETRsegm(nn.Module):
                p.requires_grad_(False)

        hidden_dim, nheads = detr.transformer.d_model, detr.transformer.nhead
-        self.bbox_attention = MHAttentionMap(hidden_dim, hidden_dim, nheads, dropout=0.0)
-        self.mask_head = MaskHeadSmallConv(hidden_dim + nheads, [1024, 512, 256], hidden_dim)
+        self.bbox_attention = MHAttentionMap(
+            hidden_dim, hidden_dim, nheads, dropout=0.0
+        )
+        self.mask_head = MaskHeadSmallConv(
+            hidden_dim + nheads, [1024, 512, 256], hidden_dim
+        )

    def forward(self, samples: NestedTensor):
        if isinstance(samples, (list, torch.Tensor)):
@@ -46,19 +49,27 @@ class DETRsegm(nn.Module):
        src, mask = features[-1].decompose()
        assert mask is not None
        src_proj = self.detr.input_proj(src)
-        hs, memory = self.detr.transformer(src_proj, mask, self.detr.query_embed.weight, pos[-1])
+        hs, memory = self.detr.transformer(
+            src_proj, mask, self.detr.query_embed.weight, pos[-1]
+        )

        outputs_class = self.detr.class_embed(hs)
        outputs_coord = self.detr.bbox_embed(hs).sigmoid()
        out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]}
        if self.detr.aux_loss:
-            out['aux_outputs'] = self.detr._set_aux_loss(outputs_class, outputs_coord)
+            out["aux_outputs"] = self.detr._set_aux_loss(outputs_class, outputs_coord)

        # FIXME h_boxes takes the last one computed, keep this in mind
        bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask)

-        seg_masks = self.mask_head(src_proj, bbox_mask, [features[2].tensors, features[1].tensors, features[0].tensors])
-        outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1])
+        seg_masks = self.mask_head(
+            src_proj,
+            bbox_mask,
+            [features[2].tensors, features[1].tensors, features[0].tensors],
+        )
+        outputs_seg_masks = seg_masks.view(
+            bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]
+        )

        out["pred_masks"] = outputs_seg_masks
        return out
@@ -77,7 +88,14 @@ class MaskHeadSmallConv(nn.Module):
    def __init__(self, dim, fpn_dims, context_dim):
        super().__init__()

-        inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64]
+        inter_dims = [
+            dim,
+            context_dim // 2,
+            context_dim // 4,
+            context_dim // 8,
+            context_dim // 16,
+            context_dim // 64,
+        ]
        self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1)
        self.gn1 = torch.nn.GroupNorm(8, dim)
        self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1)
@@ -159,9 +177,19 @@ class MHAttentionMap(nn.Module):

    def forward(self, q, k, mask: Optional[Tensor] = None):
        q = self.q_linear(q)
-        k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias)
-        qh = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads)
-        kh = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1])
+        k = F.conv2d(
+            k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias
+        )
+        qh = q.view(
+            q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads
+        )
+        kh = k.view(
+            k.shape[0],
+            self.num_heads,
+            self.hidden_dim // self.num_heads,
+            k.shape[-2],
+            k.shape[-1],
+        )
        weights = torch.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)

        if mask is not None:
@@ -189,7 +217,9 @@ def dice_loss(inputs, targets, num_boxes):
    return loss.sum() / num_boxes


-def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
+def sigmoid_focal_loss(
+    inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2
+):
    """
    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
    Args:
@@ -227,10 +257,14 @@ class PostProcessSegm(nn.Module):
        assert len(orig_target_sizes) == len(max_target_sizes)
        max_h, max_w = max_target_sizes.max(0)[0].tolist()
        outputs_masks = outputs["pred_masks"].squeeze(2)
-        outputs_masks = F.interpolate(outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False)
+        outputs_masks = F.interpolate(
+            outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False
+        )
        outputs_masks = (outputs_masks.sigmoid() > self.threshold).cpu()

-        for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)):
+        for i, (cur_mask, t, tt) in enumerate(
+            zip(outputs_masks, max_target_sizes, orig_target_sizes)
+        ):
            img_h, img_w = t[0], t[1]
            results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1)
            results[i]["masks"] = F.interpolate(
@@ -242,7 +276,7 @@ class PostProcessSegm(nn.Module):

 class PostProcessPanoptic(nn.Module):
    """This class converts the output of the model to the final panoptic result, in the format expected by the
-    coco panoptic API """
+    coco panoptic API"""

    def __init__(self, is_thing_map, threshold=0.85):
        """
@@ -255,8 +289,8 @@ class PostProcessPanoptic(nn.Module):
        self.threshold = threshold
        self.is_thing_map = is_thing_map

-    def forward(self, outputs, processed_sizes, target_sizes=None): #noqa: C901
-        """ This function computes the panoptic prediction from the model's predictions.
+    def forward(self, outputs, processed_sizes, target_sizes=None):  # noqa: C901
+        """This function computes the panoptic prediction from the model's predictions.
        Parameters:
            outputs: This is a dict coming directly from the model. See the model doc for the content.
            processed_sizes: This is a list of tuples (or torch tensors) of sizes of the images that were passed to the
@@ -267,7 +301,11 @@ class PostProcessPanoptic(nn.Module):
        if target_sizes is None:
            target_sizes = processed_sizes
        assert len(processed_sizes) == len(target_sizes)
-        out_logits, raw_masks, raw_boxes = outputs["pred_logits"], outputs["pred_masks"], outputs["pred_boxes"]
+        out_logits, raw_masks, raw_boxes = (
+            outputs["pred_logits"],
+            outputs["pred_masks"],
+            outputs["pred_boxes"],
+        )
        assert len(out_logits) == len(raw_masks) == len(target_sizes)
        preds = []

@@ -281,12 +319,16 @@ class PostProcessPanoptic(nn.Module):
        ):
            # we filter empty queries and detection below threshold
            scores, labels = cur_logits.softmax(-1).max(-1)
-            keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & (scores > self.threshold)
+            keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & (
+                scores > self.threshold
+            )
            cur_scores, cur_classes = cur_logits.softmax(-1).max(-1)
            cur_scores = cur_scores[keep]
            cur_classes = cur_classes[keep]
            cur_masks = cur_masks[keep]
-            cur_masks = interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
+            cur_masks = interpolate(
+                cur_masks[:, None], to_tuple(size), mode="bilinear"
+            ).squeeze(1)
            cur_boxes = box_ops.box_cxcywh_to_xyxy(cur_boxes[keep])

            h, w = cur_masks.shape[-2:]
@@ -322,10 +364,14 @@ class PostProcessPanoptic(nn.Module):
                final_h, final_w = to_tuple(target_size)

                seg_img = Image.fromarray(id2rgb(m_id.view(h, w).cpu().numpy()))
-                seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST)
+                seg_img = seg_img.resize(
+                    size=(final_w, final_h), resample=Image.NEAREST
+                )

                np_seg_img = (
-                    torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())).view(final_h, final_w, 3).numpy()
+                    torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes()))
+                    .view(final_h, final_w, 3)
+                    .numpy()
                )
                m_id = torch.from_numpy(rgb2id(np_seg_img))

@@ -339,7 +385,9 @@ class PostProcessPanoptic(nn.Module):
                # We know filter empty masks as long as we find some
                while True:
                    filtered_small = torch.as_tensor(
-                        [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device
+                        [area[i] <= 4 for i, c in enumerate(cur_classes)],
+                        dtype=torch.bool,
+                        device=keep.device,
                    )
                    if filtered_small.any().item():
                        cur_scores = cur_scores[~filtered_small]
@@ -355,11 +403,21 @@ class PostProcessPanoptic(nn.Module):
            segments_info = []
            for i, a in enumerate(area):
                cat = cur_classes[i].item()
-                segments_info.append({"id": i, "isthing": self.is_thing_map[cat], "category_id": cat, "area": a})
+                segments_info.append(
+                    {
+                        "id": i,
+                        "isthing": self.is_thing_map[cat],
+                        "category_id": cat,
+                        "area": a,
+                    }
+                )
            del cur_classes

            with io.BytesIO() as out:
                seg_img.save(out, format="PNG")
-                predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
+                predictions = {
+                    "png_string": out.getvalue(),
+                    "segments_info": segments_info,
+                }
            preds.append(predictions)
        return preds
--- a/projects_oss/detr/detr/models/transformer.py
+++ b/projects_oss/detr/detr/models/transformer.py
@@ -18,23 +18,38 @@ from torch import nn, Tensor


 class Transformer(nn.Module):
-
-    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
-                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
-                 activation="relu", normalize_before=False,
-                 return_intermediate_dec=False):
+    def __init__(
+        self,
+        d_model=512,
+        nhead=8,
+        num_encoder_layers=6,
+        num_decoder_layers=6,
+        dim_feedforward=2048,
+        dropout=0.1,
+        activation="relu",
+        normalize_before=False,
+        return_intermediate_dec=False,
+    ):
        super().__init__()

-        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
-                                                dropout, activation, normalize_before)
+        encoder_layer = TransformerEncoderLayer(
+            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
+        )
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
-        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
+        self.encoder = TransformerEncoder(
+            encoder_layer, num_encoder_layers, encoder_norm
+        )

-        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
-                                                dropout, activation, normalize_before)
+        decoder_layer = TransformerDecoderLayer(
+            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
+        )
        decoder_norm = nn.LayerNorm(d_model)
-        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
-                                          return_intermediate=return_intermediate_dec)
+        self.decoder = TransformerDecoder(
+            decoder_layer,
+            num_decoder_layers,
+            decoder_norm,
+            return_intermediate=return_intermediate_dec,
+        )

        self._reset_parameters()

@@ -63,30 +78,41 @@ class Transformer(nn.Module):
        # memory shape (L, B, C)
        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
        # hs shape (NUM_LEVEL, S, B, C)
-        hs = self.decoder(tgt, memory, memory_key_padding_mask=mask,
-                          pos=pos_embed, query_pos=query_embed)
+        hs = self.decoder(
+            tgt,
+            memory,
+            memory_key_padding_mask=mask,
+            pos=pos_embed,
+            query_pos=query_embed,
+        )
        # return shape (NUM_LEVEL, B, S, C) and (B, C, H, W)
        return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)


 class TransformerEncoder(nn.Module):
-
    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

-    def forward(self, src,
+    def forward(
+        self,
+        src,
        mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
-                pos: Optional[Tensor] = None):
+        pos: Optional[Tensor] = None,
+    ):
        output = src
        # mask, shape (L, L)
        # src_key_padding_mask, shape (B, L)
        for layer in self.layers:
-            output = layer(output, src_mask=mask,
-                           src_key_padding_mask=src_key_padding_mask, pos=pos)
+            output = layer(
+                output,
+                src_mask=mask,
+                src_key_padding_mask=src_key_padding_mask,
+                pos=pos,
+            )

        if self.norm is not None:
            output = self.norm(output)
@@ -95,7 +121,6 @@ class TransformerEncoder(nn.Module):


 class TransformerDecoder(nn.Module):
-
    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
@@ -103,13 +128,17 @@ class TransformerDecoder(nn.Module):
        self.norm = norm
        self.return_intermediate = return_intermediate

-    def forward(self, tgt, memory,
+    def forward(
+        self,
+        tgt,
+        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
-                query_pos: Optional[Tensor] = None):
+        query_pos: Optional[Tensor] = None,
+    ):
        output = tgt

        intermediate = []
@@ -119,11 +148,16 @@ class TransformerDecoder(nn.Module):
        # memory_mask shape (L, S)
        # memory_key_padding_mask shape (B, S)
        for layer in self.layers:
-            output = layer(output, memory, tgt_mask=tgt_mask,
+            output = layer(
+                output,
+                memory,
+                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                tgt_key_padding_mask=tgt_key_padding_mask,
                memory_key_padding_mask=memory_key_padding_mask,
-                           pos=pos, query_pos=query_pos)
+                pos=pos,
+                query_pos=query_pos,
+            )
            if self.return_intermediate:
                intermediate.append(self.norm(output))

@@ -140,9 +174,15 @@ class TransformerDecoder(nn.Module):


 class TransformerEncoderLayer(nn.Module):
-
-    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
-                 activation="relu", normalize_before=False):
+    def __init__(
+        self,
+        d_model,
+        nhead,
+        dim_feedforward=2048,
+        dropout=0.1,
+        activation="relu",
+        normalize_before=False,
+    ):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
@@ -161,16 +201,19 @@ class TransformerEncoderLayer(nn.Module):
    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

-    def forward_post(self,
+    def forward_post(
+        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
-                     pos: Optional[Tensor] = None):
+        pos: Optional[Tensor] = None,
+    ):
        q = k = self.with_pos_embed(src, pos)  # shape (L, B, D)
        # src mask, shape (L, L)
        # src_key_padding_mask: shape (B, L)
-        src2 = self.self_attn(q, k, src, attn_mask=src_mask,
-                              key_padding_mask=src_key_padding_mask)[0]
+        src2 = self.self_attn(
+            q, k, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
+        )[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
@@ -178,33 +221,46 @@ class TransformerEncoderLayer(nn.Module):
        src = self.norm2(src)
        return src

-    def forward_pre(self, src,
+    def forward_pre(
+        self,
+        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
-                    pos: Optional[Tensor] = None):
+        pos: Optional[Tensor] = None,
+    ):
        src2 = self.norm1(src)
        q = k = self.with_pos_embed(src2, pos)
-        src2 = self.self_attn(q, k, src2, attn_mask=src_mask,
-                              key_padding_mask=src_key_padding_mask)[0]
+        src2 = self.self_attn(
+            q, k, src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
+        )[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src

-    def forward(self, src,
+    def forward(
+        self,
+        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
-                pos: Optional[Tensor] = None):
+        pos: Optional[Tensor] = None,
+    ):
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)


 class TransformerDecoderLayer(nn.Module):
-
-    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
-                 activation="relu", normalize_before=False):
+    def __init__(
+        self,
+        d_model,
+        nhead,
+        dim_feedforward=2048,
+        dropout=0.1,
+        activation="relu",
+        normalize_before=False,
+    ):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
@@ -226,28 +282,36 @@ class TransformerDecoderLayer(nn.Module):
    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

-    def forward_post(self, tgt, memory,
+    def forward_post(
+        self,
+        tgt,
+        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
-                     query_pos: Optional[Tensor] = None):
+        query_pos: Optional[Tensor] = None,
+    ):
        # tgt shape (L, B, C)
        # tgt_mask shape (L, L)
        # tgt_key_padding_mask shape (B, L)
        q = k = self.with_pos_embed(tgt, query_pos)
-        tgt2 = self.self_attn(q, k, tgt, attn_mask=tgt_mask,
-                              key_padding_mask=tgt_key_padding_mask)[0]
+        tgt2 = self.self_attn(
+            q, k, tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
+        )[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        # memory_mask shape (L, S)
        # memory_key_padding_mask shape (B, S)
        # query_pos shape (L, B, C)
-        tgt2 = self.multihead_attn(self.with_pos_embed(tgt, query_pos),
+        tgt2 = self.multihead_attn(
+            self.with_pos_embed(tgt, query_pos),
            self.with_pos_embed(memory, pos),
-                                   memory, attn_mask=memory_mask,
-                                   key_padding_mask=memory_key_padding_mask)[0]
+            memory,
+            attn_mask=memory_mask,
+            key_padding_mask=memory_key_padding_mask,
+        )[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
@@ -256,41 +320,69 @@ class TransformerDecoderLayer(nn.Module):
        # return tgt shape (L, B, C)
        return tgt

-    def forward_pre(self, tgt, memory,
+    def forward_pre(
+        self,
+        tgt,
+        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
-                    query_pos: Optional[Tensor] = None):
+        query_pos: Optional[Tensor] = None,
+    ):
        tgt2 = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
-        tgt2 = self.self_attn(q, k, tgt2, attn_mask=tgt_mask,
-                              key_padding_mask=tgt_key_padding_mask)[0]
+        tgt2 = self.self_attn(
+            q, k, tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
+        )[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
-        tgt2 = self.multihead_attn(self.with_pos_embed(tgt2, query_pos),
+        tgt2 = self.multihead_attn(
+            self.with_pos_embed(tgt2, query_pos),
            self.with_pos_embed(memory, pos),
-                                   memory, attn_mask=memory_mask,
-                                   key_padding_mask=memory_key_padding_mask)[0]
+            memory,
+            attn_mask=memory_mask,
+            key_padding_mask=memory_key_padding_mask,
+        )[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt

-    def forward(self, tgt, memory,
+    def forward(
+        self,
+        tgt,
+        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
-                query_pos: Optional[Tensor] = None):
+        query_pos: Optional[Tensor] = None,
+    ):
        if self.normalize_before:
-            return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
-                                    tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
-        return self.forward_post(tgt, memory, tgt_mask, memory_mask,
-                                 tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
+            return self.forward_pre(
+                tgt,
+                memory,
+                tgt_mask,
+                memory_mask,
+                tgt_key_padding_mask,
+                memory_key_padding_mask,
+                pos,
+                query_pos,
+            )
+        return self.forward_post(
+            tgt,
+            memory,
+            tgt_mask,
+            memory_mask,
+            tgt_key_padding_mask,
+            memory_key_padding_mask,
+            pos,
+            query_pos,
+        )


 def _get_clones(module, N):
@@ -318,4 +410,4 @@ def _get_activation_fn(activation):
        return F.gelu
    if activation == "glu":
        return F.glu
-    raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
+    raise RuntimeError(f"activation should be relu/gelu, not {activation}.")
--- a/projects_oss/detr/detr/modules/ms_deform_attn.py
+++ b/projects_oss/detr/detr/modules/ms_deform_attn.py
@@ -9,15 +9,15 @@
 # ------------------------------------------------------------------------------------------------

 from __future__ import absolute_import
-from __future__ import print_function
 from __future__ import division
+from __future__ import print_function

-import warnings
 import math
+import warnings

 import torch
-from torch import nn
 import torch.nn.functional as F
+from torch import nn
 from torch.nn.init import xavier_uniform_, constant_

 from ..functions import MSDeformAttnFunction
@@ -25,8 +25,10 @@ from ..functions import MSDeformAttnFunction

 def _is_power_of_2(n):
    if (not isinstance(n, int)) or (n < 0):
-        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
-    return (n & (n-1) == 0) and n != 0
+        raise ValueError(
+            "invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))
+        )
+    return (n & (n - 1) == 0) and n != 0


 class MSDeformAttn(nn.Module):
@@ -40,12 +42,18 @@ class MSDeformAttn(nn.Module):
        """
        super().__init__()
        if d_model % n_heads != 0:
-            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
+            raise ValueError(
+                "d_model must be divisible by n_heads, but got {} and {}".format(
+                    d_model, n_heads
+                )
+            )
        _d_per_head = d_model // n_heads
        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_head):
-            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
-                          "which is more efficient in our CUDA implementation.")
+            warnings.warn(
+                "You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
+                "which is more efficient in our CUDA implementation."
+            )

        self.im2col_step = 64

@@ -62,25 +70,39 @@ class MSDeformAttn(nn.Module):
        self._reset_parameters()

    def _reset_parameters(self):
-        constant_(self.sampling_offsets.weight.data, 0.)
+        constant_(self.sampling_offsets.weight.data, 0.0)
        # shape (num_heads,)
-        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
+        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (
+            2.0 * math.pi / self.n_heads
+        )
        # shape (num_heads, 2)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        # shape (num_heads, num_levels, num_points, 2)
-        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
+        grid_init = (
+            (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
+            .view(self.n_heads, 1, 1, 2)
+            .repeat(1, self.n_levels, self.n_points, 1)
+        )
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
-        constant_(self.attention_weights.weight.data, 0.)
-        constant_(self.attention_weights.bias.data, 0.)
+        constant_(self.attention_weights.weight.data, 0.0)
+        constant_(self.attention_weights.bias.data, 0.0)
        xavier_uniform_(self.value_proj.weight.data)
-        constant_(self.value_proj.bias.data, 0.)
+        constant_(self.value_proj.bias.data, 0.0)
        xavier_uniform_(self.output_proj.weight.data)
-        constant_(self.output_proj.bias.data, 0.)
-
-    def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
+        constant_(self.output_proj.bias.data, 0.0)
+
+    def forward(
+        self,
+        query,
+        reference_points,
+        input_flatten,
+        input_spatial_shapes,
+        input_level_start_index,
+        input_padding_mask=None,
+    ):
        """
        :param query                       (N, Length_{query}, C)
        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
@@ -100,21 +122,45 @@ class MSDeformAttn(nn.Module):
        if input_padding_mask is not None:
            value = value.masked_fill(input_padding_mask[..., None], float(0))
        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
-        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
-        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
-        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
+        sampling_offsets = self.sampling_offsets(query).view(
+            N, Len_q, self.n_heads, self.n_levels, self.n_points, 2
+        )
+        attention_weights = self.attention_weights(query).view(
+            N, Len_q, self.n_heads, self.n_levels * self.n_points
+        )
+        attention_weights = F.softmax(attention_weights, -1).view(
+            N, Len_q, self.n_heads, self.n_levels, self.n_points
+        )
        # N, Len_q, n_heads, n_levels, n_points, 2
        if reference_points.shape[-1] == 2:
-            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
-            sampling_locations = reference_points[:, :, None, :, None, :] \
+            offset_normalizer = torch.stack(
+                [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1
+            )
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :]
                + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+            )
        elif reference_points.shape[-1] == 4:
-            sampling_locations = reference_points[:, :, None, :, None, :2] \
-                                 + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :2]
+                + sampling_offsets
+                / self.n_points
+                * reference_points[:, :, None, :, None, 2:]
+                * 0.5
+            )
        else:
            raise ValueError(
-                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
+                "Last dim of reference_points must be 2 or 4, but get {} instead.".format(
+                    reference_points.shape[-1]
+                )
+            )
        output = MSDeformAttnFunction.apply(
-            value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
+            value,
+            input_spatial_shapes,
+            input_level_start_index,
+            sampling_locations,
+            attention_weights,
+            self.im2col_step,
+        )
        output = self.output_proj(output)
        return output
--- a/projects_oss/detr/detr/runner.py
+++ b/projects_oss/detr/detr/runner.py
@@ -4,9 +4,9 @@ from d2go.config import CfgNode as CN
 from d2go.data.dataset_mappers.build import D2GO_DATA_MAPPER_REGISTRY
 from d2go.data.dataset_mappers.d2go_dataset_mapper import D2GoDatasetMapper
 from d2go.runner import GeneralizedRCNNRunner
-from detr.d2 import DetrDatasetMapper, add_detr_config
 from detr.backbone.deit import add_deit_backbone_config
 from detr.backbone.pit import add_pit_backbone_config
+from detr.d2 import DetrDatasetMapper, add_detr_config


 @D2GO_DATA_MAPPER_REGISTRY.register()

--- a/projects_oss/detr/detr/util/box_ops.py
+++ b/projects_oss/detr/detr/util/box_ops.py
@@ -10,15 +10,13 @@ from torchvision.ops.boxes import box_area

 def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(-1)
-    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
-         (x_c + 0.5 * w), (y_c + 0.5 * h)]
+    b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=-1)


 def box_xyxy_to_cxcywh(x):
    x0, y0, x1, y1 = x.unbind(-1)
-    b = [(x0 + x1) / 2, (y0 + y1) / 2,
-         (x1 - x0), (y1 - y0)]
+    b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)]
    return torch.stack(b, dim=-1)


@@ -79,11 +77,11 @@ def masks_to_boxes(masks):
    x = torch.arange(0, w, dtype=torch.float)
    y, x = torch.meshgrid(y, x)

-    x_mask = (masks * x.unsqueeze(0))
+    x_mask = masks * x.unsqueeze(0)
    x_max = x_mask.flatten(1).max(-1)[0]
    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]

-    y_mask = (masks * y.unsqueeze(0))
+    y_mask = masks * y.unsqueeze(0)
    y_max = y_mask.flatten(1).max(-1)[0]
    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]


--- a/projects_oss/detr/detr/util/misc.py
+++ b/projects_oss/detr/detr/util/misc.py
@@ -6,21 +6,22 @@ Misc functions, including distributed helpers.

 Mostly copy-paste from torchvision references.
 """
+import datetime
 import os
+import pickle
 import subprocess
 import time
 from collections import defaultdict, deque
-import datetime
-import pickle
+from distutils.version import LooseVersion
 from typing import Optional, List

 import torch
 import torch.distributed as dist
-from torch import Tensor

 # needed due to empty tensor bug in pytorch and torchvision 0.5
 import torchvision
-from distutils.version import LooseVersion
+from torch import Tensor
+
 if LooseVersion(torchvision.__version__) < LooseVersion("0.7.0"):
    from torchvision.ops import _new_empty_tensor
    from torchvision.ops.misc import _output_size
@@ -50,7 +51,7 @@ class SmoothedValue(object):
        """
        if not is_dist_avail_and_initialized():
            return
-        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
+        t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
        dist.barrier()
        dist.all_reduce(t)
        t = t.tolist()
@@ -85,7 +86,8 @@ class SmoothedValue(object):
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
-            value=self.value)
+            value=self.value,
+        )


 def all_gather(data):
@@ -119,14 +121,16 @@ def all_gather(data):
    for _ in size_list:
        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
    if local_size != max_size:
-        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
+        padding = torch.empty(
+            size=(max_size - local_size,), dtype=torch.uint8, device="cuda"
+        )
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
-        data_list.append(pickle.loads(buffer)) #noqa
+        data_list.append(pickle.loads(buffer))  # noqa

    return data_list

@@ -175,15 +179,14 @@ class MetricLogger(object):
            return self.meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
-        raise AttributeError("'{}' object has no attribute '{}'".format(
-            type(self).__name__, attr))
+        raise AttributeError(
+            "'{}' object has no attribute '{}'".format(type(self).__name__, attr)
+        )

    def __str__(self):
        loss_str = []
        for name, meter in self.meters.items():
-            loss_str.append(
-                "{}: {}".format(name, str(meter))
-            )
+            loss_str.append("{}: {}".format(name, str(meter)))
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
@@ -196,31 +199,35 @@ class MetricLogger(object):
    def log_every(self, iterable, print_freq, header=None):
        i = 0
        if not header:
-            header = ''
+            header = ""
        start_time = time.time()
        end = time.time()
-        iter_time = SmoothedValue(fmt='{avg:.4f}')
-        data_time = SmoothedValue(fmt='{avg:.4f}')
-        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
+        iter_time = SmoothedValue(fmt="{avg:.4f}")
+        data_time = SmoothedValue(fmt="{avg:.4f}")
+        space_fmt = ":" + str(len(str(len(iterable)))) + "d"
        if torch.cuda.is_available():
-            log_msg = self.delimiter.join([
+            log_msg = self.delimiter.join(
+                [
                    header,
-                '[{0' + space_fmt + '}/{1}]',
-                'eta: {eta}',
-                '{meters}',
-                'time: {time}',
-                'data: {data}',
-                'max mem: {memory:.0f}'
-            ])
+                    "[{0" + space_fmt + "}/{1}]",
+                    "eta: {eta}",
+                    "{meters}",
+                    "time: {time}",
+                    "data: {data}",
+                    "max mem: {memory:.0f}",
+                ]
+            )
        else:
-            log_msg = self.delimiter.join([
+            log_msg = self.delimiter.join(
+                [
                    header,
-                '[{0' + space_fmt + '}/{1}]',
-                'eta: {eta}',
-                '{meters}',
-                'time: {time}',
-                'data: {data}'
-            ])
+                    "[{0" + space_fmt + "}/{1}]",
+                    "eta: {eta}",
+                    "{meters}",
+                    "time: {time}",
+                    "data: {data}",
+                ]
+            )
        MB = 1024.0 * 1024.0
        for obj in iterable:
            data_time.update(time.time() - end)
@@ -230,38 +237,54 @@ class MetricLogger(object):
                eta_seconds = iter_time.global_avg * (len(iterable) - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                if torch.cuda.is_available():
-                    print(log_msg.format(
-                        i, len(iterable), eta=eta_string,
+                    print(
+                        log_msg.format(
+                            i,
+                            len(iterable),
+                            eta=eta_string,
                            meters=str(self),
-                        time=str(iter_time), data=str(data_time),
-                        memory=torch.cuda.max_memory_allocated() / MB))
+                            time=str(iter_time),
+                            data=str(data_time),
+                            memory=torch.cuda.max_memory_allocated() / MB,
+                        )
+                    )
                else:
-                    print(log_msg.format(
-                        i, len(iterable), eta=eta_string,
+                    print(
+                        log_msg.format(
+                            i,
+                            len(iterable),
+                            eta=eta_string,
                            meters=str(self),
-                        time=str(iter_time), data=str(data_time)))
+                            time=str(iter_time),
+                            data=str(data_time),
+                        )
+                    )
            i += 1
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
-        print('{} Total time: {} ({:.4f} s / it)'.format(
-            header, total_time_str, total_time / len(iterable)))
+        print(
+            "{} Total time: {} ({:.4f} s / it)".format(
+                header, total_time_str, total_time / len(iterable)
+            )
+        )


 def get_sha():
    cwd = os.path.dirname(os.path.abspath(__file__))

    def _run(command):
-        return subprocess.check_output(command, cwd=cwd).decode('ascii').strip()
-    sha = 'N/A'
+        return subprocess.check_output(command, cwd=cwd).decode("ascii").strip()
+
+    sha = "N/A"
    diff = "clean"
-    branch = 'N/A'
+    branch = "N/A"
    try:
-        sha = _run(['git', 'rev-parse', 'HEAD'])
-        subprocess.check_output(['git', 'diff'], cwd=cwd)
-        diff = _run(['git', 'diff-index', 'HEAD'])
+        sha = _run(["git", "rev-parse", "HEAD"])
+        subprocess.check_output(["git", "diff"], cwd=cwd)
+        diff = _run(["git", "diff-index", "HEAD"])
        diff = "has uncommited changes" if diff else "clean"
-        branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])
+        branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"])
    except Exception:
        pass
    message = f"sha: {sha}, status: {diff}, branch: {branch}"
@@ -325,9 +348,9 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
        mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
        for img, pad_img, m in zip(tensor_list, tensor, mask):
            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
-            m[: img.shape[1], :img.shape[2]] = False
+            m[: img.shape[1], : img.shape[2]] = False
    else:
-        raise ValueError('not supported')
+        raise ValueError("not supported")
    return NestedTensor(tensor, mask)


@@ -337,7 +360,9 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
 def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
    max_size = []
    for i in range(tensor_list[0].dim()):
-        max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64)
+        max_size_i = torch.max(
+            torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
+        ).to(torch.int64)
        max_size.append(max_size_i)
    max_size = tuple(max_size)

@@ -349,11 +374,15 @@ def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTen
    padded_masks = []
    for img in tensor_list:
        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
-        padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
+        padded_img = torch.nn.functional.pad(
+            img, (0, padding[2], 0, padding[1], 0, padding[0])
+        )
        padded_imgs.append(padded_img)

        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
-        padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
+        padded_mask = torch.nn.functional.pad(
+            m, (0, padding[2], 0, padding[1]), "constant", 1
+        )
        padded_masks.append(padded_mask.to(torch.bool))

    tensor = torch.stack(padded_imgs)
@@ -367,10 +396,11 @@ def setup_for_distributed(is_master):
    This function disables printing when not in master process
    """
    import builtins as __builtin__
+
    builtin_print = __builtin__.print

    def print(*args, **kwargs):
-        force = kwargs.pop('force', False)
+        force = kwargs.pop("force", False)
        if is_master or force:
            builtin_print(*args, **kwargs)

@@ -407,26 +437,31 @@ def save_on_master(*args, **kwargs):


 def init_distributed_mode(args):
-    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        args.rank = int(os.environ["RANK"])
-        args.world_size = int(os.environ['WORLD_SIZE'])
-        args.gpu = int(os.environ['LOCAL_RANK'])
-    elif 'SLURM_PROCID' in os.environ:
-        args.rank = int(os.environ['SLURM_PROCID'])
+        args.world_size = int(os.environ["WORLD_SIZE"])
+        args.gpu = int(os.environ["LOCAL_RANK"])
+    elif "SLURM_PROCID" in os.environ:
+        args.rank = int(os.environ["SLURM_PROCID"])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
-        print('Not using distributed mode')
+        print("Not using distributed mode")
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
-    args.dist_backend = 'nccl'
-    print('| distributed init (rank {}): {}'.format(
-        args.rank, args.dist_url), flush=True)
-    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
-                                         world_size=args.world_size, rank=args.rank)
+    args.dist_backend = "nccl"
+    print(
+        "| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True
+    )
+    torch.distributed.init_process_group(
+        backend=args.dist_backend,
+        init_method=args.dist_url,
+        world_size=args.world_size,
+        rank=args.rank,
+    )
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)

@@ -450,14 +485,16 @@ def accuracy(output, target, topk=(1,)):
    return res


-def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
+def interpolate(
+    input, size=None, scale_factor=None, mode="nearest", align_corners=None
+):
    # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
    """
    Equivalent to nn.functional.interpolate, but with support for empty batch sizes.
    This will eventually be supported natively by PyTorch, and this
    class can go away.
    """
-    #if float(torchvision.__version__[:3]) < 0.7:
+    # if float(torchvision.__version__[:3]) < 0.7:
    if LooseVersion(torchvision.__version__) < LooseVersion("0.7.0"):
        if input.numel() > 0:
            return torch.nn.functional.interpolate(
@@ -468,10 +505,13 @@ def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corne
        output_shape = list(input.shape[:-2]) + list(output_shape)
        return _new_empty_tensor(input, output_shape)
    else:
-        return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners)
+        return torchvision.ops.misc.interpolate(
+            input, size, scale_factor, mode, align_corners
+        )
+

 def inverse_sigmoid(x, eps=1e-5):
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
-    return torch.log(x1/x2)
+    return torch.log(x1 / x2)
--- a/projects_oss/detr/detr/util/plot_utils.py
+++ b/projects_oss/detr/detr/util/plot_utils.py
--- a/projects_oss/detr/main.py
+++ b/projects_oss/detr/main.py
--- a/projects_oss/detr/setup.py
+++ b/projects_oss/detr/setup.py
@@ -6,20 +6,19 @@
 # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
 # ------------------------------------------------------------------------------------------------

-import os
 import glob
+import os

 import torch
-
-from torch.utils.cpp_extension import CUDA_HOME
-from torch.utils.cpp_extension import CppExtension
-from torch.utils.cpp_extension import CUDAExtension
-
 from setuptools import find_packages
 from setuptools import setup
+from torch.utils.cpp_extension import CUDAExtension
+from torch.utils.cpp_extension import CUDA_HOME
+from torch.utils.cpp_extension import CppExtension

 requirements = ["torch", "torchvision"]

+
 def get_extensions():
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "detr/src")
@@ -49,7 +48,7 @@ def get_extensions():
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]
    else:
-        raise NotImplementedError('Cuda is not availabel')
+        raise NotImplementedError("Cuda is not availabel")

    sources = [os.path.join(extensions_dir, s) for s in sources]
    include_dirs = [extensions_dir]
@@ -64,13 +63,14 @@ def get_extensions():
    ]
    return ext_modules

-if __name__ == '__main__':
+
+if __name__ == "__main__":
    setup(
        name="detr",
        url="https://github.com/facebookresearch/d2go/detr",
-        license='Apache-2.0',
+        license="Apache-2.0",
        packages=find_packages(exclude=["test_all.py"]),
-        package_data={ 'detr': ['LICENSE']},
+        package_data={"detr": ["LICENSE"]},
        ext_modules=get_extensions(),
        cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
    )
--- a/projects_oss/detr/test_all.py
+++ b/projects_oss/detr/test_all.py
--- a/projects_oss/detr/test_deit_backbone.py
+++ b/projects_oss/detr/test_deit_backbone.py
--- a/projects_oss/detr/test_detr_runner.py
+++ b/projects_oss/detr/test_detr_runner.py
@@ -13,6 +13,7 @@ from d2go.utils.testing.data_loader_helper import create_local_dataset
 # RUN:
 # buck test mobile-vision/d2go/projects_oss/detr:test_detr_runner

+
 def _get_cfg(runner, output_dir, dataset_name):
    cfg = runner.get_default_cfg()
    cfg.MODEL.DEVICE = "cpu"

--- a/projects_oss/detr/test_op.py
+++ b/projects_oss/detr/test_op.py
--- a/setup.py
+++ b/setup.py
--- a/tests/data/test_d2go_datasets.py
+++ b/tests/data/test_d2go_datasets.py
--- a/tests/data/test_data_transforms_box_utils.py
+++ b/tests/data/test_data_transforms_box_utils.py
@@ -126,7 +126,5 @@ class TestDataTransformsBoxUtils(unittest.TestCase):

        boxes = np.array([[91, 46, 144, 111]])
        transformed_bboxs = enlarge_box_tfm[0].apply_coords(boxes)
-        err_msg = "transformed_bbox = {}, expected {}".format(
-            transformed_bboxs, boxes
-        )
+        err_msg = "transformed_bbox = {}, expected {}".format(transformed_bboxs, boxes)
        self.assertTrue(np.allclose(transformed_bboxs, boxes), err_msg)
--- a/tests/evaluation/test_prediction_count_evaluation.py
+++ b/tests/evaluation/test_prediction_count_evaluation.py
--- a/tests/misc/test_config.py
+++ b/tests/misc/test_config.py