Commit e0a11e60 authored by luopl

init
# ========================================
# Modified by Shoufa Chen
# ========================================
# Modified by Peize Sun, Rufeng Zhang
# Contact: {sunpeize, cxrfzhang}@foxmail.com
#
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
import logging
import numpy as np
import torch
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
__all__ = ["DiffusionDetDatasetMapper"]
def build_transform_gen(cfg, is_train):
"""
Create a list of :class:`TransformGen` from config.
Returns:
list[TransformGen]
"""
if is_train:
min_size = cfg.INPUT.MIN_SIZE_TRAIN
max_size = cfg.INPUT.MAX_SIZE_TRAIN
sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
else:
min_size = cfg.INPUT.MIN_SIZE_TEST
max_size = cfg.INPUT.MAX_SIZE_TEST
sample_style = "choice"
if sample_style == "range":
assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format(len(min_size))
logger = logging.getLogger(__name__)
tfm_gens = []
if is_train:
tfm_gens.append(T.RandomFlip())
# ResizeShortestEdge
tfm_gens.append(T.ResizeShortestEdge(min_size, max_size, sample_style))
if is_train:
logger.info("TransformGens used in training: " + str(tfm_gens))
return tfm_gens
class DiffusionDetDatasetMapper:
"""
A callable which takes a dataset dict in Detectron2 Dataset format,
and maps it into the format used by DiffusionDet.
The callable currently does the following:
1. Reads the image from "file_name"
2. Applies geometric transforms to the image and annotations
3. Finds and applies suitable cropping to the image and annotations
4. Prepares the image and annotations as Tensors
"""
def __init__(self, cfg, is_train=True):
if cfg.INPUT.CROP.ENABLED and is_train:
self.crop_gen = [
T.ResizeShortestEdge([400, 500, 600], sample_style="choice"),
T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE),
]
else:
self.crop_gen = None
self.tfm_gens = build_transform_gen(cfg, is_train)
logging.getLogger(__name__).info(
"Full TransformGens used in training: {}, crop: {}".format(str(self.tfm_gens), str(self.crop_gen))
)
self.img_format = cfg.INPUT.FORMAT
self.is_train = is_train
def __call__(self, dataset_dict):
"""
Args:
dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
Returns:
dict: a format that builtin models in detectron2 accept
"""
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
utils.check_image_size(dataset_dict, image)
if self.crop_gen is None:
image, transforms = T.apply_transform_gens(self.tfm_gens, image)
else:
if np.random.rand() > 0.5:
image, transforms = T.apply_transform_gens(self.tfm_gens, image)
else:
image, transforms = T.apply_transform_gens(
self.tfm_gens[:-1] + self.crop_gen + self.tfm_gens[-1:], image
)
image_shape = image.shape[:2] # h, w
# Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
# but not efficient on large generic data structures due to the use of pickle & mp.Queue.
# Therefore it's important to use torch.Tensor.
dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
if not self.is_train:
# USER: Modify this if you want to keep them for some reason.
dataset_dict.pop("annotations", None)
return dataset_dict
if "annotations" in dataset_dict:
# USER: Modify this if you want to keep them for some reason.
for anno in dataset_dict["annotations"]:
anno.pop("segmentation", None)
anno.pop("keypoints", None)
# USER: Implement additional transformations if you have other types of data
annos = [
utils.transform_instance_annotations(obj, transforms, image_shape)
for obj in dataset_dict.pop("annotations")
if obj.get("iscrowd", 0) == 0
]
instances = utils.annotations_to_instances(annos, image_shape)
dataset_dict["instances"] = utils.filter_empty_instances(instances)
return dataset_dict
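# Illustrative usage sketch (an assumption, not part of the original file): the mapper is
# typically handed to detectron2's train loader, e.g. from a Trainer.build_train_loader
# override, with `cfg` assumed to carry the INPUT.* keys read above.
#     from detectron2.data import build_detection_train_loader
#     loader = build_detection_train_loader(cfg, mapper=DiffusionDetDatasetMapper(cfg, is_train=True))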
# ========================================
# Modified by Shoufa Chen
# ========================================
# Modified by Peize Sun, Rufeng Zhang
# Contact: {sunpeize, cxrfzhang}@foxmail.com
#
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
DiffusionDet Transformer class.
Copy-paste from torch.nn.Transformer with modifications:
* positional encodings are passed in MHattention
* extra LN at the end of encoder is removed
* decoder returns a stack of activations from all decoding layers
"""
import copy
import math
import numpy as np
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from detectron2.modeling.poolers import ROIPooler
from detectron2.structures import Boxes
_DEFAULT_SCALE_CLAMP = math.log(100000.0 / 16)
class SinusoidalPositionEmbeddings(nn.Module):
def __init__(self, dim):
super().__init__()
self.dim = dim
def forward(self, time):
device = time.device
half_dim = self.dim // 2
embeddings = math.log(10000) / (half_dim - 1)
embeddings = torch.exp(torch.arange(half_dim, device=device) * -embeddings)
embeddings = time[:, None] * embeddings[None, :]
embeddings = torch.cat((embeddings.sin(), embeddings.cos()), dim=-1)
return embeddings
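# Shape sketch (illustrative): for timesteps `time` of shape (B,), the output is (B, dim);
# the first half is sin(t * f_k) and the second half cos(t * f_k), with frequencies
# f_k = exp(-k * log(10000) / (dim // 2 - 1)) for k = 0 .. dim // 2 - 1, e.g.
#     SinusoidalPositionEmbeddings(256)(torch.arange(4))  # -> torch.Size([4, 256])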
class GaussianFourierProjection(nn.Module):
"""Gaussian random features for encoding time steps."""
def __init__(self, embed_dim, scale=30.):
super().__init__()
# Randomly sample weights during initialization. These weights are fixed
# during optimization and are not trainable.
self.W = nn.Parameter(torch.randn(embed_dim // 2) * scale, requires_grad=False)
def forward(self, x):
x_proj = x[:, None] * self.W[None, :] * 2 * np.pi
return torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
class Dense(nn.Module):
"""A fully connected layer that reshapes outputs to feature maps."""
def __init__(self, input_dim, output_dim):
super().__init__()
self.dense = nn.Linear(input_dim, output_dim)
def forward(self, x):
return self.dense(x)
class DynamicHead(nn.Module):
def __init__(self, cfg, roi_input_shape):
super().__init__()
# Build RoI.
box_pooler = self._init_box_pooler(cfg, roi_input_shape)
self.box_pooler = box_pooler
# Build heads.
num_classes = cfg.MODEL.DiffusionDet.NUM_CLASSES
d_model = cfg.MODEL.DiffusionDet.HIDDEN_DIM
dim_feedforward = cfg.MODEL.DiffusionDet.DIM_FEEDFORWARD
nhead = cfg.MODEL.DiffusionDet.NHEADS
dropout = cfg.MODEL.DiffusionDet.DROPOUT
activation = cfg.MODEL.DiffusionDet.ACTIVATION
num_heads = cfg.MODEL.DiffusionDet.NUM_HEADS
rcnn_head = RCNNHead(cfg, d_model, num_classes, dim_feedforward, nhead, dropout, activation)
self.head_series = _get_clones(rcnn_head, num_heads)
self.num_heads = num_heads
self.return_intermediate = cfg.MODEL.DiffusionDet.DEEP_SUPERVISION
# Gaussian random feature embedding layer for time
self.d_model = d_model
time_dim = d_model * 4
self.time_mlp = nn.Sequential(
SinusoidalPositionEmbeddings(d_model),
nn.Linear(d_model, time_dim),
nn.GELU(),
nn.Linear(time_dim, time_dim),
)
# Init parameters.
self.use_focal = cfg.MODEL.DiffusionDet.USE_FOCAL
self.use_fed_loss = cfg.MODEL.DiffusionDet.USE_FED_LOSS
self.num_classes = num_classes
if self.use_focal or self.use_fed_loss:
prior_prob = cfg.MODEL.DiffusionDet.PRIOR_PROB
self.bias_value = -math.log((1 - prior_prob) / prior_prob)
self._reset_parameters()
def _reset_parameters(self):
# init all parameters.
for p in self.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
# initialize the bias for focal loss and fed loss.
if self.use_focal or self.use_fed_loss:
if p.shape[-1] == self.num_classes or p.shape[-1] == self.num_classes + 1:
nn.init.constant_(p, self.bias_value)
@staticmethod
def _init_box_pooler(cfg, input_shape):
in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES
pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features)
sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
# If StandardROIHeads is applied on multiple feature maps (as in FPN),
# then we share the same predictors and therefore the channel counts must be the same
in_channels = [input_shape[f].channels for f in in_features]
# Check all channel counts are equal
assert len(set(in_channels)) == 1, in_channels
box_pooler = ROIPooler(
output_size=pooler_resolution,
scales=pooler_scales,
sampling_ratio=sampling_ratio,
pooler_type=pooler_type,
)
return box_pooler
def forward(self, features, init_bboxes, t, init_features):
# t is expected to have shape (batch_size,)
time = self.time_mlp(t)
inter_class_logits = []
inter_pred_bboxes = []
bs = len(features[0])
bboxes = init_bboxes
num_boxes = bboxes.shape[1]
if init_features is not None:
init_features = init_features[None].repeat(1, bs, 1)
proposal_features = init_features.clone()
else:
proposal_features = None
for head_idx, rcnn_head in enumerate(self.head_series):
class_logits, pred_bboxes, proposal_features = rcnn_head(features, bboxes, proposal_features, self.box_pooler, time)
if self.return_intermediate:
inter_class_logits.append(class_logits)
inter_pred_bboxes.append(pred_bboxes)
bboxes = pred_bboxes.detach()
if self.return_intermediate:
return torch.stack(inter_class_logits), torch.stack(inter_pred_bboxes)
return class_logits[None], pred_bboxes[None]
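# Call sketch (shapes are assumptions inferred from the code above):
#     features:      list of FPN feature maps, each (B, C, H_l, W_l)
#     init_bboxes:   (B, num_proposals, 4) boxes in absolute xyxy image coordinates
#     t:             (B,) diffusion timesteps
#     init_features: None, or (num_proposals, d_model) shared proposal features
# With DEEP_SUPERVISION enabled, the outputs are stacked over the cascade stages:
#     class_logits: (num_heads, B, num_proposals, num_classes)  (num_classes + 1 without focal/fed loss)
#     pred_bboxes:  (num_heads, B, num_proposals, 4)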
class RCNNHead(nn.Module):
def __init__(self, cfg, d_model, num_classes, dim_feedforward=2048, nhead=8, dropout=0.1, activation="relu",
scale_clamp: float = _DEFAULT_SCALE_CLAMP, bbox_weights=(2.0, 2.0, 1.0, 1.0)):
super().__init__()
self.d_model = d_model
# dynamic.
self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
self.inst_interact = DynamicConv(cfg)
self.linear1 = nn.Linear(d_model, dim_feedforward)
self.dropout = nn.Dropout(dropout)
self.linear2 = nn.Linear(dim_feedforward, d_model)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.dropout3 = nn.Dropout(dropout)
self.activation = _get_activation_fn(activation)
# block time mlp
self.block_time_mlp = nn.Sequential(nn.SiLU(), nn.Linear(d_model * 4, d_model * 2))
# cls.
num_cls = cfg.MODEL.DiffusionDet.NUM_CLS
cls_module = list()
for _ in range(num_cls):
cls_module.append(nn.Linear(d_model, d_model, False))
cls_module.append(nn.LayerNorm(d_model))
cls_module.append(nn.ReLU(inplace=True))
self.cls_module = nn.ModuleList(cls_module)
# reg.
num_reg = cfg.MODEL.DiffusionDet.NUM_REG
reg_module = list()
for _ in range(num_reg):
reg_module.append(nn.Linear(d_model, d_model, False))
reg_module.append(nn.LayerNorm(d_model))
reg_module.append(nn.ReLU(inplace=True))
self.reg_module = nn.ModuleList(reg_module)
# pred.
self.use_focal = cfg.MODEL.DiffusionDet.USE_FOCAL
self.use_fed_loss = cfg.MODEL.DiffusionDet.USE_FED_LOSS
if self.use_focal or self.use_fed_loss:
self.class_logits = nn.Linear(d_model, num_classes)
else:
self.class_logits = nn.Linear(d_model, num_classes + 1)
self.bboxes_delta = nn.Linear(d_model, 4)
self.scale_clamp = scale_clamp
self.bbox_weights = bbox_weights
def forward(self, features, bboxes, pro_features, pooler, time_emb):
"""
:param bboxes: (N, nr_boxes, 4)
:param pro_features: (N, nr_boxes, d_model)
"""
N, nr_boxes = bboxes.shape[:2]
# roi_feature.
proposal_boxes = list()
for b in range(N):
proposal_boxes.append(Boxes(bboxes[b]))
roi_features = pooler(features, proposal_boxes)
if pro_features is None:
pro_features = roi_features.view(N, nr_boxes, self.d_model, -1).mean(-1)
roi_features = roi_features.view(N * nr_boxes, self.d_model, -1).permute(2, 0, 1)
# self_att.
pro_features = pro_features.view(N, nr_boxes, self.d_model).permute(1, 0, 2)
pro_features2 = self.self_attn(pro_features, pro_features, value=pro_features)[0]
pro_features = pro_features + self.dropout1(pro_features2)
pro_features = self.norm1(pro_features)
# inst_interact.
pro_features = pro_features.view(nr_boxes, N, self.d_model).permute(1, 0, 2).reshape(1, N * nr_boxes, self.d_model)
pro_features2 = self.inst_interact(pro_features, roi_features)
pro_features = pro_features + self.dropout2(pro_features2)
obj_features = self.norm2(pro_features)
# obj_feature.
obj_features2 = self.linear2(self.dropout(self.activation(self.linear1(obj_features))))
obj_features = obj_features + self.dropout3(obj_features2)
obj_features = self.norm3(obj_features)
fc_feature = obj_features.transpose(0, 1).reshape(N * nr_boxes, -1)
scale_shift = self.block_time_mlp(time_emb)
scale_shift = torch.repeat_interleave(scale_shift, nr_boxes, dim=0)
scale, shift = scale_shift.chunk(2, dim=1)
fc_feature = fc_feature * (scale + 1) + shift
cls_feature = fc_feature.clone()
reg_feature = fc_feature.clone()
for cls_layer in self.cls_module:
cls_feature = cls_layer(cls_feature)
for reg_layer in self.reg_module:
reg_feature = reg_layer(reg_feature)
class_logits = self.class_logits(cls_feature)
bboxes_deltas = self.bboxes_delta(reg_feature)
pred_bboxes = self.apply_deltas(bboxes_deltas, bboxes.view(-1, 4))
return class_logits.view(N, nr_boxes, -1), pred_bboxes.view(N, nr_boxes, -1), obj_features
def apply_deltas(self, deltas, boxes):
"""
Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`.
Args:
deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
deltas[i] represents k potentially different class-specific
box transformations for the single box boxes[i].
boxes (Tensor): boxes to transform, of shape (N, 4)
"""
boxes = boxes.to(deltas.dtype)
widths = boxes[:, 2] - boxes[:, 0]
heights = boxes[:, 3] - boxes[:, 1]
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
wx, wy, ww, wh = self.bbox_weights
dx = deltas[:, 0::4] / wx
dy = deltas[:, 1::4] / wy
dw = deltas[:, 2::4] / ww
dh = deltas[:, 3::4] / wh
# Prevent sending too large values into torch.exp()
dw = torch.clamp(dw, max=self.scale_clamp)
dh = torch.clamp(dh, max=self.scale_clamp)
pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
pred_w = torch.exp(dw) * widths[:, None]
pred_h = torch.exp(dh) * heights[:, None]
pred_boxes = torch.zeros_like(deltas)
pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1
pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1
pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w # x2
pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h # y2
return pred_boxes
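# Worked example (illustrative): with bbox_weights (2.0, 2.0, 1.0, 1.0), a box
# [0, 0, 100, 100] and deltas [0.2, 0, 0, 0] give dx = 0.2 / 2 = 0.1, shifting the
# centre by 0.1 * width = 10 px to [10, 0, 110, 100]; dw/dh pass through exp(),
# so zero deltas leave the width and height unchanged.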
class DynamicConv(nn.Module):
def __init__(self, cfg):
super().__init__()
self.hidden_dim = cfg.MODEL.DiffusionDet.HIDDEN_DIM
self.dim_dynamic = cfg.MODEL.DiffusionDet.DIM_DYNAMIC
self.num_dynamic = cfg.MODEL.DiffusionDet.NUM_DYNAMIC
self.num_params = self.hidden_dim * self.dim_dynamic
self.dynamic_layer = nn.Linear(self.hidden_dim, self.num_dynamic * self.num_params)
self.norm1 = nn.LayerNorm(self.dim_dynamic)
self.norm2 = nn.LayerNorm(self.hidden_dim)
self.activation = nn.ReLU(inplace=True)
pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
num_output = self.hidden_dim * pooler_resolution ** 2
self.out_layer = nn.Linear(num_output, self.hidden_dim)
self.norm3 = nn.LayerNorm(self.hidden_dim)
def forward(self, pro_features, roi_features):
'''
pro_features: (1, N * nr_boxes, self.d_model)
roi_features: (49, N * nr_boxes, self.d_model)
'''
features = roi_features.permute(1, 0, 2)
parameters = self.dynamic_layer(pro_features).permute(1, 0, 2)
param1 = parameters[:, :, :self.num_params].view(-1, self.hidden_dim, self.dim_dynamic)
param2 = parameters[:, :, self.num_params:].view(-1, self.dim_dynamic, self.hidden_dim)
features = torch.bmm(features, param1)
features = self.norm1(features)
features = self.activation(features)
features = torch.bmm(features, param2)
features = self.norm2(features)
features = self.activation(features)
features = features.flatten(1)
features = self.out_layer(features)
features = self.norm3(features)
features = self.activation(features)
return features
def _get_clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
def _get_activation_fn(activation):
"""Return an activation function given a string"""
if activation == "relu":
return F.relu
if activation == "gelu":
return F.gelu
if activation == "glu":
return F.glu
raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")
# Copyright (c) Facebook, Inc. and its affiliates.
import atexit
import bisect
import multiprocessing as mp
from collections import deque
import cv2
import torch
from detectron2.data import MetadataCatalog
from detectron2.engine.defaults import DefaultPredictor
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode, Visualizer
class VisualizationDemo(object):
def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
"""
Args:
cfg (CfgNode):
instance_mode (ColorMode):
parallel (bool): whether to run the model in different processes from visualization.
Useful since the visualization logic can be slow.
"""
self.metadata = MetadataCatalog.get(
cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
)
self.cpu_device = torch.device("cpu")
self.instance_mode = instance_mode
self.parallel = parallel
if parallel:
num_gpu = torch.cuda.device_count()
self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
else:
self.predictor = DefaultPredictor(cfg)
self.threshold = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST # workaround
def run_on_image(self, image):
"""
Args:
image (np.ndarray): an image of shape (H, W, C) (in BGR order).
This is the format used by OpenCV.
Returns:
predictions (dict): the output of the model.
vis_output (VisImage): the visualized image output.
"""
vis_output = None
predictions = self.predictor(image)
# Filter
instances = predictions['instances']
new_instances = instances[instances.scores > self.threshold]
predictions = {'instances': new_instances}
# Convert image from OpenCV BGR format to Matplotlib RGB format.
image = image[:, :, ::-1]
visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
if "panoptic_seg" in predictions:
panoptic_seg, segments_info = predictions["panoptic_seg"]
vis_output = visualizer.draw_panoptic_seg_predictions(
panoptic_seg.to(self.cpu_device), segments_info
)
else:
if "sem_seg" in predictions:
vis_output = visualizer.draw_sem_seg(
predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
)
if "instances" in predictions:
instances = predictions["instances"].to(self.cpu_device)
vis_output = visualizer.draw_instance_predictions(predictions=instances)
return predictions, vis_output
def _frame_from_video(self, video):
while video.isOpened():
success, frame = video.read()
if success:
yield frame
else:
break
def run_on_video(self, video):
"""
Visualizes predictions on frames of the input video.
Args:
video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
either a webcam or a video file.
Yields:
ndarray: BGR visualizations of each video frame.
"""
video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
def process_predictions(frame, predictions):
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
if "panoptic_seg" in predictions:
panoptic_seg, segments_info = predictions["panoptic_seg"]
vis_frame = video_visualizer.draw_panoptic_seg_predictions(
frame, panoptic_seg.to(self.cpu_device), segments_info
)
elif "instances" in predictions:
predictions = predictions["instances"].to(self.cpu_device)
vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
elif "sem_seg" in predictions:
vis_frame = video_visualizer.draw_sem_seg(
frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
)
# Converts Matplotlib RGB format to OpenCV BGR format
vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
return vis_frame
frame_gen = self._frame_from_video(video)
if self.parallel:
buffer_size = self.predictor.default_buffer_size
frame_data = deque()
for cnt, frame in enumerate(frame_gen):
frame_data.append(frame)
self.predictor.put(frame)
if cnt >= buffer_size:
frame = frame_data.popleft()
predictions = self.predictor.get()
yield process_predictions(frame, predictions)
while len(frame_data):
frame = frame_data.popleft()
predictions = self.predictor.get()
yield process_predictions(frame, predictions)
else:
for frame in frame_gen:
yield process_predictions(frame, self.predictor(frame))
class AsyncPredictor:
"""
A predictor that runs the model asynchronously, possibly on >1 GPUs.
Because rendering the visualization takes a considerable amount of time,
this helps improve throughput a little bit when rendering videos.
"""
class _StopToken:
pass
class _PredictWorker(mp.Process):
def __init__(self, cfg, task_queue, result_queue):
self.cfg = cfg
self.task_queue = task_queue
self.result_queue = result_queue
super().__init__()
def run(self):
predictor = DefaultPredictor(self.cfg)
while True:
task = self.task_queue.get()
if isinstance(task, AsyncPredictor._StopToken):
break
idx, data = task
result = predictor(data)
self.result_queue.put((idx, result))
def __init__(self, cfg, num_gpus: int = 1):
"""
Args:
cfg (CfgNode):
num_gpus (int): if 0, will run on CPU
"""
num_workers = max(num_gpus, 1)
self.task_queue = mp.Queue(maxsize=num_workers * 3)
self.result_queue = mp.Queue(maxsize=num_workers * 3)
self.procs = []
for gpuid in range(max(num_gpus, 1)):
cfg = cfg.clone()
cfg.defrost()
cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
self.procs.append(
AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
)
self.put_idx = 0
self.get_idx = 0
self.result_rank = []
self.result_data = []
for p in self.procs:
p.start()
atexit.register(self.shutdown)
def put(self, image):
self.put_idx += 1
self.task_queue.put((self.put_idx, image))
def get(self):
self.get_idx += 1 # the index needed for this request
if len(self.result_rank) and self.result_rank[0] == self.get_idx:
res = self.result_data[0]
del self.result_data[0], self.result_rank[0]
return res
while True:
# make sure the results are returned in the correct order
idx, res = self.result_queue.get()
if idx == self.get_idx:
return res
insert = bisect.bisect(self.result_rank, idx)
self.result_rank.insert(insert, idx)
self.result_data.insert(insert, res)
def __len__(self):
return self.put_idx - self.get_idx
def __call__(self, image):
self.put(image)
return self.get()
def shutdown(self):
for _ in self.procs:
self.task_queue.put(AsyncPredictor._StopToken())
@property
def default_buffer_size(self):
return len(self.procs) * 5
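# Illustrative usage sketch (an assumption, not part of the original file); the image
# paths are placeholders:
#     demo = VisualizationDemo(cfg)                                    # cfg as built for DefaultPredictor
#     predictions, vis = demo.run_on_image(cv2.imread("input.jpg"))    # BGR input, per run_on_image
#     cv2.imwrite("output.jpg", vis.get_image()[:, :, ::-1])           # VisImage is RGB; OpenCV wants BGR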
# ========================================
# Modified by Shoufa Chen
# ========================================
# Modified by Rufeng Zhang, Peize Sun
# Contact: {sunpeize, cxrfzhang}@foxmail.com
#
# Copyright (c) Megvii, Inc. and its affiliates. All Rights Reserved
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#
from itertools import count
import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel
from detectron2.modeling import GeneralizedRCNNWithTTA, DatasetMapperTTA
from detectron2.modeling.roi_heads.fast_rcnn import fast_rcnn_inference_single_image
from detectron2.structures import Instances, Boxes
class DiffusionDetWithTTA(GeneralizedRCNNWithTTA):
"""
A DiffusionDet with test-time augmentation enabled.
Its :meth:`__call__` method has the same interface as :meth:`DiffusionDet.forward`.
"""
def __init__(self, cfg, model, tta_mapper=None, batch_size=3):
"""
Args:
cfg (CfgNode):
model (DiffusionDet): a DiffusionDet to apply TTA on.
tta_mapper (callable): takes a dataset dict and returns a list of
augmented versions of the dataset dict. Defaults to
`DatasetMapperTTA(cfg)`.
batch_size (int): batch the augmented images into this batch size for inference.
"""
# fix the issue: cannot assign module before Module.__init__() call
nn.Module.__init__(self)
if isinstance(model, DistributedDataParallel):
model = model.module
self.cfg = cfg.clone()
self.model = model
if tta_mapper is None:
tta_mapper = DatasetMapperTTA(cfg)
self.tta_mapper = tta_mapper
self.batch_size = batch_size
# cvpods tta.
self.enable_cvpods_tta = cfg.TEST.AUG.CVPODS_TTA
self.enable_scale_filter = cfg.TEST.AUG.SCALE_FILTER
self.scale_ranges = cfg.TEST.AUG.SCALE_RANGES
self.max_detection = cfg.MODEL.DiffusionDet.NUM_PROPOSALS
def _batch_inference(self, batched_inputs, detected_instances=None):
"""
Execute inference on a list of inputs,
using batch size = self.batch_size, instead of the length of the list.
Inputs & outputs have the same format as :meth:`DiffusionDet.forward`
"""
if detected_instances is None:
detected_instances = [None] * len(batched_inputs)
factors = 2 if self.tta_mapper.flip else 1
if self.enable_scale_filter:
assert len(batched_inputs) == len(self.scale_ranges) * factors
outputs = []
inputs, instances = [], []
for idx, input, instance in zip(count(), batched_inputs, detected_instances):
inputs.append(input)
instances.append(instance)
if self.enable_cvpods_tta:
output = self.model.forward(inputs, do_postprocess=False)[0]
if self.enable_scale_filter:
pred_boxes = output.get("pred_boxes")
keep = self.filter_boxes(pred_boxes.tensor, *self.scale_ranges[idx // factors])
output = Instances(
image_size=output.image_size,
pred_boxes=Boxes(pred_boxes.tensor[keep]),
pred_classes=output.pred_classes[keep],
scores=output.scores[keep])
outputs.extend([output])
else:
if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1:
outputs.extend(
self.model.forward(
inputs,
do_postprocess=False,
)
)
inputs, instances = [], []
return outputs
@staticmethod
def filter_boxes(boxes, min_scale, max_scale):
"""
boxes: (N, 4) shape
"""
# assert boxes.mode == "xyxy"
w = boxes[:, 2] - boxes[:, 0]
h = boxes[:, 3] - boxes[:, 1]
keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale)
return keep
def _inference_one_image(self, input):
"""
Args:
input (dict): one dataset dict with "image" field being a CHW tensor
Returns:
dict: one output dict
"""
orig_shape = (input["height"], input["width"])
augmented_inputs, tfms = self._get_augmented_inputs(input)
# Detect boxes from all augmented versions
all_boxes, all_scores, all_classes = self._get_augmented_boxes(augmented_inputs, tfms)
# merge all detected boxes to obtain final predictions for boxes
if self.enable_cvpods_tta:
merged_instances = self._merge_detections_cvpods_tta(all_boxes, all_scores, all_classes, orig_shape)
else:
merged_instances = self._merge_detections(all_boxes, all_scores, all_classes, orig_shape)
return {"instances": merged_instances}
def _merge_detections(self, all_boxes, all_scores, all_classes, shape_hw):
# select from the union of all results
num_boxes = len(all_boxes)
num_classes = self.cfg.MODEL.DiffusionDet.NUM_CLASSES
# +1 because fast_rcnn_inference expects background scores as well
all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device)
for idx, cls, score in zip(count(), all_classes, all_scores):
all_scores_2d[idx, cls] = score
merged_instances, _ = fast_rcnn_inference_single_image(
all_boxes,
all_scores_2d,
shape_hw,
1e-8,
self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
self.cfg.TEST.DETECTIONS_PER_IMAGE,
)
return merged_instances
def _merge_detections_cvpods_tta(self, all_boxes, all_scores, all_classes, shape_hw):
all_scores = torch.tensor(all_scores).to(all_boxes.device)
all_classes = torch.tensor(all_classes).to(all_boxes.device)
all_boxes, all_scores, all_classes = self.merge_result_from_multi_scales(
all_boxes, all_scores, all_classes,
nms_type="soft_vote", vote_thresh=0.65,
max_detection=self.max_detection
)
all_boxes = Boxes(all_boxes)
all_boxes.clip(shape_hw)
result = Instances(shape_hw)
result.pred_boxes = all_boxes
result.scores = all_scores
result.pred_classes = all_classes.long()
return result
def merge_result_from_multi_scales(
self, boxes, scores, labels, nms_type="soft_vote", vote_thresh=0.65, max_detection=100
):
boxes, scores, labels = self.batched_vote_nms(
boxes, scores, labels, nms_type, vote_thresh
)
number_of_detections = boxes.shape[0]
# Limit to max_per_image detections **over all classes**
if number_of_detections > max_detection > 0:
boxes = boxes[:max_detection]
scores = scores[:max_detection]
labels = labels[:max_detection]
return boxes, scores, labels
def batched_vote_nms(self, boxes, scores, labels, vote_type, vote_thresh=0.65):
# apply per class level nms, add max_coordinates on boxes first, then remove it.
labels = labels.float()
max_coordinates = boxes.max() + 1
offsets = labels.reshape(-1, 1) * max_coordinates
boxes = boxes + offsets
boxes, scores, labels = self.bbox_vote(boxes, scores, labels, vote_thresh, vote_type)
boxes -= labels.reshape(-1, 1) * max_coordinates
return boxes, scores, labels
def bbox_vote(self, boxes, scores, labels, vote_thresh, vote_type="soft_vote"):
assert boxes.shape[0] == scores.shape[0] == labels.shape[0]
det = torch.cat((boxes, scores.reshape(-1, 1), labels.reshape(-1, 1)), dim=1)
vote_results = torch.zeros(0, 6, device=det.device)
if det.numel() == 0:
return vote_results[:, :4], vote_results[:, 4], vote_results[:, 5]
order = scores.argsort(descending=True)
det = det[order]
while det.shape[0] > 0:
# IOU
area = (det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1])
xx1 = torch.max(det[0, 0], det[:, 0])
yy1 = torch.max(det[0, 1], det[:, 1])
xx2 = torch.min(det[0, 2], det[:, 2])
yy2 = torch.min(det[0, 3], det[:, 3])
w = torch.clamp(xx2 - xx1, min=0.)
h = torch.clamp(yy2 - yy1, min=0.)
inter = w * h
iou = inter / (area[0] + area[:] - inter)
# get needed merge det and delete these det
merge_index = torch.where(iou >= vote_thresh)[0]
vote_det = det[merge_index, :]
det = det[iou < vote_thresh]
if merge_index.shape[0] <= 1:
vote_results = torch.cat((vote_results, vote_det), dim=0)
else:
if vote_type == "soft_vote":
vote_det_iou = iou[merge_index]
det_accu_sum = self.get_soft_dets_sum(vote_det, vote_det_iou)
elif vote_type == "vote":
det_accu_sum = self.get_dets_sum(vote_det)
vote_results = torch.cat((vote_results, det_accu_sum), dim=0)
order = vote_results[:, 4].argsort(descending=True)
vote_results = vote_results[order, :]
return vote_results[:, :4], vote_results[:, 4], vote_results[:, 5]
@staticmethod
def get_dets_sum(vote_det):
vote_det[:, :4] *= vote_det[:, 4:5].repeat(1, 4)
max_score = vote_det[:, 4].max()
det_accu_sum = torch.zeros((1, 6), device=vote_det.device)
det_accu_sum[:, :4] = torch.sum(vote_det[:, :4], dim=0) / torch.sum(vote_det[:, 4])
det_accu_sum[:, 4] = max_score
det_accu_sum[:, 5] = vote_det[0, 5]
return det_accu_sum
@staticmethod
def get_soft_dets_sum(vote_det, vote_det_iou):
soft_vote_det = vote_det.detach().clone()
soft_vote_det[:, 4] *= (1 - vote_det_iou)
INFERENCE_TH = 0.05
soft_index = torch.where(soft_vote_det[:, 4] >= INFERENCE_TH)[0]
soft_vote_det = soft_vote_det[soft_index, :]
vote_det[:, :4] *= vote_det[:, 4:5].repeat(1, 4)
max_score = vote_det[:, 4].max()
det_accu_sum = torch.zeros((1, 6), device=vote_det.device)
det_accu_sum[:, :4] = torch.sum(vote_det[:, :4], dim=0) / torch.sum(vote_det[:, 4])
det_accu_sum[:, 4] = max_score
det_accu_sum[:, 5] = vote_det[0, 5]
if soft_vote_det.shape[0] > 0:
det_accu_sum = torch.cat((det_accu_sum, soft_vote_det), dim=0)
return det_accu_sum
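# Illustrative usage sketch (an assumption): the wrapper is applied like detectron2's
# GeneralizedRCNNWithTTA, with a config that carries the TEST.AUG keys read above:
#     tta_model = DiffusionDetWithTTA(cfg, model)
#     outputs = tta_model(batched_inputs)   # same interface as DiffusionDet.forward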
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Utilities for bounding box manipulation and GIoU.
"""
import torch
from torchvision.ops.boxes import box_area
def box_cxcywh_to_xyxy(x):
x_c, y_c, w, h = x.unbind(-1)
b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
(x_c + 0.5 * w), (y_c + 0.5 * h)]
return torch.stack(b, dim=-1)
def box_xyxy_to_cxcywh(x):
x0, y0, x1, y1 = x.unbind(-1)
b = [(x0 + x1) / 2, (y0 + y1) / 2,
(x1 - x0), (y1 - y0)]
return torch.stack(b, dim=-1)
# modified from torchvision to also return the union
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
wh = (rb - lt).clamp(min=0) # [N,M,2]
inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
def generalized_box_iou(boxes1, boxes2):
"""
Generalized IoU from https://giou.stanford.edu/
The boxes should be in [x0, y0, x1, y1] format
Returns a [N, M] pairwise matrix, where N = len(boxes1)
and M = len(boxes2)
"""
# degenerate boxes gives inf / nan results
# so do an early check
assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
iou, union = box_iou(boxes1, boxes2)
lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
wh = (rb - lt).clamp(min=0) # [N,M,2]
area = wh[:, :, 0] * wh[:, :, 1]
return iou - (area - union) / area
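# Worked example (illustrative): for b1 = [[0., 0., 2., 2.]] and b2 = [[1., 0., 3., 2.]],
# the intersection is 2 and the union is 6, so IoU = 1/3; the enclosing box also has
# area 6, so GIoU = 1/3 - (6 - 6) / 6 = 1/3.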
def masks_to_boxes(masks):
"""Compute the bounding boxes around the provided masks
The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
Returns a [N, 4] tensors, with the boxes in xyxy format
"""
if masks.numel() == 0:
return torch.zeros((0, 4), device=masks.device)
h, w = masks.shape[-2:]
y = torch.arange(0, h, dtype=torch.float)
x = torch.arange(0, w, dtype=torch.float)
y, x = torch.meshgrid(y, x)
x_mask = (masks * x.unsqueeze(0))
x_max = x_mask.flatten(1).max(-1)[0]
x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
y_mask = (masks * y.unsqueeze(0))
y_max = y_mask.flatten(1).max(-1)[0]
y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
return torch.stack([x_min, y_min, x_max, y_max], 1)
import numpy as np
def colormap(rgb=False):
color_list = np.array(
[
0.000, 0.447, 0.741,
0.850, 0.325, 0.098,
0.929, 0.694, 0.125,
0.494, 0.184, 0.556,
0.466, 0.674, 0.188,
0.301, 0.745, 0.933,
0.635, 0.078, 0.184,
0.300, 0.300, 0.300,
0.600, 0.600, 0.600,
1.000, 0.000, 0.000,
1.000, 0.500, 0.000,
0.749, 0.749, 0.000,
0.000, 1.000, 0.000,
0.000, 0.000, 1.000,
0.667, 0.000, 1.000,
0.333, 0.333, 0.000,
0.333, 0.667, 0.000,
0.333, 1.000, 0.000,
0.667, 0.333, 0.000,
0.667, 0.667, 0.000,
0.667, 1.000, 0.000,
1.000, 0.333, 0.000,
1.000, 0.667, 0.000,
1.000, 1.000, 0.000,
0.000, 0.333, 0.500,
0.000, 0.667, 0.500,
0.000, 1.000, 0.500,
0.333, 0.000, 0.500,
0.333, 0.333, 0.500,
0.333, 0.667, 0.500,
0.333, 1.000, 0.500,
0.667, 0.000, 0.500,
0.667, 0.333, 0.500,
0.667, 0.667, 0.500,
0.667, 1.000, 0.500,
1.000, 0.000, 0.500,
1.000, 0.333, 0.500,
1.000, 0.667, 0.500,
1.000, 1.000, 0.500,
0.000, 0.333, 1.000,
0.000, 0.667, 1.000,
0.000, 1.000, 1.000,
0.333, 0.000, 1.000,
0.333, 0.333, 1.000,
0.333, 0.667, 1.000,
0.333, 1.000, 1.000,
0.667, 0.000, 1.000,
0.667, 0.333, 1.000,
0.667, 0.667, 1.000,
0.667, 1.000, 1.000,
1.000, 0.000, 1.000,
1.000, 0.333, 1.000,
1.000, 0.667, 1.000,
0.167, 0.000, 0.000,
0.333, 0.000, 0.000,
0.500, 0.000, 0.000,
0.667, 0.000, 0.000,
0.833, 0.000, 0.000,
1.000, 0.000, 0.000,
0.000, 0.167, 0.000,
0.000, 0.333, 0.000,
0.000, 0.500, 0.000,
0.000, 0.667, 0.000,
0.000, 0.833, 0.000,
0.000, 1.000, 0.000,
0.000, 0.000, 0.167,
0.000, 0.000, 0.333,
0.000, 0.000, 0.500,
0.000, 0.000, 0.667,
0.000, 0.000, 0.833,
0.000, 0.000, 1.000,
0.000, 0.000, 0.000,
0.143, 0.143, 0.143,
0.286, 0.286, 0.286,
0.429, 0.429, 0.429,
0.571, 0.571, 0.571,
0.714, 0.714, 0.714,
0.857, 0.857, 0.857,
1.000, 1.000, 1.000
]
).astype(np.float32)
color_list = color_list.reshape((-1, 3)) * 255
if not rgb:
color_list = color_list[:, ::-1]
return color_list
def category():
category = [
"person",
"bicycle",
"car",
"motorbike",
"aeroplane",
"bus",
"train",
"truck",
"boat",
"traffic light",
"fire hydrant",
"stop sign",
"parking meter",
"bench",
"bird",
"cat",
"dog",
"horse",
"sheep",
"cow",
"elephant",
"bear",
"zebra",
"giraffe",
"backpack",
"umbrella",
"handbag",
"tie",
"suitcase",
"frisbee",
"skis",
"snowboard",
"sports ball",
"kite",
"baseball bat",
"baseball glove",
"skateboard",
"surfboard",
"tennis racket",
"bottle",
"wine glass",
"cup",
"fork",
"knife",
"spoon",
"bowl",
"banana",
"apple",
"sandwich",
"orange",
"broccoli",
"carrot",
"hot dog",
"pizza",
"donut",
"cake",
"chair",
"sofa",
"pottedplant",
"bed",
"diningtable",
"toilet",
"tvmonitor",
"laptop",
"mouse",
"remote",
"keyboard",
"cell phone",
"microwave",
"oven",
"toaster",
"sink",
"refrigerator",
"book",
"clock",
"vase",
"scissors",
"teddy bear",
"hair drier",
"toothbrush"]
return category
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Misc functions, including distributed helpers.
Mostly copy-paste from torchvision references.
"""
import os
import subprocess
import time
from collections import defaultdict, deque
import datetime
import pickle
from typing import Optional, List
import torch
import torch.distributed as dist
from torch import Tensor
# needed due to empty tensor bug in pytorch and torchvision 0.5
import torchvision
TORCHVISION_MAJOR = int(torchvision.__version__.split('.')[0])
TORCHVISION_MINOR = int(torchvision.__version__.split('.')[1])
if TORCHVISION_MAJOR == 0 and TORCHVISION_MINOR < 7:
from torchvision.ops import _new_empty_tensor
from torchvision.ops.misc import _output_size
class SmoothedValue(object):
"""Track a series of values and provide access to smoothed values over a
window or the global series average.
"""
def __init__(self, window_size=20, fmt=None):
if fmt is None:
fmt = "{median:.4f} ({global_avg:.4f})"
self.deque = deque(maxlen=window_size)
self.total = 0.0
self.count = 0
self.fmt = fmt
def update(self, value, n=1):
self.deque.append(value)
self.count += n
self.total += value * n
def synchronize_between_processes(self):
"""
Warning: does not synchronize the deque!
"""
if not is_dist_avail_and_initialized():
return
t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
dist.barrier()
dist.all_reduce(t)
t = t.tolist()
self.count = int(t[0])
self.total = t[1]
@property
def median(self):
d = torch.tensor(list(self.deque))
return d.median().item()
@property
def avg(self):
d = torch.tensor(list(self.deque), dtype=torch.float32)
return d.mean().item()
@property
def global_avg(self):
return self.total / self.count
@property
def max(self):
return max(self.deque)
@property
def value(self):
return self.deque[-1]
def __str__(self):
return self.fmt.format(
median=self.median,
avg=self.avg,
global_avg=self.global_avg,
max=self.max,
value=self.value)
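# Illustrative usage (an assumption): track a scalar with a 20-step window and print the
# windowed median alongside the global average via the default format string.
#     meter = SmoothedValue(window_size=20)
#     meter.update(0.5)
#     meter.update(0.3)
#     print(meter)   # formatted as "{median:.4f} ({global_avg:.4f})"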
def all_gather(data):
"""
Run all_gather on arbitrary picklable data (not necessarily tensors)
Args:
data: any picklable object
Returns:
list[data]: list of data gathered from each rank
"""
world_size = get_world_size()
if world_size == 1:
return [data]
# serialized to a Tensor
buffer = pickle.dumps(data)
storage = torch.ByteStorage.from_buffer(buffer)
tensor = torch.ByteTensor(storage).to("cuda")
# obtain Tensor size of each rank
local_size = torch.tensor([tensor.numel()], device="cuda")
size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
dist.all_gather(size_list, local_size)
size_list = [int(size.item()) for size in size_list]
max_size = max(size_list)
# receiving Tensor from all ranks
# we pad the tensor because torch all_gather does not support
# gathering tensors of different shapes
tensor_list = []
for _ in size_list:
tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
if local_size != max_size:
padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
tensor = torch.cat((tensor, padding), dim=0)
dist.all_gather(tensor_list, tensor)
data_list = []
for size, tensor in zip(size_list, tensor_list):
buffer = tensor.cpu().numpy().tobytes()[:size]
data_list.append(pickle.loads(buffer))
return data_list
def reduce_dict(input_dict, average=True):
"""
Args:
input_dict (dict): all the values will be reduced
average (bool): whether to do average or sum
Reduce the values in the dictionary from all processes so that all processes
have the averaged results. Returns a dict with the same fields as
input_dict, after reduction.
"""
world_size = get_world_size()
if world_size < 2:
return input_dict
with torch.no_grad():
names = []
values = []
# sort the keys so that they are consistent across processes
for k in sorted(input_dict.keys()):
names.append(k)
values.append(input_dict[k])
values = torch.stack(values, dim=0)
dist.all_reduce(values)
if average:
values /= world_size
reduced_dict = {k: v for k, v in zip(names, values)}
return reduced_dict
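# Illustrative usage (an assumption): average per-rank loss dictionaries so every process
# logs the same values; `loss_cls` and `loss_bbox` are placeholder tensors on the current rank.
#     loss_dict = {"loss_cls": loss_cls, "loss_bbox": loss_bbox}
#     loss_dict_reduced = reduce_dict(loss_dict)   # averaged over world_size when average=True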
class MetricLogger(object):
def __init__(self, delimiter="\t"):
self.meters = defaultdict(SmoothedValue)
self.delimiter = delimiter
def update(self, **kwargs):
for k, v in kwargs.items():
if isinstance(v, torch.Tensor):
v = v.item()
assert isinstance(v, (float, int))
self.meters[k].update(v)
def __getattr__(self, attr):
if attr in self.meters:
return self.meters[attr]
if attr in self.__dict__:
return self.__dict__[attr]
raise AttributeError("'{}' object has no attribute '{}'".format(
type(self).__name__, attr))
def __str__(self):
loss_str = []
for name, meter in self.meters.items():
loss_str.append(
"{}: {}".format(name, str(meter))
)
return self.delimiter.join(loss_str)
def synchronize_between_processes(self):
for meter in self.meters.values():
meter.synchronize_between_processes()
def add_meter(self, name, meter):
self.meters[name] = meter
def log_every(self, iterable, print_freq, header=None):
i = 0
if not header:
header = ''
start_time = time.time()
end = time.time()
iter_time = SmoothedValue(fmt='{avg:.4f}')
data_time = SmoothedValue(fmt='{avg:.4f}')
space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
if torch.cuda.is_available():
log_msg = self.delimiter.join([
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}',
'max mem: {memory:.0f}'
])
else:
log_msg = self.delimiter.join([
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}'
])
MB = 1024.0 * 1024.0
for obj in iterable:
data_time.update(time.time() - end)
yield obj
iter_time.update(time.time() - end)
if i % print_freq == 0 or i == len(iterable) - 1:
eta_seconds = iter_time.global_avg * (len(iterable) - i)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if torch.cuda.is_available():
print(log_msg.format(
i, len(iterable), eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB))
else:
print(log_msg.format(
i, len(iterable), eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time)))
i += 1
end = time.time()
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('{} Total time: {} ({:.4f} s / it)'.format(
header, total_time_str, total_time / len(iterable)))
def get_sha():
cwd = os.path.dirname(os.path.abspath(__file__))
def _run(command):
return subprocess.check_output(command, cwd=cwd).decode('ascii').strip()
sha = 'N/A'
diff = "clean"
branch = 'N/A'
try:
sha = _run(['git', 'rev-parse', 'HEAD'])
subprocess.check_output(['git', 'diff'], cwd=cwd)
diff = _run(['git', 'diff-index', 'HEAD'])
diff = "has uncommited changes" if diff else "clean"
branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])
except Exception:
pass
message = f"sha: {sha}, status: {diff}, branch: {branch}"
return message
def collate_fn(batch):
batch = list(zip(*batch))
batch[0] = nested_tensor_from_tensor_list(batch[0])
return tuple(batch)
def _max_by_axis(the_list):
# type: (List[List[int]]) -> List[int]
maxes = the_list[0]
for sublist in the_list[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
class NestedTensor(object):
def __init__(self, tensors, mask: Optional[Tensor]):
self.tensors = tensors
self.mask = mask
def to(self, device):
# type: (Device) -> NestedTensor # noqa
cast_tensor = self.tensors.to(device)
mask = self.mask
if mask is not None:
assert mask is not None
cast_mask = mask.to(device)
else:
cast_mask = None
return NestedTensor(cast_tensor, cast_mask)
def decompose(self):
return self.tensors, self.mask
def __repr__(self):
return str(self.tensors)
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
# TODO make this more general
if tensor_list[0].ndim == 3:
if torchvision._is_tracing():
# nested_tensor_from_tensor_list() does not export well to ONNX
# call _onnx_nested_tensor_from_tensor_list() instead
return _onnx_nested_tensor_from_tensor_list(tensor_list)
# TODO make it support different-sized images
max_size = _max_by_axis([list(img.shape) for img in tensor_list])
# min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
batch_shape = [len(tensor_list)] + max_size
b, c, h, w = batch_shape
dtype = tensor_list[0].dtype
device = tensor_list[0].device
tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
for img, pad_img, m in zip(tensor_list, tensor, mask):
pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
m[: img.shape[1], :img.shape[2]] = False
else:
raise ValueError('not supported')
return NestedTensor(tensor, mask)
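# Illustrative example (an assumption): CHW images of different sizes are zero-padded to a
# common (C, H_max, W_max) batch, with mask entries True on padded pixels:
#     imgs = [torch.rand(3, 200, 300), torch.rand(3, 220, 250)]
#     nt = nested_tensor_from_tensor_list(imgs)
#     nt.tensors.shape, nt.mask.shape   # torch.Size([2, 3, 220, 300]), torch.Size([2, 220, 300])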
# _onnx_nested_tensor_from_tensor_list() is an implementation of
# nested_tensor_from_tensor_list() that is supported by ONNX tracing.
@torch.jit.unused
def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
max_size = []
for i in range(tensor_list[0].dim()):
max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64)
max_size.append(max_size_i)
max_size = tuple(max_size)
# work around for
# pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
# m[: img.shape[1], :img.shape[2]] = False
# which is not yet supported in onnx
padded_imgs = []
padded_masks = []
for img in tensor_list:
padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
padded_imgs.append(padded_img)
m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
padded_masks.append(padded_mask.to(torch.bool))
tensor = torch.stack(padded_imgs)
mask = torch.stack(padded_masks)
return NestedTensor(tensor, mask=mask)
def setup_for_distributed(is_master):
"""
This function disables printing when not in master process
"""
import builtins as __builtin__
builtin_print = __builtin__.print
def print(*args, **kwargs):
force = kwargs.pop('force', False)
if is_master or force:
builtin_print(*args, **kwargs)
__builtin__.print = print
def is_dist_avail_and_initialized():
if not dist.is_available():
return False
if not dist.is_initialized():
return False
return True
def get_world_size():
if not is_dist_avail_and_initialized():
return 1
return dist.get_world_size()
def get_rank():
if not is_dist_avail_and_initialized():
return 0
return dist.get_rank()
def is_main_process():
return get_rank() == 0
def save_on_master(*args, **kwargs):
if is_main_process():
torch.save(*args, **kwargs)
def init_distributed_mode(args):
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ['WORLD_SIZE'])
args.gpu = int(os.environ['LOCAL_RANK'])
elif 'SLURM_PROCID' in os.environ:
args.rank = int(os.environ['SLURM_PROCID'])
args.gpu = args.rank % torch.cuda.device_count()
else:
print('Not using distributed mode')
args.distributed = False
return
args.distributed = True
torch.cuda.set_device(args.gpu)
args.dist_backend = 'nccl'
print('| distributed init (rank {}): {}'.format(
args.rank, args.dist_url), flush=True)
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
torch.distributed.barrier()
setup_for_distributed(args.rank == 0)
@torch.no_grad()
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
if target.numel() == 0:
return [torch.zeros([], device=output.device)]
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0)
res.append(correct_k.mul_(100.0 / batch_size))
return res
def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
# type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
"""
Equivalent to nn.functional.interpolate, but with support for empty batch sizes.
This will eventually be supported natively by PyTorch, and this
function can go away.
"""
if TORCHVISION_MAJOR == 0 and TORCHVISION_MINOR < 7:
if input.numel() > 0:
return torch.nn.functional.interpolate(
input, size, scale_factor, mode, align_corners
)
output_shape = _output_size(2, input, size, scale_factor)
output_shape = list(input.shape[:-2]) + list(output_shape)
return _new_empty_tensor(input, output_shape)
else:
return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners)
"""
Plotting utilities to visualize training logs.
"""
import torch
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path, PurePath
def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'):
'''
Function to plot specific fields from training log(s). Plots both training and test results.
:: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file
- fields = which results to plot from each log file - plots both training and test for each field.
- ewm_col = optional, which column to use as the exponential weighted smoothing of the plots
- log_name = optional, name of log file if different than default 'log.txt'.
:: Outputs - matplotlib plots of results in fields, color coded for each log file.
- solid lines are training results, dashed lines are test results.
'''
func_name = "plot_utils.py::plot_logs"
# verify logs is a list of Paths (list[Paths]) or single Pathlib object Path,
# convert single Path to list to avoid 'not iterable' error
if not isinstance(logs, list):
if isinstance(logs, PurePath):
logs = [logs]
print(f"{func_name} info: logs param expects a list argument, converted to list[Path].")
else:
raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \
Expect list[Path] or single Path obj, received {type(logs)}")
# Quality checks - verify valid dir(s), that every item in list is Path object, and that log_name exists in each dir
for i, dir in enumerate(logs):
if not isinstance(dir, PurePath):
raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}")
if not dir.exists():
raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}")
# verify log_name exists
fn = Path(dir / log_name)
if not fn.exists():
print(f"-> missing {log_name}. Have you gotten to Epoch 1 in training?")
print(f"--> full path of missing log file: {fn}")
return
# load log file(s) and plot
dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs]
fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5))
for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))):
for j, field in enumerate(fields):
if field == 'mAP':
coco_eval = pd.DataFrame(
np.stack(df.test_coco_eval_bbox.dropna().values)[:, 1]
).ewm(com=ewm_col).mean()
axs[j].plot(coco_eval, c=color)
else:
df.interpolate().ewm(com=ewm_col).mean().plot(
y=[f'train_{field}', f'test_{field}'],
ax=axs[j],
color=[color] * 2,
style=['-', '--']
)
for ax, field in zip(axs, fields):
ax.legend([Path(p).name for p in logs])
ax.set_title(field)
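# Illustrative usage (an assumption): compare two runs whose output directories each contain
# a DETR-style 'log.txt' with one JSON record per line; the paths are placeholders.
#     plot_logs([Path('output/run_a'), Path('output/run_b')], fields=('loss_bbox_unscaled', 'mAP'))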
def plot_precision_recall(files, naming_scheme='iter'):
if naming_scheme == 'exp_id':
# name becomes exp_id
names = [f.parts[-3] for f in files]
elif naming_scheme == 'iter':
names = [f.stem for f in files]
else:
raise ValueError(f'not supported {naming_scheme}')
fig, axs = plt.subplots(ncols=2, figsize=(16, 5))
for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names):
data = torch.load(f)
# precision is n_iou, n_points, n_cat, n_area, max_det
precision = data['precision']
recall = data['params'].recThrs
scores = data['scores']
# take precision for all classes, all areas and 100 detections
precision = precision[0, :, :, 0, -1].mean(1)
scores = scores[0, :, :, 0, -1].mean(1)
prec = precision.mean()
rec = data['recall'][0, :, 0, -1].mean()
print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' +
f'score={scores.mean():0.3f}, ' +
f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}'
)
axs[0].plot(recall, precision, c=color)
axs[1].plot(recall, scores, c=color)
axs[0].set_title('Precision / Recall')
axs[0].legend(names)
axs[1].set_title('Scores / Recall')
axs[1].legend(names)
return fig, axs
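# Illustrative usage (an assumption): `files` are Path objects pointing to torch.save'd
# COCO-evaluation dumps holding 'precision', 'scores', 'recall' and the eval 'params',
# as read above; the path is a placeholder.
#     fig, axs = plot_precision_recall([Path('eval/latest.pth')], naming_scheme='iter')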
icon.png (77.3 KB)
# Unique model identifier
modelCode=901
# Model name
modelName=diffusiondet_pytorch
# Model description
modelDescription=Diffusion Model for Object Detection
# Application scenarios
appScenario=inference,training,research,manufacturing,healthcare,home,education
# Framework type
frameType=pytorch