Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dcnv3
Commits
bdd98bcb
Unverified
Commit
bdd98bcb
authored
Apr 14, 2023
by
Zhe Chen
Committed by
GitHub
Apr 14, 2023
Browse files
Release DINO model with InternImage-T and -L (#99)
parent
1e6e309d
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
2742 additions
and
1 deletion
+2742
-1
detection/configs/coco/README.md
detection/configs/coco/README.md
+9
-0
detection/configs/coco/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py
...oco/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py
+178
-0
detection/configs/coco/dino_4scale_internimage_l_1x_coco_layer_wise_lr.py
...s/coco/dino_4scale_internimage_l_1x_coco_layer_wise_lr.py
+177
-0
detection/configs/coco/dino_4scale_internimage_t_1x_coco_layer_wise_lr.py
...s/coco/dino_4scale_internimage_t_1x_coco_layer_wise_lr.py
+177
-0
detection/mmdet_custom/models/__init__.py
detection/mmdet_custom/models/__init__.py
+4
-1
detection/mmdet_custom/models/dense_heads/__init__.py
detection/mmdet_custom/models/dense_heads/__init__.py
+11
-0
detection/mmdet_custom/models/dense_heads/deformable_detr_head.py
...n/mmdet_custom/models/dense_heads/deformable_detr_head.py
+332
-0
detection/mmdet_custom/models/dense_heads/detr_head.py
detection/mmdet_custom/models/dense_heads/detr_head.py
+954
-0
detection/mmdet_custom/models/dense_heads/dino_head.py
detection/mmdet_custom/models/dense_heads/dino_head.py
+364
-0
detection/mmdet_custom/models/detectors/__init__.py
detection/mmdet_custom/models/detectors/__init__.py
+9
-0
detection/mmdet_custom/models/detectors/dino.py
detection/mmdet_custom/models/detectors/dino.py
+10
-0
detection/mmdet_custom/models/utils/__init__.py
detection/mmdet_custom/models/utils/__init__.py
+5
-0
detection/mmdet_custom/models/utils/query_denoising.py
detection/mmdet_custom/models/utils/query_denoising.py
+234
-0
detection/mmdet_custom/models/utils/transformer.py
detection/mmdet_custom/models/utils/transformer.py
+278
-0
No files found.
detection/configs/coco/README.md
View file @
bdd98bcb
...
...
@@ -41,3 +41,12 @@ Based on community feedback, in 2017 the training/validation split was changed f
-
Training speed is measured with A100 GPUs using current code and may be faster than the speed in logs.
-
Some logs are our recent newly trained ones. There might be slight differences between the results in logs and our paper.
-
Please set
`with_cp=True`
to save memory if you meet
`out-of-memory`
issues.
### DINO + InternImage

|   backbone    |     lr type      |   pretrain   | schd | box mAP | train time | #param | Config | Download |
| :-----------: | :--------------: | :----------: | :--: | :-----: | :--------: | :----: | :----: | :------: |
| InternImage-T |  layer-wise lr   | ImageNet-1K  |  1x  |  53.9   |    9.5h    |  49M   | [config](./dino_4scale_internimage_t_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.json) |
| InternImage-L |  layer-wise lr   | ImageNet-22K |  1x  |  57.5   |    18h     |  241M  | [config](./dino_4scale_internimage_l_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_layer_wise_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_layer_wise_lr.log.json) |
| InternImage-L | 0.1x backbone lr | ImageNet-22K |  1x  |  57.6   |    18h     |  241M  | [config](./dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.log.json) |
detection/configs/coco/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py
0 → 100644
View file @
bdd98bcb
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# DINO + InternImage-L on COCO, 1x schedule, backbone trained at 0.1x lr.
_base_ = [
    '../_base_/datasets/coco_detection.py',
    '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_1x.py',
]
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_l_22k_192to384.pth'
model = dict(
    type='DINO',
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',
        channels=160,
        depths=[5, 5, 22, 5],
        groups=[10, 20, 40, 80],
        mlp_ratio=4.,
        drop_path_rate=0.4,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=2.0,
        post_norm=True,
        with_cp=False,
        out_indices=(1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(
        type='ChannelMapper',
        in_channels=[320, 640, 1280],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=4),
    bbox_head=dict(
        type='DINOHead',
        num_query=900,
        num_classes=80,
        in_channels=2048,
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),
            group_cfg=dict(dynamic=True, num_groups=None,
                           num_dn_queries=100)),
        transformer=dict(
            type='DinoTransformer',
            two_stage_num_proposals=900,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        dropout=0.0),
                    feedforward_channels=2048,
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),
                        dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=256,
                            dropout=0.0),
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.0,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
    test_cfg=dict(max_per_img=300))
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True)
            ],
            [
                dict(
                    type='Resize',
                    img_scale=[(400, 4200), (500, 4200), (600, 4200)],
                    multiscale_mode='value',
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=False),
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    override=True,
                    keep_ratio=True)
            ]
        ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2, train=dict(pipeline=train_pipeline))
# optimizer: backbone parameters are trained with 0.1x the base lr.
optimizer = dict(
    _delete_=True,
    type='AdamW',
    lr=0.0001,
    weight_decay=0.05,
    paramwise_cfg=dict(custom_keys={
        'backbone': dict(lr_mult=0.1),
    }))
optimizer_config = dict(
    _delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[11])
evaluation = dict(save_best='auto')
checkpoint_config = dict(
    interval=1,
    max_keep_ckpts=3,
    save_last=True,
)
detection/configs/coco/dino_4scale_internimage_l_1x_coco_layer_wise_lr.py
0 → 100644
View file @
bdd98bcb
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# DINO + InternImage-L on COCO, 1x schedule, layer-wise lr decay.
_base_ = [
    '../_base_/datasets/coco_detection.py',
    '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_1x.py',
]
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_l_22k_192to384.pth'
model = dict(
    type='DINO',
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',
        channels=160,
        depths=[5, 5, 22, 5],
        groups=[10, 20, 40, 80],
        mlp_ratio=4.,
        drop_path_rate=0.4,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=2.0,
        post_norm=True,
        with_cp=False,
        out_indices=(1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(
        type='ChannelMapper',
        in_channels=[320, 640, 1280],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=4),
    bbox_head=dict(
        type='DINOHead',
        num_query=900,
        num_classes=80,
        in_channels=2048,
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),
            group_cfg=dict(dynamic=True, num_groups=None,
                           num_dn_queries=100)),
        transformer=dict(
            type='DinoTransformer',
            two_stage_num_proposals=900,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        dropout=0.0),
                    feedforward_channels=2048,
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),
                        dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=256,
                            dropout=0.0),
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.0,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
    test_cfg=dict(max_per_img=300))
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True)
            ],
            [
                dict(
                    type='Resize',
                    img_scale=[(400, 4200), (500, 4200), (600, 4200)],
                    multiscale_mode='value',
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=False),
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    override=True,
                    keep_ratio=True)
            ]
        ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2, train=dict(pipeline=train_pipeline))
# optimizer: per-layer lr decay over the 37 backbone layers (0.90 per layer).
optimizer = dict(
    _delete_=True,
    type='AdamW',
    lr=0.0001,
    weight_decay=0.0001,
    constructor='CustomLayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=37, layer_decay_rate=0.90, depths=[5, 5, 22, 5]))
optimizer_config = dict(
    _delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[11])
evaluation = dict(save_best='auto')
checkpoint_config = dict(
    interval=1,
    max_keep_ckpts=3,
    save_last=True,
)
detection/configs/coco/dino_4scale_internimage_t_1x_coco_layer_wise_lr.py
0 → 100644
View file @
bdd98bcb
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# DINO + InternImage-T on COCO, 1x schedule, layer-wise lr decay.
_base_ = [
    '../_base_/datasets/coco_detection.py',
    '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_1x.py',
]
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_t_1k_224.pth'
model = dict(
    type='DINO',
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',
        channels=64,
        depths=[4, 4, 18, 4],
        groups=[4, 8, 16, 32],
        mlp_ratio=4.,
        drop_path_rate=0.2,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=1.0,
        post_norm=False,
        with_cp=True,
        out_indices=(1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(
        type='ChannelMapper',
        in_channels=[128, 256, 512],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=4),
    bbox_head=dict(
        type='DINOHead',
        num_query=900,
        num_classes=80,
        in_channels=2048,
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),
            group_cfg=dict(dynamic=True, num_groups=None,
                           num_dn_queries=100)),
        transformer=dict(
            type='DinoTransformer',
            two_stage_num_proposals=900,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        dropout=0.0),
                    feedforward_channels=2048,
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),
                        dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=256,
                            dropout=0.0),
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.0,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
    test_cfg=dict(max_per_img=100))
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True)
            ],
            [
                dict(
                    type='Resize',
                    img_scale=[(400, 4200), (500, 4200), (600, 4200)],
                    multiscale_mode='value',
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=False),
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    override=True,
                    keep_ratio=True)
            ]
        ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2, train=dict(pipeline=train_pipeline))
# optimizer: per-layer lr decay over the 30 backbone layers (0.9 per layer).
optimizer = dict(
    _delete_=True,
    type='AdamW',
    lr=0.0001,
    weight_decay=0.0001,
    constructor='CustomLayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=30, layer_decay_rate=0.9, depths=[4, 4, 18, 4]))
optimizer_config = dict(
    _delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[11])
evaluation = dict(save_best='auto')
checkpoint_config = dict(
    interval=1,
    max_keep_ckpts=3,
    save_last=True,
)
detection/mmdet_custom/models/__init__.py
View file @
bdd98bcb
...
...
@@ -4,4 +4,7 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Re-export all custom model components so mmdet's registry picks them up
# when `mmdet_custom.models` is imported.
from .backbones import *  # noqa: F401,F403
from .dense_heads import *  # noqa: F401,F403
from .detectors import *  # noqa: F401,F403
from .utils import *  # noqa: F401,F403
detection/mmdet_custom/models/dense_heads/__init__.py
0 → 100644
View file @
bdd98bcb
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .deformable_detr_head import DeformableDETRHead
from .detr_head import DETRHead
from .dino_head import DINOHead

__all__ = ['DeformableDETRHead', 'DETRHead', 'DINOHead']
detection/mmdet_custom/models/dense_heads/deformable_detr_head.py
0 → 100644
View file @
bdd98bcb
# Copyright (c) OpenMMLab. All rights reserved.
import
copy
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmcv.cnn
import
Linear
,
bias_init_with_prob
,
constant_init
from
mmcv.runner
import
force_fp32
from
mmdet.core
import
multi_apply
from
mmdet.models.utils.transformer
import
inverse_sigmoid
from
mmdet.models.builder
import
HEADS
from
.detr_head
import
DETRHead
@HEADS.register_module(force=True)
class DeformableDETRHead(DETRHead):
    """Head of DeformDETR: Deformable DETR: Deformable Transformers for End-to-
    End Object Detection.

    Code is modified from the `official github repo
    <https://github.com/fundamentalvision/Deformable-DETR>`_.

    More details can be found in the `paper
    <https://arxiv.org/abs/2010.04159>`_ .

    Args:
        with_box_refine (bool): Whether to refine the reference points
            in the decoder. Defaults to False.
        as_two_stage (bool) : Whether to generate the proposal from
            the outputs of encoder.
        transformer (obj:`ConfigDict`): ConfigDict is used for building
            the Encoder and Decoder.
    """

    def __init__(self,
                 *args,
                 with_box_refine=False,
                 as_two_stage=False,
                 transformer=None,
                 use_2fc_cls_branch=False,
                 **kwargs):
        self.with_box_refine = with_box_refine
        self.as_two_stage = as_two_stage
        # When True, the classification branch built in `_init_layers` is a
        # Linear -> LayerNorm -> GELU -> Linear stack instead of one Linear.
        self.use_2fc_cls_branch = use_2fc_cls_branch
        if self.as_two_stage:
            # Propagate the flag into the transformer config before the base
            # class builds the transformer from it.
            transformer['as_two_stage'] = self.as_two_stage

        super(DeformableDETRHead, self).__init__(
            *args, transformer=transformer, **kwargs)

    def _init_layers(self):
        """Initialize classification branch and regression branch of head."""
        if not self.use_2fc_cls_branch:
            fc_cls = Linear(self.embed_dims, self.cls_out_channels)
        else:
            # Wider two-layer classification head (hidden size 1.5x embed_dims).
            fc_cls = nn.Sequential(*[
                Linear(self.embed_dims, int(self.embed_dims * 1.5)),
                nn.LayerNorm(int(self.embed_dims * 1.5)),
                nn.GELU(),
                Linear(int(self.embed_dims * 1.5), self.cls_out_channels),
            ])
            # Mimic the `out_features` attribute of a plain Linear so callers
            # that inspect it keep working.
            fc_cls.out_features = self.cls_out_channels

        reg_branch = []
        for _ in range(self.num_reg_fcs):
            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
            reg_branch.append(nn.ReLU())
        reg_branch.append(Linear(self.embed_dims, 4))
        reg_branch = nn.Sequential(*reg_branch)

        def _get_clones(module, N):
            # Independent deep copies so each decoder layer trains its own
            # branch when box refinement is enabled.
            return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

        # last reg_branch is used to generate proposal from
        # encode feature map when as_two_stage is True.
        num_pred = (self.transformer.decoder.num_layers + 1) if \
            self.as_two_stage else self.transformer.decoder.num_layers

        if self.with_box_refine:
            self.cls_branches = _get_clones(fc_cls, num_pred)
            self.reg_branches = _get_clones(reg_branch, num_pred)
        else:
            # Without refinement, every decoder layer shares the same modules.
            self.cls_branches = nn.ModuleList(
                [fc_cls for _ in range(num_pred)])
            self.reg_branches = nn.ModuleList(
                [reg_branch for _ in range(num_pred)])

        if not self.as_two_stage:
            # 2x embed_dims: query embedding and query positional part packed
            # in one table, split later by the transformer.
            self.query_embedding = nn.Embedding(self.num_query,
                                                self.embed_dims * 2)

    def init_weights(self):
        """Initialize weights of the DeformDETR head."""
        self.transformer.init_weights()
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            if not self.use_2fc_cls_branch:
                # Focal-loss style prior-probability bias init; skipped for
                # the Sequential variant (no single `.bias` to set).
                for m in self.cls_branches:
                    nn.init.constant_(m.bias, bias_init)
        for m in self.reg_branches:
            constant_init(m[-1], 0, bias=0)
        nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0)
        if self.as_two_stage:
            for m in self.reg_branches:
                nn.init.constant_(m[-1].bias.data[2:], 0.0)

    def forward(self, mlvl_feats, img_metas):
        """Forward function.

        Args:
            mlvl_feats (tuple[Tensor]): Features from the upstream
                network, each is a 4D-tensor with shape
                (N, C, H, W).
            img_metas (list[dict]): List of image information.

        Returns:
            all_cls_scores (Tensor): Outputs from the classification head, \
                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
                cls_out_channels should includes background.
            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
                head with normalized coordinate format (cx, cy, w, h). \
                Shape [nb_dec, bs, num_query, 4].
            enc_outputs_class (Tensor): The score of each point on encode \
                feature map, has shape (N, h*w, num_class). Only when \
                as_two_stage is True it would be returned, otherwise \
                `None` would be returned.
            enc_outputs_coord (Tensor): The proposal generate from the \
                encode feature map, has shape (N, h*w, 4). Only when \
                as_two_stage is True it would be returned, otherwise \
                `None` would be returned.
        """
        batch_size = mlvl_feats[0].size(0)
        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
        # Padding mask: 1 on padded pixels, 0 on valid image area.
        img_masks = mlvl_feats[0].new_ones(
            (batch_size, input_img_h, input_img_w))
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            img_masks[img_id, :img_h, :img_w] = 0

        mlvl_masks = []
        mlvl_positional_encodings = []
        for feat in mlvl_feats:
            # Downsample the pixel mask to each feature level's resolution.
            mlvl_masks.append(
                F.interpolate(img_masks[None],
                              size=feat.shape[-2:]).to(torch.bool).squeeze(0))
            mlvl_positional_encodings.append(
                self.positional_encoding(mlvl_masks[-1]))

        query_embeds = None
        if not self.as_two_stage:
            query_embeds = self.query_embedding.weight
        hs, init_reference, inter_references, \
            enc_outputs_class, enc_outputs_coord = self.transformer(
                mlvl_feats,
                mlvl_masks,
                query_embeds,
                mlvl_positional_encodings,
                reg_branches=self.reg_branches if self.with_box_refine else None,  # noqa:E501
                cls_branches=self.cls_branches if self.as_two_stage else None  # noqa:E501
            )
        hs = hs.permute(0, 2, 1, 3)
        outputs_classes = []
        outputs_coords = []

        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            # Predictions are offsets in logit space relative to the
            # (un-sigmoided) reference points.
            reference = inverse_sigmoid(reference)
            outputs_class = self.cls_branches[lvl](hs[lvl])
            tmp = self.reg_branches[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)

        outputs_classes = torch.stack(outputs_classes)
        outputs_coords = torch.stack(outputs_coords)
        if self.as_two_stage:
            return outputs_classes, outputs_coords, \
                enc_outputs_class, \
                enc_outputs_coord.sigmoid()
        else:
            return outputs_classes, outputs_coords, \
                None, None

    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def loss(self,
             all_cls_scores,
             all_bbox_preds,
             enc_cls_scores,
             enc_bbox_preds,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             gt_bboxes_ignore=None):
        """"Loss function.

        Args:
            all_cls_scores (Tensor): Classification score of all
                decoder layers, has shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds (Tensor): Sigmoid regression
                outputs of all decode layers. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of
                points on encode feature map , has shape
                (N, h*w, num_classes). Only be passed when as_two_stage is
                True, otherwise is None.
            enc_bbox_preds (Tensor): Regression results of each points
                on the encode feature map, has shape (N, h*w, 4). Only be
                passed when as_two_stage is True, otherwise is None.
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
                which can be ignored for each image. Default None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'for gt_bboxes_ignore setting to None.'

        # Replicate targets per decoder layer so each layer is supervised.
        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]

        losses_cls, losses_bbox, losses_iou = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        loss_dict = dict()
        # loss of proposal generated from encode feature map.
        if enc_cls_scores is not None:
            # Encoder proposals are class-agnostic: all gt labels collapse
            # to class 0.
            binary_labels_list = [
                torch.zeros_like(gt_labels_list[i])
                for i in range(len(img_metas))
            ]
            enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
                self.loss_single(enc_cls_scores, enc_bbox_preds,
                                 gt_bboxes_list, binary_labels_list,
                                 img_metas, gt_bboxes_ignore)
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_bbox'] = enc_losses_bbox
            loss_dict['enc_loss_iou'] = enc_losses_iou

        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]
        # loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
                                                       losses_bbox[:-1],
                                                       losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1
        return loss_dict

    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def get_bboxes(self,
                   all_cls_scores,
                   all_bbox_preds,
                   enc_cls_scores,
                   enc_bbox_preds,
                   img_metas,
                   rescale=False):
        """Transform network outputs for a batch into bbox predictions.

        Args:
            all_cls_scores (Tensor): Classification score of all
                decoder layers, has shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds (Tensor): Sigmoid regression
                outputs of all decode layers. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of
                points on encode feature map , has shape
                (N, h*w, num_classes). Only be passed when as_two_stage is
                True, otherwise is None.
            enc_bbox_preds (Tensor): Regression results of each points
                on the encode feature map, has shape (N, h*w, 4). Only be
                passed when as_two_stage is True, otherwise is None.
            img_metas (list[dict]): Meta information of each image.
            rescale (bool, optional): If True, return boxes in original
                image space. Default False.

        Returns:
            list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
                The first item is an (n, 5) tensor, where the first 4 columns \
                are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
                5-th column is a score between 0 and 1. The second item is a \
                (n,) tensor where each item is the predicted class label of \
                the corresponding box.
        """
        # Only the last decoder layer's predictions are used at test time.
        cls_scores = all_cls_scores[-1]
        bbox_preds = all_bbox_preds[-1]

        result_list = []
        for img_id in range(len(img_metas)):
            cls_score = cls_scores[img_id]
            bbox_pred = bbox_preds[img_id]
            img_shape = img_metas[img_id]['img_shape']
            scale_factor = img_metas[img_id]['scale_factor']
            proposals = self._get_bboxes_single(cls_score, bbox_pred,
                                                img_shape, scale_factor,
                                                rescale)
            result_list.append(proposals)
        return result_list
detection/mmdet_custom/models/dense_heads/detr_head.py
0 → 100644
View file @
bdd98bcb
# Copyright (c) OpenMMLab. All rights reserved.
import copy

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Conv2d, Linear, build_activation_layer
from mmcv.cnn.bricks.transformer import FFN, build_positional_encoding
from mmcv.runner import force_fp32
from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh,
                        build_assigner, build_sampler, multi_apply,
                        reduce_mean)
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.dense_heads.anchor_free_head import AnchorFreeHead
from mmdet.models.utils import build_transformer
@
HEADS
.
register_module
(
force
=
True
)
class
DETRHead
(
AnchorFreeHead
):
"""Implements the DETR transformer head.
See `paper: End-to-End Object Detection with Transformers
<https://arxiv.org/pdf/2005.12872>`_ for details.
Args:
num_classes (int): Number of categories excluding the background.
in_channels (int): Number of channels in the input feature map.
num_query (int): Number of query in Transformer.
num_reg_fcs (int, optional): Number of fully-connected layers used in
`FFN`, which is then used for the regression head. Default 2.
transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer.
Default: None.
sync_cls_avg_factor (bool): Whether to sync the avg_factor of
all ranks. Default to False.
positional_encoding (obj:`mmcv.ConfigDict`|dict):
Config for position encoding.
loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the
classification loss. Default `CrossEntropyLoss`.
loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the
regression loss. Default `L1Loss`.
loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the
regression iou loss. Default `GIoULoss`.
tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of
transformer head.
test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of
transformer head.
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None
"""
    # state_dict layout version of this head; `_load_from_state_dict`
    # migrates checkpoints saved with version < 2 (legacy attention/FFN
    # key names) to the current layout.
    _version = 2
def
__init__
(
self
,
num_classes
,
in_channels
,
num_query
=
100
,
num_reg_fcs
=
2
,
transformer
=
None
,
sync_cls_avg_factor
=
False
,
positional_encoding
=
dict
(
type
=
'SinePositionalEncoding'
,
num_feats
=
128
,
normalize
=
True
),
loss_cls
=
dict
(
type
=
'CrossEntropyLoss'
,
bg_cls_weight
=
0.1
,
use_sigmoid
=
False
,
loss_weight
=
1.0
,
class_weight
=
1.0
),
loss_bbox
=
dict
(
type
=
'L1Loss'
,
loss_weight
=
5.0
),
loss_iou
=
dict
(
type
=
'GIoULoss'
,
loss_weight
=
2.0
),
train_cfg
=
dict
(
assigner
=
dict
(
type
=
'HungarianAssigner'
,
cls_cost
=
dict
(
type
=
'ClassificationCost'
,
weight
=
1.
),
reg_cost
=
dict
(
type
=
'BBoxL1Cost'
,
weight
=
5.0
),
iou_cost
=
dict
(
type
=
'IoUCost'
,
iou_mode
=
'giou'
,
weight
=
2.0
))),
test_cfg
=
dict
(
max_per_img
=
100
),
init_cfg
=
None
,
**
kwargs
):
# NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
# since it brings inconvenience when the initialization of
# `AnchorFreeHead` is called.
super
(
AnchorFreeHead
,
self
).
__init__
(
init_cfg
)
self
.
bg_cls_weight
=
0
self
.
sync_cls_avg_factor
=
sync_cls_avg_factor
class_weight
=
loss_cls
.
get
(
'class_weight'
,
None
)
if
class_weight
is
not
None
and
(
self
.
__class__
is
DETRHead
):
# assert isinstance(class_weight, float), 'Expected ' \
# 'class_weight to have type float. Found ' \
# f'{type(class_weight)}.'
# NOTE following the official DETR rep0, bg_cls_weight means
# relative classification weight of the no-object class.
bg_cls_weight
=
loss_cls
.
get
(
'bg_cls_weight'
,
class_weight
)
assert
isinstance
(
bg_cls_weight
,
float
),
'Expected '
\
'bg_cls_weight to have type float. Found '
\
f
'
{
type
(
bg_cls_weight
)
}
.'
if
isinstance
(
class_weight
,
list
):
class_weight
.
append
(
bg_cls_weight
)
class_weight
=
np
.
array
(
class_weight
)
class_weight
=
torch
.
from_numpy
(
class_weight
)
class_weight
=
torch
.
ones
(
num_classes
+
1
)
*
class_weight
elif
isinstance
(
class_weight
,
float
):
class_weight
=
torch
.
ones
(
num_classes
+
1
)
*
class_weight
# set background class as the last indice
class_weight
[
num_classes
]
=
bg_cls_weight
loss_cls
.
update
({
'class_weight'
:
class_weight
})
if
'bg_cls_weight'
in
loss_cls
:
loss_cls
.
pop
(
'bg_cls_weight'
)
self
.
bg_cls_weight
=
bg_cls_weight
if
train_cfg
:
assert
'assigner'
in
train_cfg
,
'assigner should be provided '
\
'when train_cfg is set.'
assigner
=
train_cfg
[
'assigner'
]
# assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'],
# 'The classification weight for loss and matcher should be' \
# 'exactly the same.'
# assert loss_bbox['loss_weight'] == assigner['reg_cost'][
# 'weight'], 'The regression L1 weight for loss and matcher '\
# 'should be exactly the same.'
# assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'],
# 'The regression iou weight for loss and matcher should be' \
# 'exactly the same.'
self
.
assigner
=
build_assigner
(
assigner
)
# DETR sampling=False, so use PseudoSampler
sampler_cfg
=
dict
(
type
=
'PseudoSampler'
)
self
.
sampler
=
build_sampler
(
sampler_cfg
,
context
=
self
)
self
.
num_query
=
num_query
self
.
num_classes
=
num_classes
self
.
in_channels
=
in_channels
self
.
num_reg_fcs
=
num_reg_fcs
self
.
train_cfg
=
train_cfg
self
.
test_cfg
=
test_cfg
self
.
fp16_enabled
=
False
self
.
loss_cls
=
build_loss
(
loss_cls
)
self
.
loss_bbox
=
build_loss
(
loss_bbox
)
self
.
loss_iou
=
build_loss
(
loss_iou
)
if
self
.
loss_cls
.
use_sigmoid
:
self
.
cls_out_channels
=
num_classes
else
:
self
.
cls_out_channels
=
num_classes
+
1
self
.
act_cfg
=
transformer
.
get
(
'act_cfg'
,
dict
(
type
=
'ReLU'
,
inplace
=
True
))
self
.
activate
=
build_activation_layer
(
self
.
act_cfg
)
self
.
positional_encoding
=
build_positional_encoding
(
positional_encoding
)
self
.
transformer
=
build_transformer
(
transformer
)
self
.
embed_dims
=
self
.
transformer
.
embed_dims
assert
'num_feats'
in
positional_encoding
num_feats
=
positional_encoding
[
'num_feats'
]
assert
num_feats
*
2
==
self
.
embed_dims
,
'embed_dims should'
\
f
' be exactly 2 times of num_feats. Found
{
self
.
embed_dims
}
'
\
f
' and
{
num_feats
}
.'
self
.
_init_layers
()
def
_init_layers
(
self
):
"""Initialize layers of the transformer head."""
self
.
input_proj
=
Conv2d
(
self
.
in_channels
,
self
.
embed_dims
,
kernel_size
=
1
)
self
.
fc_cls
=
Linear
(
self
.
embed_dims
,
self
.
cls_out_channels
)
self
.
reg_ffn
=
FFN
(
self
.
embed_dims
,
self
.
embed_dims
,
self
.
num_reg_fcs
,
self
.
act_cfg
,
dropout
=
0.0
,
add_residual
=
False
)
self
.
fc_reg
=
Linear
(
self
.
embed_dims
,
4
)
self
.
query_embedding
=
nn
.
Embedding
(
self
.
num_query
,
self
.
embed_dims
)
    def init_weights(self):
        """Initialize weights of the transformer head."""
        # The initialization for transformer is important; delegate to the
        # transformer module's own init_weights() instead of generic init.
        self.transformer.init_weights()
    def _load_from_state_dict(self, state_dict, prefix, local_metadata,
                              strict, missing_keys, unexpected_keys,
                              error_msgs):
        """Load checkpoints, migrating legacy (version < 2) key names."""
        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
        # since `AnchorFreeHead._load_from_state_dict` should not be
        # called here. Invoking the default `Module._load_from_state_dict`
        # is enough.
        # Names of some parameters have been changed between versions;
        # old checkpoints are rewritten in place before loading.
        version = local_metadata.get('version', None)
        if (version is None or version < 2) and self.__class__ is DETRHead:
            # Map legacy attention/FFN/norm key fragments to the current
            # mmcv transformer naming scheme.
            convert_dict = {
                '.self_attn.': '.attentions.0.',
                '.ffn.': '.ffns.0.',
                '.multihead_attn.': '.attentions.1.',
                '.decoder.norm.': '.decoder.post_norm.'
            }
            # Snapshot the keys first: the dict is mutated in the loop.
            state_dict_keys = list(state_dict.keys())
            for k in state_dict_keys:
                for ori_key, convert_key in convert_dict.items():
                    if ori_key in k:
                        convert_key = k.replace(ori_key, convert_key)
                        state_dict[convert_key] = state_dict[k]
                        del state_dict[k]
        super(AnchorFreeHead, self)._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys,
            unexpected_keys, error_msgs)
def
forward
(
self
,
feats
,
img_metas
):
"""Forward function.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
img_metas (list[dict]): List of image information.
Returns:
tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.
- all_cls_scores_list (list[Tensor]): Classification scores
\
for each scale level. Each is a 4D-tensor with shape
\
[nb_dec, bs, num_query, cls_out_channels]. Note
\
`cls_out_channels` should includes background.
- all_bbox_preds_list (list[Tensor]): Sigmoid regression
\
outputs for each scale level. Each is a 4D-tensor with
\
normalized coordinate format (cx, cy, w, h) and shape
\
[nb_dec, bs, num_query, 4].
"""
num_levels
=
len
(
feats
)
img_metas_list
=
[
img_metas
for
_
in
range
(
num_levels
)]
return
multi_apply
(
self
.
forward_single
,
feats
,
img_metas_list
)
def
forward_single
(
self
,
x
,
img_metas
):
""""Forward function for a single feature level.
Args:
x (Tensor): Input feature from backbone's single stage, shape
[bs, c, h, w].
img_metas (list[dict]): List of image information.
Returns:
all_cls_scores (Tensor): Outputs from the classification head,
shape [nb_dec, bs, num_query, cls_out_channels]. Note
cls_out_channels should includes background.
all_bbox_preds (Tensor): Sigmoid outputs from the regression
head with normalized coordinate format (cx, cy, w, h).
Shape [nb_dec, bs, num_query, 4].
"""
# construct binary masks which used for the transformer.
# NOTE following the official DETR repo, non-zero values representing
# ignored positions, while zero values means valid positions.
batch_size
=
x
.
size
(
0
)
input_img_h
,
input_img_w
=
img_metas
[
0
][
'batch_input_shape'
]
masks
=
x
.
new_ones
((
batch_size
,
input_img_h
,
input_img_w
))
for
img_id
in
range
(
batch_size
):
img_h
,
img_w
,
_
=
img_metas
[
img_id
][
'img_shape'
]
masks
[
img_id
,
:
img_h
,
:
img_w
]
=
0
x
=
self
.
input_proj
(
x
)
# interpolate masks to have the same spatial shape with x
masks
=
F
.
interpolate
(
masks
.
unsqueeze
(
1
),
size
=
x
.
shape
[
-
2
:]).
to
(
torch
.
bool
).
squeeze
(
1
)
# position encoding
pos_embed
=
self
.
positional_encoding
(
masks
)
# [bs, embed_dim, h, w]
# outs_dec: [nb_dec, bs, num_query, embed_dim]
outs_dec
,
_
=
self
.
transformer
(
x
,
masks
,
self
.
query_embedding
.
weight
,
pos_embed
)
all_cls_scores
=
self
.
fc_cls
(
outs_dec
)
all_bbox_preds
=
self
.
fc_reg
(
self
.
activate
(
self
.
reg_ffn
(
outs_dec
))).
sigmoid
()
return
all_cls_scores
,
all_bbox_preds
@
force_fp32
(
apply_to
=
(
'all_cls_scores_list'
,
'all_bbox_preds_list'
))
def
loss
(
self
,
all_cls_scores_list
,
all_bbox_preds_list
,
gt_bboxes_list
,
gt_labels_list
,
img_metas
,
gt_bboxes_ignore
=
None
):
""""Loss function.
Only outputs from the last feature level are used for computing
losses by default.
Args:
all_cls_scores_list (list[Tensor]): Classification outputs
for each feature level. Each is a 4D-tensor with shape
[nb_dec, bs, num_query, cls_out_channels].
all_bbox_preds_list (list[Tensor]): Sigmoid regression
outputs for each feature level. Each is a 4D-tensor with
normalized coordinate format (cx, cy, w, h) and shape
[nb_dec, bs, num_query, 4].
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
which can be ignored for each image. Default None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
# NOTE defaultly only the outputs from the last feature scale is used.
all_cls_scores
=
all_cls_scores_list
[
-
1
]
all_bbox_preds
=
all_bbox_preds_list
[
-
1
]
assert
gt_bboxes_ignore
is
None
,
\
'Only supports for gt_bboxes_ignore setting to None.'
num_dec_layers
=
len
(
all_cls_scores
)
all_gt_bboxes_list
=
[
gt_bboxes_list
for
_
in
range
(
num_dec_layers
)]
all_gt_labels_list
=
[
gt_labels_list
for
_
in
range
(
num_dec_layers
)]
all_gt_bboxes_ignore_list
=
[
gt_bboxes_ignore
for
_
in
range
(
num_dec_layers
)
]
img_metas_list
=
[
img_metas
for
_
in
range
(
num_dec_layers
)]
losses_cls
,
losses_bbox
,
losses_iou
=
multi_apply
(
self
.
loss_single
,
all_cls_scores
,
all_bbox_preds
,
all_gt_bboxes_list
,
all_gt_labels_list
,
img_metas_list
,
all_gt_bboxes_ignore_list
)
loss_dict
=
dict
()
# loss from the last decoder layer
loss_dict
[
'loss_cls'
]
=
losses_cls
[
-
1
]
loss_dict
[
'loss_bbox'
]
=
losses_bbox
[
-
1
]
loss_dict
[
'loss_iou'
]
=
losses_iou
[
-
1
]
# loss from other decoder layers
num_dec_layer
=
0
for
loss_cls_i
,
loss_bbox_i
,
loss_iou_i
in
zip
(
losses_cls
[:
-
1
],
losses_bbox
[:
-
1
],
losses_iou
[:
-
1
]):
loss_dict
[
f
'd
{
num_dec_layer
}
.loss_cls'
]
=
loss_cls_i
loss_dict
[
f
'd
{
num_dec_layer
}
.loss_bbox'
]
=
loss_bbox_i
loss_dict
[
f
'd
{
num_dec_layer
}
.loss_iou'
]
=
loss_iou_i
num_dec_layer
+=
1
return
loss_dict
def
get_fed_loss_classes
(
self
,
gt_classes
,
num_fed_loss_classes
,
num_classes
,
weight
):
"""
Args:
gt_classes: a long tensor of shape R that contains the gt class label of each proposal.
num_fed_loss_classes: minimum number of classes to keep when calculating federated loss.
Will sample negative classes if number of unique gt_classes is smaller than this value.
num_classes: number of foreground classes
weight: probabilities used to sample negative classes
Returns:
Tensor:
classes to keep when calculating the federated loss, including both unique gt
classes and sampled negative classes.
"""
unique_gt_classes
=
torch
.
unique
(
gt_classes
)
prob
=
unique_gt_classes
.
new_ones
(
num_classes
+
1
).
float
()
prob
[
-
1
]
=
0
if
len
(
unique_gt_classes
)
<
num_fed_loss_classes
:
prob
[:
num_classes
]
=
weight
.
float
().
clone
()
prob
[
unique_gt_classes
]
=
0
sampled_negative_classes
=
torch
.
multinomial
(
prob
,
num_fed_loss_classes
-
len
(
unique_gt_classes
),
replacement
=
False
)
fed_loss_classes
=
torch
.
cat
([
unique_gt_classes
,
sampled_negative_classes
])
else
:
fed_loss_classes
=
unique_gt_classes
return
fed_loss_classes
    def loss_single(self,
                    cls_scores,
                    bbox_preds,
                    gt_bboxes_list,
                    gt_labels_list,
                    img_metas,
                    gt_bboxes_ignore_list=None):
        """Loss function for outputs from a single decoder layer of a single
        feature level.

        Args:
            cls_scores (Tensor): Box score logits from a single decoder
                layer for all images, [bs, num_query, cls_out_channels].
            bbox_preds (Tensor): Sigmoid outputs from a single decoder
                layer for all images, normalized (cx, cy, w, h),
                [bs, num_query, 4].
            gt_bboxes_list (list[Tensor]): Ground truth bboxes per image,
                (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices per
                image, (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore_list (list[Tensor], optional): Must be None.

        Returns:
            tuple[Tensor]: (loss_cls, loss_bbox, loss_iou) for this layer.
        """
        num_imgs = cls_scores.size(0)
        # Split batched predictions into per-image lists for get_targets.
        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
                                           gt_bboxes_list, gt_labels_list,
                                           img_metas, gt_bboxes_ignore_list)
        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets
        # Flatten per-image targets to match the flattened predictions.
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            # Average the normalizer across ranks for consistent scaling.
            cls_avg_factor = reduce_mean(
                cls_scores.new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)

        loss_cls = self.loss_cls(
            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # construct factors used for rescale bboxes
        factors = []
        for img_meta, bbox_pred in zip(img_metas, bbox_preds):
            img_h, img_w, _ = img_meta['img_shape']
            # One (w, h, w, h) row per query of this image.
            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                           img_h]).unsqueeze(0).repeat(
                                               bbox_pred.size(0), 1)
            factors.append(factor)
        factors = torch.cat(factors, 0)

        # DETR regress the relative position of boxes (cxcywh) in the image,
        # thus the learning target is normalized by the image size. So here
        # we need to re-scale them for calculating IoU loss
        bbox_preds = bbox_preds.reshape(-1, 4)
        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

        # regression IoU loss, defaultly GIoU loss
        loss_iou = self.loss_iou(
            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

        # regression L1 loss (on the normalized cxcywh parameterization)
        loss_bbox = self.loss_bbox(
            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
        return loss_cls, loss_bbox, loss_iou
def
get_targets
(
self
,
cls_scores_list
,
bbox_preds_list
,
gt_bboxes_list
,
gt_labels_list
,
img_metas
,
gt_bboxes_ignore_list
=
None
):
""""Compute regression and classification targets for a batch image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_scores_list (list[Tensor]): Box score logits from a single
decoder layer for each image with shape [num_query,
cls_out_channels].
bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
decoder layer for each image, with normalized coordinate
(cx, cy, w, h) and shape [num_query, 4].
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
gt_bboxes_ignore_list (list[Tensor], optional): Bounding
boxes which can be ignored for each image. Default None.
Returns:
tuple: a tuple containing the following targets.
- labels_list (list[Tensor]): Labels for all images.
- label_weights_list (list[Tensor]): Label weights for all
\
images.
- bbox_targets_list (list[Tensor]): BBox targets for all
\
images.
- bbox_weights_list (list[Tensor]): BBox weights for all
\
images.
- num_total_pos (int): Number of positive samples in all
\
images.
- num_total_neg (int): Number of negative samples in all
\
images.
"""
assert
gt_bboxes_ignore_list
is
None
,
\
'Only supports for gt_bboxes_ignore setting to None.'
num_imgs
=
len
(
cls_scores_list
)
gt_bboxes_ignore_list
=
[
gt_bboxes_ignore_list
for
_
in
range
(
num_imgs
)
]
(
labels_list
,
label_weights_list
,
bbox_targets_list
,
bbox_weights_list
,
pos_inds_list
,
neg_inds_list
)
=
multi_apply
(
self
.
_get_target_single
,
cls_scores_list
,
bbox_preds_list
,
gt_bboxes_list
,
gt_labels_list
,
img_metas
,
gt_bboxes_ignore_list
)
num_total_pos
=
sum
((
inds
.
numel
()
for
inds
in
pos_inds_list
))
num_total_neg
=
sum
((
inds
.
numel
()
for
inds
in
neg_inds_list
))
return
(
labels_list
,
label_weights_list
,
bbox_targets_list
,
bbox_weights_list
,
num_total_pos
,
num_total_neg
)
def
_get_area_thr
(
self
,
img_shape
,
type
):
MIN_V
=
0
MAX_V
=
1e10
short_edge
=
min
(
img_shape
[
0
],
img_shape
[
1
])
if
type
==
'v1'
:
DELTA
=
4
if
short_edge
<=
600
:
min_edge
=
128
-
DELTA
max_edge
=
MAX_V
elif
600
<
short_edge
<=
800
:
min_edge
=
96
-
DELTA
max_edge
=
MAX_V
elif
800
<
short_edge
<=
1000
:
min_edge
=
64
-
DELTA
max_edge
=
MAX_V
elif
1000
<
short_edge
<=
1200
:
min_edge
=
32
-
DELTA
max_edge
=
MAX_V
elif
1200
<
short_edge
<=
1400
:
min_edge
=
MIN_V
max_edge
=
MAX_V
else
:
min_edge
=
MIN_V
max_edge
=
2
+
DELTA
elif
type
==
'v2'
:
if
short_edge
<=
1000
:
min_edge
=
112
max_edge
=
MAX_V
elif
1000
<
short_edge
<=
1400
:
min_edge
=
32
max_edge
=
160
elif
short_edge
>
1400
:
min_edge
=
0
max_edge
=
80
elif
type
==
'v3'
:
if
short_edge
<=
800
:
min_edge
=
96
max_edge
=
MAX_V
elif
800
<
short_edge
<=
1000
:
min_edge
=
64
max_edge
=
MAX_V
elif
1000
<
short_edge
<=
1400
:
min_edge
=
MIN_V
max_edge
=
MAX_V
elif
1400
<
short_edge
<=
1600
:
min_edge
=
MIN_V
max_edge
=
96
elif
short_edge
>
1600
:
min_edge
=
MIN_V
max_edge
=
64
elif
type
==
'v4'
:
DELTA
=
4
if
short_edge
<=
800
:
min_edge
=
96
-
DELTA
max_edge
=
MAX_V
elif
800
<
short_edge
<=
1000
:
min_edge
=
64
-
DELTA
max_edge
=
MAX_V
elif
1000
<
short_edge
<=
1400
:
min_edge
=
MIN_V
max_edge
=
MAX_V
elif
1400
<
short_edge
<=
1600
:
min_edge
=
MIN_V
max_edge
=
64
+
DELTA
elif
short_edge
>
1600
:
min_edge
=
MIN_V
max_edge
=
32
+
DELTA
return
min_edge
**
2
,
max_edge
**
2
def
_get_target_single
(
self
,
cls_score
,
bbox_pred
,
gt_bboxes
,
gt_labels
,
img_meta
,
gt_bboxes_ignore
=
None
):
""""Compute regression and classification targets for one image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_score (Tensor): Box score logits from a single decoder layer
for one image. Shape [num_query, cls_out_channels].
bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
for one image, with normalized coordinate (cx, cy, w, h) and
shape [num_query, 4].
gt_bboxes (Tensor): Ground truth bboxes for one image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (Tensor): Ground truth class indices for one image
with shape (num_gts, ).
img_meta (dict): Meta information for one image.
gt_bboxes_ignore (Tensor, optional): Bounding boxes
which can be ignored. Default None.
Returns:
tuple[Tensor]: a tuple containing the following for one image.
- labels (Tensor): Labels of each image.
- label_weights (Tensor]): Label weights of each image.
- bbox_targets (Tensor): BBox targets of each image.
- bbox_weights (Tensor): BBox weights of each image.
- pos_inds (Tensor): Sampled positive indices for each image.
- neg_inds (Tensor): Sampled negative indices for each image.
"""
num_bboxes
=
bbox_pred
.
size
(
0
)
# assigner and sampler
assign_result
=
self
.
assigner
.
assign
(
bbox_pred
,
cls_score
,
gt_bboxes
,
gt_labels
,
img_meta
,
gt_bboxes_ignore
)
sampling_result
=
self
.
sampler
.
sample
(
assign_result
,
bbox_pred
,
gt_bboxes
)
pos_inds
=
sampling_result
.
pos_inds
neg_inds
=
sampling_result
.
neg_inds
# label targets
labels
=
gt_bboxes
.
new_full
((
num_bboxes
,
),
self
.
num_classes
,
dtype
=
torch
.
long
)
labels
[
pos_inds
]
=
gt_labels
[
sampling_result
.
pos_assigned_gt_inds
]
label_weights
=
gt_bboxes
.
new_ones
(
num_bboxes
)
# bbox targets
bbox_targets
=
torch
.
zeros_like
(
bbox_pred
)
bbox_weights
=
torch
.
zeros_like
(
bbox_pred
)
bbox_weights
[
pos_inds
]
=
1.0
img_h
,
img_w
,
_
=
img_meta
[
'img_shape'
]
# DETR regress the relative position of boxes (cxcywh) in the image.
# Thus the learning target should be normalized by the image size, also
# the box format should be converted from defaultly x1y1x2y2 to cxcywh.
factor
=
bbox_pred
.
new_tensor
([
img_w
,
img_h
,
img_w
,
img_h
]).
unsqueeze
(
0
)
pos_gt_bboxes_normalized
=
sampling_result
.
pos_gt_bboxes
/
factor
pos_gt_bboxes_targets
=
bbox_xyxy_to_cxcywh
(
pos_gt_bboxes_normalized
)
bbox_targets
[
pos_inds
]
=
pos_gt_bboxes_targets
return
(
labels
,
label_weights
,
bbox_targets
,
bbox_weights
,
pos_inds
,
neg_inds
)
# over-write because img_metas are needed as inputs for bbox_head.
def
forward_train
(
self
,
x
,
img_metas
,
gt_bboxes
,
gt_labels
=
None
,
gt_bboxes_ignore
=
None
,
proposal_cfg
=
None
,
**
kwargs
):
"""Forward function for training mode.
Args:
x (list[Tensor]): Features from backbone.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes (Tensor): Ground truth bboxes of the image,
shape (num_gts, 4).
gt_labels (Tensor): Ground truth labels of each box,
shape (num_gts,).
gt_bboxes_ignore (Tensor): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
proposal_cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert
proposal_cfg
is
None
,
'"proposal_cfg" must be None'
outs
=
self
(
x
,
img_metas
)
if
gt_labels
is
None
:
loss_inputs
=
outs
+
(
gt_bboxes
,
img_metas
)
else
:
loss_inputs
=
outs
+
(
gt_bboxes
,
gt_labels
,
img_metas
)
losses
=
self
.
loss
(
*
loss_inputs
,
gt_bboxes_ignore
=
gt_bboxes_ignore
)
return
losses
@
force_fp32
(
apply_to
=
(
'all_cls_scores_list'
,
'all_bbox_preds_list'
))
def
get_bboxes
(
self
,
all_cls_scores_list
,
all_bbox_preds_list
,
img_metas
,
rescale
=
False
):
"""Transform network outputs for a batch into bbox predictions.
Args:
all_cls_scores_list (list[Tensor]): Classification outputs
for each feature level. Each is a 4D-tensor with shape
[nb_dec, bs, num_query, cls_out_channels].
all_bbox_preds_list (list[Tensor]): Sigmoid regression
outputs for each feature level. Each is a 4D-tensor with
normalized coordinate format (cx, cy, w, h) and shape
[nb_dec, bs, num_query, 4].
img_metas (list[dict]): Meta information of each image.
rescale (bool, optional): If True, return boxes in original
image space. Default False.
Returns:
list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple.
\
The first item is an (n, 5) tensor, where the first 4 columns
\
are bounding box positions (tl_x, tl_y, br_x, br_y) and the
\
5-th column is a score between 0 and 1. The second item is a
\
(n,) tensor where each item is the predicted class label of
\
the corresponding box.
"""
# NOTE defaultly only using outputs from the last feature level,
# and only the outputs from the last decoder layer is used.
cls_scores
=
all_cls_scores_list
[
-
1
][
-
1
]
bbox_preds
=
all_bbox_preds_list
[
-
1
][
-
1
]
result_list
=
[]
for
img_id
in
range
(
len
(
img_metas
)):
cls_score
=
cls_scores
[
img_id
]
bbox_pred
=
bbox_preds
[
img_id
]
img_shape
=
img_metas
[
img_id
][
'img_shape'
]
scale_factor
=
img_metas
[
img_id
][
'scale_factor'
]
proposals
=
self
.
_get_bboxes_single
(
cls_score
,
bbox_pred
,
img_shape
,
scale_factor
,
rescale
)
result_list
.
append
(
proposals
)
return
result_list
    def _get_bboxes_single(self,
                           cls_score,
                           bbox_pred,
                           img_shape,
                           scale_factor,
                           rescale=False):
        """Transform outputs from the last decoder layer into bbox predictions
        for each image.

        Args:
            cls_score (Tensor): Box score logits from the last decoder layer
                for each image. Shape [num_query, cls_out_channels].
            bbox_pred (Tensor): Sigmoid outputs from the last decoder layer
                for each image, with coordinate format (cx, cy, w, h) and
                shape [num_query, 4].
            img_shape (tuple[int]): Shape of input image, (height, width, 3).
            scale_factor (ndarray, optional): Scale factor of the image arange
                as (w_scale, h_scale, w_scale, h_scale).
            rescale (bool, optional): If True, return boxes in original image
                space. Default False.

        Returns:
            tuple[Tensor]: Results of detected bboxes and labels.

                - det_bboxes: Predicted bboxes with shape [num_query, 5],
                  where the first 4 columns are bounding box positions
                  (tl_x, tl_y, br_x, br_y) and the 5-th column are scores
                  between 0 and 1.
                - det_labels: Predicted labels of the corresponding box with
                  shape [num_query].
        """
        assert len(cls_score) == len(bbox_pred)
        max_per_img = self.test_cfg.get('max_per_img', self.num_query)
        # exclude background
        if self.loss_cls.use_sigmoid:
            cls_score = cls_score.sigmoid()
            # Flatten (num_query, num_classes) so one topk ranks all
            # query/class pairs; query and class indices are recovered
            # below with // and %.
            scores, indexes = cls_score.view(-1).topk(max_per_img)
            det_labels = indexes % self.num_classes
            bbox_index = indexes // self.num_classes
            bbox_pred = bbox_pred[bbox_index]
        else:
            # Softmax scores; [..., :-1] drops the background column before
            # taking the per-query best class.
            scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1)
            scores, bbox_index = scores.topk(max_per_img)
            bbox_pred = bbox_pred[bbox_index]
            det_labels = det_labels[bbox_index]

        # Normalized (cx, cy, w, h) -> (x1, y1, x2, y2) in input-image pixels.
        det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
        det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
        det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
        # Clip in place to the input-image boundary.
        det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
        det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
        if rescale:
            # Undo the resize augmentation to return boxes in the original
            # image space.
            det_bboxes /= det_bboxes.new_tensor(scale_factor)
        # Append the score as the 5-th column.
        det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(1)), -1)

        return det_bboxes, det_labels
def
simple_test_bboxes
(
self
,
feats
,
img_metas
,
rescale
=
False
):
"""Test det bboxes without test-time augmentation.
Args:
feats (tuple[torch.Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
img_metas (list[dict]): List of image information.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
Returns:
list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
The first item is ``bboxes`` with shape (n, 5),
where 5 represent (tl_x, tl_y, br_x, br_y, score).
The shape of the second tensor in the tuple is ``labels``
with shape (n,)
"""
# forward of this head requires img_metas
outs
=
self
.
forward
(
feats
,
img_metas
)
results_list
=
self
.
get_bboxes
(
*
outs
,
img_metas
,
rescale
=
rescale
)
return
results_list
def
forward_onnx
(
self
,
feats
,
img_metas
):
"""Forward function for exporting to ONNX.
Over-write `forward` because: `masks` is directly created with
zero (valid position tag) and has the same spatial size as `x`.
Thus the construction of `masks` is different from that in `forward`.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
img_metas (list[dict]): List of image information.
Returns:
tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.
- all_cls_scores_list (list[Tensor]): Classification scores
\
for each scale level. Each is a 4D-tensor with shape
\
[nb_dec, bs, num_query, cls_out_channels]. Note
\
`cls_out_channels` should includes background.
- all_bbox_preds_list (list[Tensor]): Sigmoid regression
\
outputs for each scale level. Each is a 4D-tensor with
\
normalized coordinate format (cx, cy, w, h) and shape
\
[nb_dec, bs, num_query, 4].
"""
num_levels
=
len
(
feats
)
img_metas_list
=
[
img_metas
for
_
in
range
(
num_levels
)]
return
multi_apply
(
self
.
forward_single_onnx
,
feats
,
img_metas_list
)
    def forward_single_onnx(self, x, img_metas):
        """"Forward function for a single feature level with ONNX exportation.

        Args:
            x (Tensor): Input feature from backbone's single stage, shape
                [bs, c, h, w].
            img_metas (list[dict]): List of image information.

        Returns:
            all_cls_scores (Tensor): Outputs from the classification head,
                shape [nb_dec, bs, num_query, cls_out_channels]. Note
                cls_out_channels should includes background.
            all_bbox_preds (Tensor): Sigmoid outputs from the regression
                head with normalized coordinate format (cx, cy, w, h).
                Shape [nb_dec, bs, num_query, 4].
        """
        # Note `img_shape` is not dynamically traceable to ONNX,
        # since the related augmentation was done with numpy under
        # CPU. Thus `masks` is directly created with zeros (valid tag)
        # and the same spatial shape as `x`.
        # The difference between torch and exported ONNX model may be
        # ignored, since the same performance is achieved (e.g.
        # 40.1 vs 40.1 for DETR)
        batch_size = x.size(0)
        h, w = x.size()[-2:]
        # All-zero mask == every position valid (no padding assumed here).
        masks = x.new_zeros((batch_size, h, w))  # [B,h,w]

        x = self.input_proj(x)
        # interpolate masks to have the same spatial shape with x
        masks = F.interpolate(
            masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1)
        pos_embed = self.positional_encoding(masks)
        outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight,
                                       pos_embed)

        all_cls_scores = self.fc_cls(outs_dec)
        # Regression FFN + activation, then sigmoid to get normalized
        # (cx, cy, w, h) in [0, 1].
        all_bbox_preds = self.fc_reg(self.activate(
            self.reg_ffn(outs_dec))).sigmoid()
        return all_cls_scores, all_bbox_preds
    def onnx_export(self, all_cls_scores_list, all_bbox_preds_list,
                    img_metas):
        """Transform network outputs into bbox predictions, with ONNX
        exportation.

        Args:
            all_cls_scores_list (list[Tensor]): Classification outputs
                for each feature level. Each is a 4D-tensor with shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds_list (list[Tensor]): Sigmoid regression
                outputs for each feature level. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            img_metas (list[dict]): Meta information of each image.

        Returns:
            tuple[Tensor, Tensor]: dets of shape [N, num_det, 5]
                and class labels of shape [N, num_det].
        """
        assert len(img_metas) == 1, \
            'Only support one input image while in exporting to ONNX'
        # Only the last feature level and the last decoder layer are used.
        cls_scores = all_cls_scores_list[-1][-1]
        bbox_preds = all_bbox_preds_list[-1][-1]
        # Note `img_shape` is not dynamically traceable to ONNX,
        # here `img_shape_for_onnx` (padded shape of image tensor)
        # is used.
        img_shape = img_metas[0]['img_shape_for_onnx']
        max_per_img = self.test_cfg.get('max_per_img', self.num_query)
        batch_size = cls_scores.size(0)
        # `batch_index_offset` is used for the gather of concatenated tensor
        batch_index_offset = torch.arange(batch_size).to(
            cls_scores.device) * max_per_img
        batch_index_offset = batch_index_offset.unsqueeze(1).expand(
            batch_size, max_per_img)

        # supports dynamical batch inference
        if self.loss_cls.use_sigmoid:
            cls_scores = cls_scores.sigmoid()
            # topk over flattened (query, class) pairs per image; class and
            # query indices are recovered with % and // below.
            scores, indexes = cls_scores.view(batch_size, -1).topk(
                max_per_img, dim=1)
            det_labels = indexes % self.num_classes
            bbox_index = indexes // self.num_classes
            # Offset per-image indices into the flattened batch so a single
            # gather works on the concatenated predictions.
            bbox_index = (bbox_index + batch_index_offset).view(-1)
            bbox_preds = bbox_preds.view(-1, 4)[bbox_index]
            bbox_preds = bbox_preds.view(batch_size, -1, 4)
        else:
            # Drop the background column ([..., :-1]) before the per-query
            # best class, then keep the topk queries per image.
            scores, det_labels = F.softmax(
                cls_scores, dim=-1)[..., :-1].max(-1)
            scores, bbox_index = scores.topk(max_per_img, dim=1)
            bbox_index = (bbox_index + batch_index_offset).view(-1)
            bbox_preds = bbox_preds.view(-1, 4)[bbox_index]
            det_labels = det_labels.view(-1)[bbox_index]
            bbox_preds = bbox_preds.view(batch_size, -1, 4)
            det_labels = det_labels.view(batch_size, -1)

        det_bboxes = bbox_cxcywh_to_xyxy(bbox_preds)
        # use `img_shape_tensor` for dynamically exporting to ONNX
        img_shape_tensor = img_shape.flip(0).repeat(2)  # [w,h,w,h]
        img_shape_tensor = img_shape_tensor.unsqueeze(0).unsqueeze(0).expand(
            batch_size, det_bboxes.size(1), 4)
        det_bboxes = det_bboxes * img_shape_tensor
        # dynamically clip bboxes
        x1, y1, x2, y2 = det_bboxes.split((1, 1, 1, 1), dim=-1)
        # Imported locally: only needed on the ONNX export path.
        from mmdet.core.export import dynamic_clip_for_onnx
        x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, img_shape)
        det_bboxes = torch.cat([x1, y1, x2, y2], dim=-1)
        # Append scores as the 5-th column.
        det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(-1)), -1)
        return det_bboxes, det_labels
detection/mmdet_custom/models/dense_heads/dino_head.py
0 → 100644
View file @
bdd98bcb
# Copyright (c) OpenMMLab. All rights reserved.
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmdet.core
import
(
bbox_cxcywh_to_xyxy
,
bbox_xyxy_to_cxcywh
,
multi_apply
,
reduce_mean
)
from
..utils
import
build_dn_generator
from
mmdet.models.utils.transformer
import
inverse_sigmoid
from
mmdet.models.builder
import
HEADS
from
.deformable_detr_head
import
DeformableDETRHead
from
mmcv.runner
import
force_fp32
@HEADS.register_module()
class DINOHead(DeformableDETRHead):
    """Head for DINO (DETR with Improved deNoising anchor boxes).

    Extends ``DeformableDETRHead`` with contrastive query denoising: during
    training, noised copies of the ground-truth boxes/labels are appended as
    extra queries and supervised separately from the ordinary matching
    queries.

    Args:
        dn_cfg (dict, optional): Config of the denoising query generator.
            ``num_classes``, ``num_queries`` and ``hidden_dim`` are filled in
            from the head itself (see :meth:`init_denoising`). ``None``
            disables denoising.
    """

    def __init__(self, *args, dn_cfg=None, **kwargs):
        super(DINOHead, self).__init__(*args, **kwargs)
        self._init_layers()
        self.init_denoising(dn_cfg)
        # The forward/loss below assume two-stage encoder proposals and
        # iterative box refinement are both enabled.
        assert self.as_two_stage, \
            'as_two_stage must be True for DINO'
        assert self.with_box_refine, \
            'with_box_refine must be True for DINO'

    def _init_layers(self):
        """Initialize layers, adding the denoising label embedding."""
        super()._init_layers()
        # NOTE The original repo of DINO set the num_embeddings 92 for coco,
        # 91 (0~90) of which represents target classes and the 92 (91)
        # indicates [Unknown] class. However, the embedding of unknown class
        # is not used in the original DINO
        self.label_embedding = nn.Embedding(self.cls_out_channels,
                                            self.embed_dims)

    def init_denoising(self, dn_cfg):
        """Build the denoising query generator from ``dn_cfg``.

        The head fills in the fields that must agree with its own config.
        """
        if dn_cfg is not None:
            dn_cfg['num_classes'] = self.num_classes
            dn_cfg['num_queries'] = self.num_query
            dn_cfg['hidden_dim'] = self.embed_dims
        self.dn_generator = build_dn_generator(dn_cfg)

    def forward_train(self,
                      x,
                      img_metas,
                      gt_bboxes,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      proposal_cfg=None,
                      **kwargs):
        """Forward and compute losses, with denoising queries injected."""
        assert proposal_cfg is None, '"proposal_cfg" must be None'
        assert self.dn_generator is not None, '"dn_cfg" must be set'
        # Generate noised GT queries plus the attention mask that keeps
        # matching queries from attending to them.
        dn_label_query, dn_bbox_query, attn_mask, dn_meta = \
            self.dn_generator(gt_bboxes, gt_labels,
                              self.label_embedding, img_metas)
        outs = self(x, img_metas, dn_label_query, dn_bbox_query, attn_mask)
        if gt_labels is None:
            loss_inputs = outs + (gt_bboxes, img_metas, dn_meta)
        else:
            loss_inputs = outs + (gt_bboxes, gt_labels, img_metas, dn_meta)
        losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        return losses

    def forward(self,
                mlvl_feats,
                img_metas,
                dn_label_query=None,
                dn_bbox_query=None,
                attn_mask=None):
        """Forward pass over multi-level features.

        Returns:
            tuple: (outputs_classes, outputs_coords, topk_score, topk_anchor)
                stacked over decoder layers; the topk outputs come from the
                two-stage encoder proposals.
        """
        batch_size = mlvl_feats[0].size(0)
        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
        # Padding mask: 1 marks padded pixels, 0 marks valid image content.
        img_masks = mlvl_feats[0].new_ones(
            (batch_size, input_img_h, input_img_w))
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            img_masks[img_id, :img_h, :img_w] = 0

        mlvl_masks = []
        mlvl_positional_encodings = []
        for feat in mlvl_feats:
            mlvl_masks.append(
                F.interpolate(img_masks[None],
                              size=feat.shape[-2:]).to(torch.bool).squeeze(0))
            mlvl_positional_encodings.append(
                self.positional_encoding(mlvl_masks[-1]))

        query_embeds = None
        hs, inter_references, topk_score, topk_anchor = \
            self.transformer(
                mlvl_feats,
                mlvl_masks,
                query_embeds,
                mlvl_positional_encodings,
                dn_label_query,
                dn_bbox_query,
                attn_mask,
                reg_branches=self.reg_branches if self.with_box_refine else None,  # noqa:E501
                cls_branches=self.cls_branches if self.as_two_stage else None  # noqa:E501
            )
        hs = hs.permute(0, 2, 1, 3)

        if dn_label_query is not None and dn_label_query.size(1) == 0:
            # NOTE: If there is no target in the image, the parameters of
            # label_embedding won't be used in producing loss, which raises
            # RuntimeError when using distributed mode.
            hs[0] += self.label_embedding.weight[0, 0] * 0.0

        outputs_classes = []
        outputs_coords = []

        for lvl in range(hs.shape[0]):
            reference = inter_references[lvl]
            reference = inverse_sigmoid(reference, eps=1e-3)
            outputs_class = self.cls_branches[lvl](hs[lvl])
            tmp = self.reg_branches[lvl](hs[lvl])
            # Refine: add the regression delta to the reference in logit
            # space, then squash back to [0, 1].
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)

        outputs_classes = torch.stack(outputs_classes)
        outputs_coords = torch.stack(outputs_coords)

        return outputs_classes, outputs_coords, topk_score, topk_anchor

    @force_fp32(apply_to=('all_cls_scores', 'all_bbox_preds'))
    def loss(self,
             all_cls_scores,
             all_bbox_preds,
             enc_topk_scores,
             enc_topk_anchors,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             dn_meta=None,
             gt_bboxes_ignore=None):
        """Compute encoder, per-decoder-layer and denoising losses.

        Returns:
            dict[str, Tensor]: ``interm_*`` (encoder), ``loss_*`` /
                ``d{i}.loss_*`` (decoder layers) and ``dn_loss_*`` /
                ``d{i}.dn_loss_*`` (denoising) entries.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'for gt_bboxes_ignore setting to None.'
        loss_dict = dict()

        # extract denoising and matching part of outputs
        all_cls_scores, all_bbox_preds, dn_cls_scores, dn_bbox_preds = \
            self.extract_dn_outputs(all_cls_scores, all_bbox_preds, dn_meta)

        if enc_topk_scores is not None:
            # calculate loss from encode feature maps
            # NOTE The DeformDETR calculate binary cls loss
            # for all encoder embeddings, while DINO calculate
            # multi-class loss for topk embeddings.
            enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
                self.loss_single(enc_topk_scores, enc_topk_anchors,
                                 gt_bboxes_list, gt_labels_list,
                                 img_metas, gt_bboxes_ignore)

            # collate loss from encode feature maps
            loss_dict['interm_loss_cls'] = enc_loss_cls
            loss_dict['interm_loss_bbox'] = enc_losses_bbox
            loss_dict['interm_loss_iou'] = enc_losses_iou

        # calculate loss from all decoder layers
        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]
        losses_cls, losses_bbox, losses_iou = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        # collate loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]

        # collate loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
                                                       losses_bbox[:-1],
                                                       losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1

        if dn_cls_scores is not None:
            # calculate denoising loss from all decoder layers
            dn_meta = [dn_meta for _ in img_metas]
            dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn(
                dn_cls_scores, dn_bbox_preds, gt_bboxes_list, gt_labels_list,
                img_metas, dn_meta)
            # collate denoising loss
            loss_dict['dn_loss_cls'] = dn_losses_cls[-1]
            loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1]
            loss_dict['dn_loss_iou'] = dn_losses_iou[-1]
            num_dec_layer = 0
            for loss_cls_i, loss_bbox_i, loss_iou_i in zip(
                    dn_losses_cls[:-1], dn_losses_bbox[:-1],
                    dn_losses_iou[:-1]):
                loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
                loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
                loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i
                num_dec_layer += 1
        return loss_dict

    def loss_dn(self, dn_cls_scores, dn_bbox_preds, gt_bboxes_list,
                gt_labels_list, img_metas, dn_meta):
        """Compute denoising losses for every decoder layer."""
        num_dec_layers = len(dn_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]
        dn_meta_list = [dn_meta for _ in range(num_dec_layers)]
        return multi_apply(self.loss_dn_single, dn_cls_scores, dn_bbox_preds,
                           all_gt_bboxes_list, all_gt_labels_list,
                           img_metas_list, dn_meta_list)

    def loss_dn_single(self, dn_cls_scores, dn_bbox_preds, gt_bboxes_list,
                       gt_labels_list, img_metas, dn_meta):
        """Compute denoising cls/bbox/iou losses for a single decoder layer."""
        num_imgs = dn_cls_scores.size(0)
        bbox_preds_list = [dn_bbox_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_dn_target(bbox_preds_list, gt_bboxes_list,
                                             gt_labels_list, img_metas,
                                             dn_meta)
        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = dn_cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = \
            num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                cls_scores.new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)

        if len(cls_scores) > 0:
            loss_cls = self.loss_cls(
                cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
        else:
            # TODO: How to better return zero loss
            loss_cls = torch.zeros(
                1, dtype=cls_scores.dtype, device=cls_scores.device)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # construct factors used for rescale bboxes
        factors = []
        for img_meta, bbox_pred in zip(img_metas, dn_bbox_preds):
            img_h, img_w, _ = img_meta['img_shape']
            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                           img_h]).unsqueeze(0).repeat(
                                               bbox_pred.size(0), 1)
            factors.append(factor)
        factors = torch.cat(factors, 0)

        # DETR regress the relative position of boxes (cxcywh) in the image,
        # thus the learning target is normalized by the image size. So here
        # we need to re-scale them for calculating IoU loss
        bbox_preds = dn_bbox_preds.reshape(-1, 4)
        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

        # regression IoU loss, defaultly GIoU loss
        loss_iou = self.loss_iou(
            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

        # regression L1 loss
        loss_bbox = self.loss_bbox(
            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
        return loss_cls, loss_bbox, loss_iou

    def get_dn_target(self, dn_bbox_preds_list, gt_bboxes_list, gt_labels_list,
                      img_metas, dn_meta):
        """Compute denoising targets for every image in the batch."""
        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
             self._get_dn_target_single, dn_bbox_preds_list, gt_bboxes_list,
             gt_labels_list, img_metas, dn_meta)
        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos, num_total_neg)

    def _get_dn_target_single(self, dn_bbox_pred, gt_bboxes, gt_labels,
                              img_meta, dn_meta):
        """Compute denoising targets for a single image.

        Positive denoising queries occupy the first ``len(gt_labels)`` slots
        of each group; the matching negatives sit ``single_pad // 2`` slots
        later (the layout produced by the query generator).
        """
        num_groups = dn_meta['num_dn_group']
        pad_size = dn_meta['pad_size']
        assert pad_size % num_groups == 0
        single_pad = pad_size // num_groups
        num_bboxes = dn_bbox_pred.size(0)
        device = dn_bbox_pred.device
        if len(gt_labels) > 0:
            # Fix: the original used the deprecated `torch.range(...)` and a
            # hard-coded `.cuda()`; `torch.arange` on the prediction's device
            # yields the same indices and also works on CPU-only setups.
            t = torch.arange(len(gt_labels), dtype=torch.long, device=device)
            t = t.unsqueeze(0).repeat(num_groups, 1)
            pos_assigned_gt_inds = t.flatten()
            pos_inds = (torch.arange(num_groups, device=device) *
                        single_pad).unsqueeze(1) + t
            pos_inds = pos_inds.flatten()
        else:
            pos_inds = pos_assigned_gt_inds = torch.zeros(
                0, dtype=torch.long, device=device)
        neg_inds = pos_inds + single_pad // 2

        # label targets: default to background (index == num_classes)
        labels = gt_bboxes.new_full((num_bboxes, ),
                                    self.num_classes,
                                    dtype=torch.long)
        labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
        label_weights = gt_bboxes.new_ones(num_bboxes)

        # bbox targets: only positive queries are supervised for regression
        bbox_targets = torch.zeros_like(dn_bbox_pred)
        bbox_weights = torch.zeros_like(dn_bbox_pred)
        bbox_weights[pos_inds] = 1.0
        img_h, img_w, _ = img_meta['img_shape']

        # DETR regress the relative position of boxes (cxcywh) in the image.
        # Thus the learning target should be normalized by the image size,
        # also the box format should be converted from defaultly x1y1x2y2 to
        # cxcywh.
        factor = dn_bbox_pred.new_tensor([img_w, img_h, img_w,
                                          img_h]).unsqueeze(0)
        gt_bboxes_normalized = gt_bboxes / factor
        gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized)
        bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1])

        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
                neg_inds)

    @staticmethod
    def extract_dn_outputs(all_cls_scores, all_bbox_preds, dn_meta):
        """Split head outputs into matching and denoising parts.

        The first ``dn_meta['pad_size']`` queries are denoising queries; the
        rest are ordinary matching queries.

        Returns:
            tuple: (matching_cls_scores, matching_bbox_preds,
                denoising_cls_scores, denoising_bbox_preds). The denoising
                entries are ``None`` when ``dn_meta`` is ``None``.
        """
        # if dn_meta and dn_meta['pad_size'] > 0:
        if dn_meta is not None:
            denoising_cls_scores = \
                all_cls_scores[:, :, :dn_meta['pad_size'], :]
            denoising_bbox_preds = \
                all_bbox_preds[:, :, :dn_meta['pad_size'], :]
            matching_cls_scores = all_cls_scores[:, :, dn_meta['pad_size']:, :]
            matching_bbox_preds = all_bbox_preds[:, :, dn_meta['pad_size']:, :]
        else:
            denoising_cls_scores = None
            denoising_bbox_preds = None
            matching_cls_scores = all_cls_scores
            matching_bbox_preds = all_bbox_preds
        return (matching_cls_scores, matching_bbox_preds,
                denoising_cls_scores, denoising_bbox_preds)
detection/mmdet_custom/models/detectors/__init__.py
0 → 100644
View file @
bdd98bcb
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
.dino
import
DINO
__all__
=
[
'DINO'
]
\ No newline at end of file
detection/mmdet_custom/models/detectors/dino.py
0 → 100644
View file @
bdd98bcb
# Copyright (c) OpenMMLab. All rights reserved.
from
mmdet.models.builder
import
DETECTORS
from
mmdet.models.detectors.detr
import
DETR
@DETECTORS.register_module()
class DINO(DETR):
    """DINO detector registry wrapper.

    Reuses the DETR detector pipeline unchanged; all DINO-specific behavior
    lives in the custom head and transformer modules.
    """

    def __init__(self, *args, **kwargs):
        # NOTE(review): `super(DETR, self)` skips ``DETR.__init__`` and runs
        # the grandparent initializer directly — presumably intentional (to
        # bypass DETR-specific setup); confirm against mmdet's DETR class.
        super(DETR, self).__init__(*args, **kwargs)
\ No newline at end of file
detection/mmdet_custom/models/utils/__init__.py
0 → 100644
View file @
bdd98bcb
from
.query_denoising
import
build_dn_generator
from
.transformer
import
(
DinoTransformer
,
DinoTransformerDecoder
)
__all__
=
[
'build_dn_generator'
,
'DinoTransformer'
,
'DinoTransformerDecoder'
]
\ No newline at end of file
detection/mmdet_custom/models/utils/query_denoising.py
0 → 100644
View file @
bdd98bcb
# Copyright (c) OpenMMLab. All rights reserved.
import
torch
from
mmcv.runner
import
BaseModule
from
mmdet.core
import
bbox_xyxy_to_cxcywh
from
mmdet.models.utils.transformer
import
inverse_sigmoid
class DnQueryGenerator(BaseModule):
    """Generator of noised ground-truth ("denoising") queries for DINO.

    At train time it builds, for each image, ``2 * num_groups`` copies of the
    ground-truth labels/boxes (a positive and a negative copy per group),
    perturbs them, and returns them together with an attention mask that
    isolates the denoising queries from the matching queries and from the
    other groups.

    NOTE(review): this module hard-codes ``.cuda()`` / ``.to('cuda')`` in
    several places, so it requires a CUDA device at train time.

    Args:
        num_queries (int): Number of ordinary matching queries.
        hidden_dim (int): Embedding dimension of the query features.
        num_classes (int): Number of object classes.
        noise_scale (dict): ``label`` is the label-flip probability scale,
            ``box`` scales the box jitter magnitude.
        group_cfg (dict): Either ``dynamic=True`` with ``num_dn_queries``
            (groups are derived from the max GT count per batch) or
            ``dynamic=False`` with a fixed ``num_groups``.
    """

    def __init__(self,
                 num_queries,
                 hidden_dim,
                 num_classes,
                 noise_scale=dict(label=0.5, box=0.4),
                 group_cfg=dict(dynamic=True,
                                num_groups=None,
                                num_dn_queries=None)):
        super(DnQueryGenerator, self).__init__()
        self.num_queries = num_queries
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.label_noise_scale = noise_scale['label']
        self.box_noise_scale = noise_scale['box']
        self.dynamic_dn_groups = group_cfg.get('dynamic', False)
        if self.dynamic_dn_groups:
            assert 'num_dn_queries' in group_cfg, \
                'num_dn_queries should be set when using ' \
                'dynamic dn groups'
            self.num_dn = group_cfg['num_dn_queries']
        else:
            assert 'num_groups' in group_cfg, \
                'num_groups should be set when using ' \
                'static dn groups'
            self.num_dn = group_cfg['num_groups']
        assert isinstance(self.num_dn, int) and self.num_dn >= 1, \
            f'Expected the num in group_cfg to have type int. ' \
            f'Found {type(self.num_dn)} '

    def get_num_groups(self, group_queries=None):
        """Return the number of denoising groups to use.

        Args:
            group_queries (int): Number of dn queries in one group.
        """
        if self.dynamic_dn_groups:
            assert group_queries is not None, \
                'group_queries should be provided when using ' \
                'dynamic dn groups'
            if group_queries == 0:
                num_groups = 1
            else:
                # Fit as many groups as the dn-query budget allows.
                num_groups = self.num_dn // group_queries
        else:
            num_groups = self.num_dn
        if num_groups < 1:
            # avoid num_groups < 1 in query generator
            num_groups = 1
        return int(num_groups)

    def forward(self,
                gt_bboxes,
                gt_labels=None,
                label_enc=None,
                img_metas=None):
        """Generate denoising queries, attention mask and meta info.

        Args:
            gt_bboxes (List[Tensor]): List of ground truth bboxes
                of the image, shape of each (num_gts, 4).
            gt_labels (List[Tensor]): List of ground truth labels
                of the image, shape of each (num_gts,), if None,
                TODO:noisy_label would be None.
            label_enc (nn.Embedding): Label embedding used to encode the
                (possibly noised) class indices as query features.
            img_metas (list[dict]): Image meta info (used for ``img_shape``).

        Returns:
            tuple: (input_query_label, input_query_bbox, attn_mask, dn_meta);
                all four are ``None`` when not in training mode.
        """
        # TODO: temp only support for CDN
        # TODO: temp assert gt_labels is not None and label_enc is not None
        if self.training:
            if gt_labels is not None:
                assert len(gt_bboxes) == len(gt_labels), \
                    f'the length of provided gt_labels ' \
                    f'{len(gt_labels)} should be equal to' \
                    f' that of gt_bboxes {len(gt_bboxes)}'
            assert gt_labels is not None \
                   and label_enc is not None \
                   and img_metas is not None  # TODO: adjust args
            batch_size = len(gt_bboxes)

            # convert bbox: x1y1x2y2 -> normalized cxcywh per image
            gt_bboxes_list = []
            for img_meta, bboxes in zip(img_metas, gt_bboxes):
                img_h, img_w, _ = img_meta['img_shape']
                factor = bboxes.new_tensor([img_w, img_h, img_w,
                                            img_h]).unsqueeze(0)
                bboxes_normalized = bbox_xyxy_to_cxcywh(bboxes) / factor
                gt_bboxes_list.append(bboxes_normalized)
            gt_bboxes = gt_bboxes_list

            known = [torch.ones_like(labels) for labels in gt_labels]
            known_num = [sum(k) for k in known]

            # Group count is driven by the largest GT count in the batch.
            num_groups = self.get_num_groups(int(max(known_num)))

            unmask_bbox = unmask_label = torch.cat(known)
            labels = torch.cat(gt_labels)
            boxes = torch.cat(gt_bboxes)
            # Which image each concatenated GT belongs to.
            batch_idx = torch.cat([
                torch.full_like(t.long(), i) for i, t in enumerate(gt_labels)
            ])

            known_indice = torch.nonzero(unmask_label + unmask_bbox)
            known_indice = known_indice.view(-1)

            # Replicate every GT 2*num_groups times (positive + negative
            # copy per group).
            known_indice = known_indice.repeat(2 * num_groups, 1).view(-1)
            known_labels = labels.repeat(2 * num_groups, 1).view(-1)
            known_bid = batch_idx.repeat(2 * num_groups, 1).view(-1)
            known_bboxs = boxes.repeat(2 * num_groups, 1)
            known_labels_expand = known_labels.clone()
            known_bbox_expand = known_bboxs.clone()

            if self.label_noise_scale > 0:
                # Flip a random subset of labels to random classes.
                p = torch.rand_like(known_labels_expand.float())
                chosen_indice = torch.nonzero(
                    p < (self.label_noise_scale * 0.5)).view(-1)
                new_label = torch.randint_like(chosen_indice, 0,
                                               self.num_classes)
                known_labels_expand.scatter_(0, chosen_indice, new_label)
            single_pad = int(max(known_num))  # TODO

            pad_size = int(single_pad * 2 * num_groups)
            # Indices of the positive copies inside the replicated layout;
            # negatives follow each positive block by len(boxes).
            positive_idx = torch.tensor(range(
                len(boxes))).long().cuda().unsqueeze(0).repeat(num_groups, 1)
            positive_idx += (torch.tensor(range(num_groups)) * len(boxes) *
                             2).long().cuda().unsqueeze(1)
            positive_idx = positive_idx.flatten()
            negative_idx = positive_idx + len(boxes)
            if self.box_noise_scale > 0:
                # Work in corner (x1y1x2y2) space for the jitter.
                known_bbox_ = torch.zeros_like(known_bboxs)
                known_bbox_[:, :2] = \
                    known_bboxs[:, :2] - known_bboxs[:, 2:] / 2
                known_bbox_[:, 2:] = \
                    known_bboxs[:, :2] + known_bboxs[:, 2:] / 2

                # Max per-coordinate displacement: half the box size.
                diff = torch.zeros_like(known_bboxs)
                diff[:, :2] = known_bboxs[:, 2:] / 2
                diff[:, 2:] = known_bboxs[:, 2:] / 2

                rand_sign = torch.randint_like(
                    known_bboxs, low=0, high=2, dtype=torch.float32)
                rand_sign = rand_sign * 2.0 - 1.0
                rand_part = torch.rand_like(known_bboxs)
                # Negative copies get a strictly larger perturbation
                # (shifted into [1, 2)) so they fall off the GT box.
                rand_part[negative_idx] += 1.0
                rand_part *= rand_sign
                known_bbox_ += \
                    torch.mul(rand_part, diff).cuda() * self.box_noise_scale
                known_bbox_ = known_bbox_.clamp(min=0.0, max=1.0)
                # Back to cxcywh.
                known_bbox_expand[:, :2] = \
                    (known_bbox_[:, :2] + known_bbox_[:, 2:]) / 2
                known_bbox_expand[:, 2:] = \
                    known_bbox_[:, 2:] - known_bbox_[:, :2]

            m = known_labels_expand.long().to('cuda')
            input_label_embed = label_enc(m)
            # Box queries live in logit space (decoder applies sigmoid).
            input_bbox_embed = inverse_sigmoid(known_bbox_expand, eps=1e-3)

            padding_label = torch.zeros(pad_size, self.hidden_dim).cuda()
            padding_bbox = torch.zeros(pad_size, 4).cuda()

            input_query_label = padding_label.repeat(batch_size, 1, 1)
            input_query_bbox = padding_bbox.repeat(batch_size, 1, 1)

            # Scatter the per-GT queries into the padded per-image layout.
            map_known_indice = torch.tensor([]).to('cuda')
            if len(known_num):
                map_known_indice = torch.cat(
                    [torch.tensor(range(num)) for num in known_num])
                map_known_indice = torch.cat([
                    map_known_indice + single_pad * i
                    for i in range(2 * num_groups)
                ]).long()
            if len(known_bid):
                input_query_label[(known_bid.long(),
                                   map_known_indice)] = input_label_embed
                input_query_bbox[(known_bid.long(),
                                  map_known_indice)] = input_bbox_embed

            tgt_size = pad_size + self.num_queries
            attn_mask = torch.ones(tgt_size, tgt_size).to('cuda') < 0
            # match query cannot see the reconstruct
            attn_mask[pad_size:, :pad_size] = True
            # reconstruct cannot see each other
            for i in range(num_groups):
                if i == 0:
                    attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1),
                              single_pad * 2 * (i + 1):pad_size] = True
                if i == num_groups - 1:
                    attn_mask[single_pad * 2 * i:single_pad * 2 *
                              (i + 1), :single_pad * i * 2] = True
                else:
                    attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1),
                              single_pad * 2 * (i + 1):pad_size] = True
                    attn_mask[single_pad * 2 * i:single_pad * 2 *
                              (i + 1), :single_pad * 2 * i] = True

            dn_meta = {
                'pad_size': pad_size,
                'num_dn_group': num_groups,
            }
        else:
            # Inference: no denoising queries are injected.
            input_query_label = None
            input_query_bbox = None
            attn_mask = None
            dn_meta = None
        return input_query_label, input_query_bbox, attn_mask, dn_meta
class CdnQueryGenerator(DnQueryGenerator):
    """Contrastive denoising (CDN) query generator.

    Currently identical in behavior to :class:`DnQueryGenerator`; kept as a
    distinct class so configs can name it explicitly.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
def build_dn_generator(dn_args):
    """Build a denoising query generator from a config dict.

    Args:
        dn_args (dict | None): Config with a ``'type'`` key naming the
            generator class; all remaining keys are forwarded as keyword
            arguments to the constructor. ``None`` disables denoising.

    Returns:
        DnQueryGenerator | None: The constructed generator, or ``None`` when
        ``dn_args`` is ``None``.

    Raises:
        NotImplementedError: If the requested type is not supported.
    """
    if dn_args is None:
        return None
    # Copy so the caller's config dict is not mutated by the pop below, and
    # use a local name that does not shadow the builtin ``type`` (the
    # original popped into a variable literally named `type`).
    dn_args = dict(dn_args)
    generator_type = dn_args.pop('type')
    if generator_type == 'DnQueryGenerator':
        return DnQueryGenerator(**dn_args)
    elif generator_type == 'CdnQueryGenerator':
        return CdnQueryGenerator(**dn_args)
    else:
        raise NotImplementedError(f'{generator_type} is not supported yet')
\ No newline at end of file
detection/mmdet_custom/models/utils/transformer.py
0 → 100644
View file @
bdd98bcb
import
math
import
torch
import
torch.nn
as
nn
from
mmdet.models.utils.builder
import
TRANSFORMER
from
mmcv.cnn.bricks.registry
import
(
TRANSFORMER_LAYER_SEQUENCE
,
FEEDFORWARD_NETWORK
,
DROPOUT_LAYERS
)
from
mmdet.models.utils.transformer
import
(
inverse_sigmoid
,
DeformableDetrTransformerDecoder
,
DeformableDetrTransformer
)
def build_MLP(input_dim, hidden_dim, output_dim, num_layers):
    """Build a plain MLP: (Linear, ReLU) * (num_layers - 1), then Linear.

    The first linear maps ``input_dim -> hidden_dim``, intermediate ones map
    ``hidden_dim -> hidden_dim``, and the final (activation-free) linear maps
    ``hidden_dim -> output_dim``. The ReLUs are not in-place, matching the
    original DETR repo (mmdet's FFN defaults to ``inplace=True``).
    """
    # TODO: It can be implemented by add an out_channel arg of
    # mmcv.cnn.bricks.transformer.FFN
    assert num_layers > 1, \
        f'num_layers should be greater than 1 but got {num_layers}'
    modules = []
    in_features = input_dim
    for _ in range(num_layers - 1):
        modules.append(nn.Linear(in_features, hidden_dim))
        modules.append(nn.ReLU())
        in_features = hidden_dim
    modules.append(nn.Linear(hidden_dim, output_dim))
    return nn.Sequential(*modules)
@
TRANSFORMER_LAYER_SEQUENCE
.
register_module
()
class
DinoTransformerDecoder
(
DeformableDetrTransformerDecoder
):
    def __init__(self, *args, with_rp_noise=False, **kwargs):
        """Initialize the DINO transformer decoder.

        Args:
            with_rp_noise (bool): If True, add small uniform noise to the
                reference points during training (see ``forward``).
                Default: False.
        """
        super(DinoTransformerDecoder, self).__init__(*args, **kwargs)
        self.with_rp_noise = with_rp_noise
        self._init_layers()
    def _init_layers(self):
        """Build the reference-point head MLP and the output LayerNorm.

        The MLP maps the (2 * embed_dims)-dim sine positional embedding of
        the reference points down to embed_dims.
        """
        self.ref_point_head = build_MLP(self.embed_dims * 2, self.embed_dims,
                                        self.embed_dims, 2)
        self.norm = nn.LayerNorm(self.embed_dims)
    # @staticmethod
    def gen_sineembed_for_position(self, pos_tensor):
        """Build sine/cosine positional embeddings for reference points.

        Args:
            pos_tensor (Tensor): Reference points with last dimension 2
                (cx, cy) or 4 (cx, cy, w, h), values in [0, 1].

        Returns:
            Tensor: Embedding with last dimension ``embed_dims`` (2-d input)
                or ``2 * embed_dims`` (4-d input); each coordinate gets
                ``embed_dims // 2`` channels, concatenated (y, x[, w, h]).
        """
        # n_query, bs, _ = pos_tensor.size()
        # sineembed_tensor = torch.zeros(n_query, bs, 256)
        scale = 2 * math.pi
        # Standard transformer frequency ladder over embed_dims // 2 channels.
        dim_t = torch.arange(
            self.embed_dims // 2,
            dtype=torch.float32,
            device=pos_tensor.device)
        dim_t = 10000**(2 * (dim_t // 2) / (self.embed_dims // 2))
        x_embed = pos_tensor[:, :, 0] * scale
        y_embed = pos_tensor[:, :, 1] * scale
        pos_x = x_embed[:, :, None] / dim_t
        pos_y = y_embed[:, :, None] / dim_t
        # Interleave sin on even channels and cos on odd channels.
        pos_x = torch.stack(
            (pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()),
            dim=3).flatten(2)
        pos_y = torch.stack(
            (pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()),
            dim=3).flatten(2)
        if pos_tensor.size(-1) == 2:
            pos = torch.cat((pos_y, pos_x), dim=2)
        elif pos_tensor.size(-1) == 4:
            # Also embed the box width and height for 4-d reference points.
            w_embed = pos_tensor[:, :, 2] * scale
            pos_w = w_embed[:, :, None] / dim_t
            pos_w = torch.stack(
                (pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()),
                dim=3).flatten(2)

            h_embed = pos_tensor[:, :, 3] * scale
            pos_h = h_embed[:, :, None] / dim_t
            pos_h = torch.stack(
                (pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()),
                dim=3).flatten(2)

            pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
        else:
            raise ValueError('Unknown pos_tensor shape(-1):{}'.format(
                pos_tensor.size(-1)))
        return pos
def
forward
(
self
,
query
,
*
args
,
reference_points
=
None
,
valid_ratios
=
None
,
reg_branches
=
None
,
**
kwargs
):
output
=
query
intermediate
=
[]
intermediate_reference_points
=
[
reference_points
]
for
lid
,
layer
in
enumerate
(
self
.
layers
):
if
reference_points
.
shape
[
-
1
]
==
4
:
reference_points_input
=
\
reference_points
[:,
:,
None
]
*
torch
.
cat
(
[
valid_ratios
,
valid_ratios
],
-
1
)[:,
None
]
else
:
assert
reference_points
.
shape
[
-
1
]
==
2
reference_points_input
=
\
reference_points
[:,
:,
None
]
*
valid_ratios
[:,
None
]
if
self
.
with_rp_noise
and
self
.
training
:
device
=
reference_points
.
device
b
,
n
,
d
=
reference_points
.
size
()
noise
=
torch
.
rand
(
b
,
n
,
d
).
to
(
device
)
*
0.02
-
0.01
reference_points
=
(
reference_points
+
noise
).
clamp
(
0
,
1
)
query_sine_embed
=
self
.
gen_sineembed_for_position
(
reference_points_input
[:,
:,
0
,
:])
query_pos
=
self
.
ref_point_head
(
query_sine_embed
)
query_pos
=
query_pos
.
permute
(
1
,
0
,
2
)
output
=
layer
(
output
,
*
args
,
query_pos
=
query_pos
,
reference_points
=
reference_points_input
,
**
kwargs
)
output
=
output
.
permute
(
1
,
0
,
2
)
if
reg_branches
is
not
None
:
tmp
=
reg_branches
[
lid
](
output
)
assert
reference_points
.
shape
[
-
1
]
==
4
new_reference_points
=
tmp
+
inverse_sigmoid
(
reference_points
,
eps
=
1e-3
)
new_reference_points
=
new_reference_points
.
sigmoid
()
reference_points
=
new_reference_points
.
detach
()
output
=
output
.
permute
(
1
,
0
,
2
)
if
self
.
return_intermediate
:
intermediate
.
append
(
self
.
norm
(
output
))
intermediate_reference_points
.
append
(
new_reference_points
)
# NOTE this is for the "Look Forward Twice" module,
# in the DeformDETR, reference_points was appended.
if
self
.
return_intermediate
:
return
torch
.
stack
(
intermediate
),
torch
.
stack
(
intermediate_reference_points
)
return
output
,
reference_points
@TRANSFORMER.register_module()
class DinoTransformer(DeformableDetrTransformer):
    """Transformer for DINO.

    Two-stage only: the encoder output is scored by the extra
    classification/regression branches to pick top-k proposals, which seed
    the decoder reference points; learned content queries come from
    ``self.query_embed``, optionally prefixed by denoising queries.
    """

    def __init__(self, *args, **kwargs):
        super(DinoTransformer, self).__init__(*args, **kwargs)

    def init_layers(self):
        """Initialize layers of the DinoTransformer."""
        # Per-level embedding added to the positional encodings.
        self.level_embeds = nn.Parameter(
            torch.Tensor(self.num_feature_levels, self.embed_dims))
        # NOTE(review): enc_output/enc_output_norm are presumably consumed by
        # the inherited gen_encoder_output_proposals — confirm in the parent.
        self.enc_output = nn.Linear(self.embed_dims, self.embed_dims)
        self.enc_output_norm = nn.LayerNorm(self.embed_dims)
        # Learned content queries for the two-stage matching part.
        self.query_embed = nn.Embedding(self.two_stage_num_proposals,
                                        self.embed_dims)

    def init_weights(self):
        """Initialize weights, then re-init the content query embedding."""
        super().init_weights()
        nn.init.normal_(self.query_embed.weight.data)

    def forward(self,
                mlvl_feats,
                mlvl_masks,
                query_embed,
                mlvl_pos_embeds,
                dn_label_query,
                dn_bbox_query,
                attn_mask,
                reg_branches=None,
                cls_branches=None,
                **kwargs):
        """Forward pass of the DINO transformer.

        Args:
            mlvl_feats (list[Tensor]): Multi-level features, each of shape
                (bs, c, h, w).
            mlvl_masks (list[Tensor]): Padding masks per level.
            query_embed: Must be None; DINO uses its own learned queries.
            mlvl_pos_embeds (list[Tensor]): Positional encodings per level.
            dn_label_query (Tensor | None): Denoising label queries,
                concatenated in front of the matching queries when given.
            dn_bbox_query (Tensor | None): Denoising box queries (unact),
                concatenated in front of the top-k proposal boxes when given.
            attn_mask (Tensor | None): Self-attention mask separating
                denoising and matching parts.
            reg_branches (nn.ModuleList): Regression heads; index
                ``self.decoder.num_layers`` is the encoder-stage head.
            cls_branches (nn.ModuleList): Classification heads; same
                indexing convention.

        Returns:
            tuple: (inter_states, inter_references_out, topk_score,
            topk_anchor) — decoder states and reference points per layer,
            plus the encoder-stage top-k scores and sigmoid anchors.
        """
        assert self.as_two_stage and query_embed is None, \
            'as_two_stage must be True for DINO'

        # Flatten each level to (bs, h*w, c) and collect masks/pos embeds.
        feat_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        for lvl, (feat, mask, pos_embed) in enumerate(
                zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)):
            bs, c, h, w = feat.shape
            spatial_shape = (h, w)
            spatial_shapes.append(spatial_shape)
            feat = feat.flatten(2).transpose(1, 2)
            mask = mask.flatten(1)
            pos_embed = pos_embed.flatten(2).transpose(1, 2)
            # Add the learned per-level embedding to the positional encoding.
            lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1)
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            feat_flatten.append(feat)
            mask_flatten.append(mask)
        feat_flatten = torch.cat(feat_flatten, 1)
        mask_flatten = torch.cat(mask_flatten, 1)
        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
        spatial_shapes = torch.as_tensor(
            spatial_shapes, dtype=torch.long, device=feat_flatten.device)
        # Start offset of each level within the flattened sequence.
        level_start_index = torch.cat((spatial_shapes.new_zeros(
            (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
        valid_ratios = torch.stack(
            [self.get_valid_ratio(m) for m in mlvl_masks], 1)

        reference_points = self.get_reference_points(
            spatial_shapes, valid_ratios, device=feat.device)

        feat_flatten = feat_flatten.permute(1, 0, 2)
        # (H*W, bs, embed_dims)
        lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute(1, 0, 2)
        # (H*W, bs, embed_dims)
        memory = self.encoder(
            query=feat_flatten,
            key=None,
            value=None,
            query_pos=lvl_pos_embed_flatten,
            query_key_padding_mask=mask_flatten,
            spatial_shapes=spatial_shapes,
            reference_points=reference_points,
            level_start_index=level_start_index,
            valid_ratios=valid_ratios,
            **kwargs)
        # Back to (bs, H*W, embed_dims) for proposal generation.
        memory = memory.permute(1, 0, 2)
        bs, _, c = memory.shape
        output_memory, output_proposals = self.gen_encoder_output_proposals(
            memory, mask_flatten, spatial_shapes)
        # Index ``num_layers`` selects the extra encoder-stage branch.
        enc_outputs_class = cls_branches[self.decoder.num_layers](
            output_memory)
        enc_outputs_coord_unact = reg_branches[self.decoder.num_layers](
            output_memory) + output_proposals
        cls_out_features = cls_branches[self.decoder.num_layers].out_features
        topk = self.two_stage_num_proposals
        # NOTE In DeformDETR, enc_outputs_class[..., 0] is used for topk TODO
        # Here the max class score per location ranks the proposals.
        topk_indices = torch.topk(
            enc_outputs_class.max(-1)[0], topk, dim=1)[1]

        topk_score = torch.gather(
            enc_outputs_class, 1,
            topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features))
        topk_coords_unact = torch.gather(
            enc_outputs_coord_unact, 1,
            topk_indices.unsqueeze(-1).repeat(1, 1, 4))
        # Anchors (for the encoder loss) keep gradients ...
        topk_anchor = topk_coords_unact.sigmoid()
        # NOTE In the original DeformDETR, init_reference_out is obtained
        # from detached topk_coords_unact, which is different with DINO. TODO
        # ... while the decoder's initial reference points are detached.
        topk_coords_unact = topk_coords_unact.detach()

        # Learned content queries, tiled over the batch: (bs, topk, c).
        query = self.query_embed.weight[:, None, :].repeat(
            1, bs, 1).transpose(0, 1)
        # Denoising queries/boxes go in front of the matching part.
        if dn_label_query is not None:
            query = torch.cat([dn_label_query, query], dim=1)
        if dn_bbox_query is not None:
            reference_points = torch.cat([dn_bbox_query, topk_coords_unact],
                                         dim=1)
        else:
            reference_points = topk_coords_unact
        reference_points = reference_points.sigmoid()
        # decoder
        query = query.permute(1, 0, 2)
        memory = memory.permute(1, 0, 2)
        inter_states, inter_references = self.decoder(
            query=query,
            key=None,
            value=memory,
            attn_masks=attn_mask,
            key_padding_mask=mask_flatten,
            reference_points=reference_points,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            valid_ratios=valid_ratios,
            reg_branches=reg_branches,
            **kwargs)

        inter_references_out = inter_references
        return inter_states, inter_references_out, \
            topk_score, topk_anchor
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment