Unverified Commit bdd98bcb authored by Zhe Chen, committed by GitHub

Release DINO model with InternImage-T and -L (#99)

parent 1e6e309d
@@ -41,3 +41,12 @@ Based on community feedback, in 2017 the training/validation split was changed f
- Training speed is measured with A100 GPUs using the current code and may be faster than the speed in the logs.
- Some logs are from our recent re-training runs, so their results may differ slightly from those in our paper.
- Please set `with_cp=True` to save memory if you encounter `out-of-memory` issues.
### DINO + InternImage
| backbone | lr type | pretrain | schd | box mAP | train time | #param | Config | Download |
| :------------: | :---------: |:---------: | :---------: | :-----: | :---: | :-----: | :---: | :---: |
| InternImage-T | layer-wise lr | ImageNet-1K | 1x | 53.9 | 9.5h | 49M | [config](./dino_4scale_internimage_t_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.json) |
| InternImage-L | layer-wise lr | ImageNet-22K | 1x | 57.5 | 18h | 241M | [config](./dino_4scale_internimage_l_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_layer_wise_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_layer_wise_lr.log.json) |
| InternImage-L | 0.1x backbone lr | ImageNet-22K | 1x | 57.6 | 18h | 241M | [config](./dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.log.json) |
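
To sanity-check a released model, the checkpoints above can be loaded with the standard MMDetection 2.x inference API. A minimal sketch, assuming `mmdet`/`mmcv-full` are installed with the DCNv3 op compiled, and with illustrative paths for the config and the checkpoint downloaded from the table:

```python
from mmdet.apis import inference_detector, init_detector

# illustrative paths: use a config from this folder and the matching ckpt
config = 'dino_4scale_internimage_t_1x_coco_layer_wise_lr.py'
checkpoint = 'dino_4scale_internimage_t_1x_coco.pth'

model = init_detector(config, checkpoint, device='cuda:0')
result = inference_detector(model, 'demo.jpg')  # per-class list of (n, 5) boxes
```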
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
_base_ = [
'../_base_/datasets/coco_detection.py',
'../_base_/default_runtime.py',
'../_base_/schedules/schedule_1x.py',
]
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_l_22k_192to384.pth'
model = dict(
type='DINO',
backbone=dict(
type='InternImage',
core_op='DCNv3',
channels=160,
depths=[5, 5, 22, 5],
groups=[10, 20, 40, 80],
mlp_ratio=4.,
drop_path_rate=0.4,
norm_layer='LN',
layer_scale=1.0,
offset_scale=2.0,
post_norm=True,
with_cp=False,
out_indices=(1, 2, 3),
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
neck=dict(
type='ChannelMapper',
in_channels=[320, 640, 1280],
kernel_size=1,
out_channels=256,
act_cfg=None,
norm_cfg=dict(type='GN', num_groups=32),
num_outs=4),
bbox_head=dict(
type='DINOHead',
num_query=900,
num_classes=80,
in_channels=2048,
sync_cls_avg_factor=True,
as_two_stage=True,
with_box_refine=True,
dn_cfg=dict(
type='CdnQueryGenerator',
noise_scale=dict(label=0.5, box=1.0),
group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)),
transformer=dict(
type='DinoTransformer',
two_stage_num_proposals=900,
encoder=dict(
type='DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='BaseTransformerLayer',
attn_cfgs=dict(
type='MultiScaleDeformableAttention',
embed_dims=256,
dropout=0.0),
feedforward_channels=2048,
ffn_dropout=0.0, # 0.1 for DeformDETR
operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
decoder=dict(
type='DinoTransformerDecoder',
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
type='DetrTransformerDecoderLayer',
attn_cfgs=[
dict(
type='MultiheadAttention',
embed_dims=256,
num_heads=8,
dropout=0.0),
dict(
type='MultiScaleDeformableAttention',
embed_dims=256,
dropout=0.0),
],
feedforward_channels=2048,
ffn_dropout=0.0,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')))),
positional_encoding=dict(
type='SinePositionalEncoding',
num_feats=128,
temperature=20,
normalize=True),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=5.0),
loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
# training and testing settings
train_cfg=dict(
assigner=dict(
type='HungarianAssigner',
cls_cost=dict(type='FocalLossCost', weight=2.0),
reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
test_cfg=dict(max_per_img=300))
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
# from the default setting in mmdet.
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(
type='AutoAugment',
policies=[
[
dict(
type='Resize',
img_scale=[(480, 1333), (512, 1333), (544, 1333),
(576, 1333), (608, 1333), (640, 1333),
(672, 1333), (704, 1333), (736, 1333),
(768, 1333), (800, 1333)],
multiscale_mode='value',
keep_ratio=True)
],
[
dict(
type='Resize',
img_scale=[(400, 4200), (500, 4200), (600, 4200)],
multiscale_mode='value',
keep_ratio=True),
dict(
type='RandomCrop',
crop_type='absolute_range',
crop_size=(384, 600),
allow_negative_crop=False),
dict(
type='Resize',
img_scale=[(480, 1333), (512, 1333), (544, 1333),
(576, 1333), (608, 1333), (640, 1333),
(672, 1333), (704, 1333), (736, 1333),
(768, 1333), (800, 1333)],
multiscale_mode='value',
override=True,
keep_ratio=True)
]
]),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(
samples_per_gpu=2,
train=dict(pipeline=train_pipeline))
# optimizer
optimizer = dict(
_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.05,
paramwise_cfg=dict(
custom_keys={
'backbone': dict(lr_mult=0.1),
}))
optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[11])
evaluation = dict(save_best='auto')
checkpoint_config = dict(
interval=1,
max_keep_ckpts=3,
save_last=True,
)
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
_base_ = [
'../_base_/datasets/coco_detection.py',
'../_base_/default_runtime.py',
'../_base_/schedules/schedule_1x.py',
]
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_l_22k_192to384.pth'
model = dict(
type='DINO',
backbone=dict(
type='InternImage',
core_op='DCNv3',
channels=160,
depths=[5, 5, 22, 5],
groups=[10, 20, 40, 80],
mlp_ratio=4.,
drop_path_rate=0.4,
norm_layer='LN',
layer_scale=1.0,
offset_scale=2.0,
post_norm=True,
with_cp=False,
out_indices=(1, 2, 3),
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
neck=dict(
type='ChannelMapper',
in_channels=[320, 640, 1280],
kernel_size=1,
out_channels=256,
act_cfg=None,
norm_cfg=dict(type='GN', num_groups=32),
num_outs=4),
bbox_head=dict(
type='DINOHead',
num_query=900,
num_classes=80,
in_channels=2048,
sync_cls_avg_factor=True,
as_two_stage=True,
with_box_refine=True,
dn_cfg=dict(
type='CdnQueryGenerator',
noise_scale=dict(label=0.5, box=1.0),
group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)),
transformer=dict(
type='DinoTransformer',
two_stage_num_proposals=900,
encoder=dict(
type='DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='BaseTransformerLayer',
attn_cfgs=dict(
type='MultiScaleDeformableAttention',
embed_dims=256,
dropout=0.0),
feedforward_channels=2048,
ffn_dropout=0.0, # 0.1 for DeformDETR
operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
decoder=dict(
type='DinoTransformerDecoder',
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
type='DetrTransformerDecoderLayer',
attn_cfgs=[
dict(
type='MultiheadAttention',
embed_dims=256,
num_heads=8,
dropout=0.0),
dict(
type='MultiScaleDeformableAttention',
embed_dims=256,
dropout=0.0),
],
feedforward_channels=2048,
ffn_dropout=0.0,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')))),
positional_encoding=dict(
type='SinePositionalEncoding',
num_feats=128,
temperature=20,
normalize=True),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=5.0),
loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
# training and testing settings
train_cfg=dict(
assigner=dict(
type='HungarianAssigner',
cls_cost=dict(type='FocalLossCost', weight=2.0),
reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
test_cfg=dict(max_per_img=300))
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
# from the default setting in mmdet.
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(
type='AutoAugment',
policies=[
[
dict(
type='Resize',
img_scale=[(480, 1333), (512, 1333), (544, 1333),
(576, 1333), (608, 1333), (640, 1333),
(672, 1333), (704, 1333), (736, 1333),
(768, 1333), (800, 1333)],
multiscale_mode='value',
keep_ratio=True)
],
[
dict(
type='Resize',
img_scale=[(400, 4200), (500, 4200), (600, 4200)],
multiscale_mode='value',
keep_ratio=True),
dict(
type='RandomCrop',
crop_type='absolute_range',
crop_size=(384, 600),
allow_negative_crop=False),
dict(
type='Resize',
img_scale=[(480, 1333), (512, 1333), (544, 1333),
(576, 1333), (608, 1333), (640, 1333),
(672, 1333), (704, 1333), (736, 1333),
(768, 1333), (800, 1333)],
multiscale_mode='value',
override=True,
keep_ratio=True)
]
]),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(
samples_per_gpu=2,
train=dict(pipeline=train_pipeline))
# optimizer
optimizer = dict(
_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001,
constructor='CustomLayerDecayOptimizerConstructor',
paramwise_cfg=dict(num_layers=37, layer_decay_rate=0.90,
depths=[5, 5, 22, 5]))
optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[11])
evaluation = dict(save_best='auto')
checkpoint_config = dict(
interval=1,
max_keep_ckpts=3,
save_last=True,
)
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
_base_ = [
'../_base_/datasets/coco_detection.py',
'../_base_/default_runtime.py',
'../_base_/schedules/schedule_1x.py',
]
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_t_1k_224.pth'
model = dict(
type='DINO',
backbone=dict(
type='InternImage',
core_op='DCNv3',
channels=64,
depths=[4, 4, 18, 4],
groups=[4, 8, 16, 32],
mlp_ratio=4.,
drop_path_rate=0.2,
norm_layer='LN',
layer_scale=1.0,
offset_scale=1.0,
post_norm=False,
with_cp=True,
out_indices=(1, 2, 3),
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
neck=dict(
type='ChannelMapper',
in_channels=[128, 256, 512],
kernel_size=1,
out_channels=256,
act_cfg=None,
norm_cfg=dict(type='GN', num_groups=32),
num_outs=4),
bbox_head=dict(
type='DINOHead',
num_query=900,
num_classes=80,
in_channels=2048,
sync_cls_avg_factor=True,
as_two_stage=True,
with_box_refine=True,
dn_cfg=dict(
type='CdnQueryGenerator',
noise_scale=dict(label=0.5, box=1.0),
group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)),
transformer=dict(
type='DinoTransformer',
two_stage_num_proposals=900,
encoder=dict(
type='DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='BaseTransformerLayer',
attn_cfgs=dict(
type='MultiScaleDeformableAttention',
embed_dims=256,
dropout=0.0),
feedforward_channels=2048,
ffn_dropout=0.0, # 0.1 for DeformDETR
operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
decoder=dict(
type='DinoTransformerDecoder',
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
type='DetrTransformerDecoderLayer',
attn_cfgs=[
dict(
type='MultiheadAttention',
embed_dims=256,
num_heads=8,
dropout=0.0),
dict(
type='MultiScaleDeformableAttention',
embed_dims=256,
dropout=0.0),
],
feedforward_channels=2048,
ffn_dropout=0.0,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')))),
positional_encoding=dict(
type='SinePositionalEncoding',
num_feats=128,
temperature=20,
normalize=True),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=5.0),
loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
# training and testing settings
train_cfg=dict(
assigner=dict(
type='HungarianAssigner',
cls_cost=dict(type='FocalLossCost', weight=2.0),
reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
test_cfg=dict(max_per_img=100))
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
# from the default setting in mmdet.
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(
type='AutoAugment',
policies=[
[
dict(
type='Resize',
img_scale=[(480, 1333), (512, 1333), (544, 1333),
(576, 1333), (608, 1333), (640, 1333),
(672, 1333), (704, 1333), (736, 1333),
(768, 1333), (800, 1333)],
multiscale_mode='value',
keep_ratio=True)
],
[
dict(
type='Resize',
img_scale=[(400, 4200), (500, 4200), (600, 4200)],
multiscale_mode='value',
keep_ratio=True),
dict(
type='RandomCrop',
crop_type='absolute_range',
crop_size=(384, 600),
allow_negative_crop=False),
dict(
type='Resize',
img_scale=[(480, 1333), (512, 1333), (544, 1333),
(576, 1333), (608, 1333), (640, 1333),
(672, 1333), (704, 1333), (736, 1333),
(768, 1333), (800, 1333)],
multiscale_mode='value',
override=True,
keep_ratio=True)
]
]),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(
samples_per_gpu=2,
train=dict(pipeline=train_pipeline))
# optimizer
optimizer = dict(
_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001,
constructor='CustomLayerDecayOptimizerConstructor',
paramwise_cfg=dict(num_layers=30, layer_decay_rate=0.9,
depths=[4, 4, 18, 4]))
optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[11])
evaluation = dict(save_best='auto')
checkpoint_config = dict(
interval=1,
max_keep_ckpts=3,
save_last=True,
)
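The `CustomLayerDecayOptimizerConstructor` used by the layer-wise lr configs above scales each block's learning rate geometrically with depth, so the detection head keeps the full base lr while the stem gets the smallest one. The constructor itself is not part of this diff; the snippet below is only an illustrative sketch of the generic decay rule, plugged with this config's numbers (`num_layers=30` for `depths=[4, 4, 18, 4]`, `layer_decay_rate=0.9`). The actual mapping of InternImage parameters to layer ids lives in the repo's constructor.

# Illustrative only: generic layer-wise lr decay rule, not the repo's
# CustomLayerDecayOptimizerConstructor.
def layer_lr_mult(layer_id, num_layers=30, layer_decay_rate=0.9):
    # layer_id 0 is the stem; layer_id num_layers + 1 is the head,
    # which keeps the full base learning rate (mult == 1.0)
    return layer_decay_rate ** (num_layers + 1 - layer_id)

base_lr = 1e-4
print(layer_lr_mult(0) * base_lr)   # stem: ~3.8e-06
print(layer_lr_mult(31) * base_lr)  # head: 1e-04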
@@ -4,4 +4,7 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .backbones import *  # noqa: F401,F403
from .dense_heads import *  # noqa: F401,F403
from .detectors import *  # noqa: F401,F403
from .utils import *  # noqa: F401,F403
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .deformable_detr_head import DeformableDETRHead
from .detr_head import DETRHead
from .dino_head import DINOHead
__all__ = ['DeformableDETRHead', 'DETRHead', 'DINOHead']
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Linear, bias_init_with_prob, constant_init
from mmcv.runner import force_fp32
from mmdet.core import multi_apply
from mmdet.models.utils.transformer import inverse_sigmoid
from mmdet.models.builder import HEADS
from .detr_head import DETRHead
@HEADS.register_module(force=True)
class DeformableDETRHead(DETRHead):
"""Head of DeformDETR: Deformable DETR: Deformable Transformers for End-to-
End Object Detection.
Code is modified from the `official github repo
<https://github.com/fundamentalvision/Deformable-DETR>`_.
More details can be found in the `paper
<https://arxiv.org/abs/2010.04159>`_ .
Args:
with_box_refine (bool): Whether to refine the reference points
in the decoder. Defaults to False.
        as_two_stage (bool): Whether to generate proposals from the
            encoder outputs.
transformer (obj:`ConfigDict`): ConfigDict is used for building
the Encoder and Decoder.
"""
def __init__(self,
*args,
with_box_refine=False,
as_two_stage=False,
transformer=None,
use_2fc_cls_branch=False,
**kwargs):
self.with_box_refine = with_box_refine
self.as_two_stage = as_two_stage
self.use_2fc_cls_branch = use_2fc_cls_branch
if self.as_two_stage:
transformer['as_two_stage'] = self.as_two_stage
super(DeformableDETRHead, self).__init__(
*args, transformer=transformer, **kwargs)
def _init_layers(self):
"""Initialize classification branch and regression branch of head."""
if not self.use_2fc_cls_branch:
fc_cls = Linear(self.embed_dims, self.cls_out_channels)
else:
fc_cls = nn.Sequential(*[
Linear(self.embed_dims, int(self.embed_dims * 1.5)),
nn.LayerNorm(int(self.embed_dims * 1.5)),
nn.GELU(),
Linear(int(self.embed_dims * 1.5), self.cls_out_channels),
])
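        # expose `out_features` so callers can read the classifier width even
        # when the branch is an nn.Sequential (DinoTransformer.forward queries
        # cls_branches[...].out_features for the two-stage top-k selection)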
fc_cls.out_features = self.cls_out_channels
reg_branch = []
for _ in range(self.num_reg_fcs):
reg_branch.append(Linear(self.embed_dims, self.embed_dims))
reg_branch.append(nn.ReLU())
reg_branch.append(Linear(self.embed_dims, 4))
reg_branch = nn.Sequential(*reg_branch)
def _get_clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
        # The last reg_branch is used to generate proposals from the
        # encoder feature map when as_two_stage is True.
num_pred = (self.transformer.decoder.num_layers + 1) if \
self.as_two_stage else self.transformer.decoder.num_layers
if self.with_box_refine:
self.cls_branches = _get_clones(fc_cls, num_pred)
self.reg_branches = _get_clones(reg_branch, num_pred)
else:
self.cls_branches = nn.ModuleList(
[fc_cls for _ in range(num_pred)])
self.reg_branches = nn.ModuleList(
[reg_branch for _ in range(num_pred)])
if not self.as_two_stage:
self.query_embedding = nn.Embedding(
self.num_query,
self.embed_dims * 2)
def init_weights(self):
"""Initialize weights of the DeformDETR head."""
self.transformer.init_weights()
if self.loss_cls.use_sigmoid:
bias_init = bias_init_with_prob(0.01)
if not self.use_2fc_cls_branch:
for m in self.cls_branches:
nn.init.constant_(m.bias, bias_init)
for m in self.reg_branches:
constant_init(m[-1], 0, bias=0)
nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0)
if self.as_two_stage:
for m in self.reg_branches:
nn.init.constant_(m[-1].bias.data[2:], 0.0)
def forward(self, mlvl_feats, img_metas):
"""Forward function.
Args:
mlvl_feats (tuple[Tensor]): Features from the upstream
network, each is a 4D-tensor with shape
(N, C, H, W).
img_metas (list[dict]): List of image information.
Returns:
all_cls_scores (Tensor): Outputs from the classification head, \
shape [nb_dec, bs, num_query, cls_out_channels]. Note \
                cls_out_channels should include background.
all_bbox_preds (Tensor): Sigmoid outputs from the regression \
head with normalized coordinate format (cx, cy, w, h). \
Shape [nb_dec, bs, num_query, 4].
enc_outputs_class (Tensor): The score of each point on encode \
feature map, has shape (N, h*w, num_class). Only when \
as_two_stage is True it would be returned, otherwise \
`None` would be returned.
enc_outputs_coord (Tensor): The proposal generate from the \
encode feature map, has shape (N, h*w, 4). Only when \
as_two_stage is True it would be returned, otherwise \
`None` would be returned.
"""
batch_size = mlvl_feats[0].size(0)
input_img_h, input_img_w = img_metas[0]['batch_input_shape']
img_masks = mlvl_feats[0].new_ones(
(batch_size, input_img_h, input_img_w))
for img_id in range(batch_size):
img_h, img_w, _ = img_metas[img_id]['img_shape']
img_masks[img_id, :img_h, :img_w] = 0
mlvl_masks = []
mlvl_positional_encodings = []
for feat in mlvl_feats:
mlvl_masks.append(
F.interpolate(img_masks[None],
size=feat.shape[-2:]).to(torch.bool).squeeze(0))
mlvl_positional_encodings.append(
self.positional_encoding(mlvl_masks[-1]))
query_embeds = None
if not self.as_two_stage:
query_embeds = self.query_embedding.weight
hs, init_reference, inter_references, \
enc_outputs_class, enc_outputs_coord = self.transformer(
mlvl_feats,
mlvl_masks,
query_embeds,
mlvl_positional_encodings,
reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501
cls_branches=self.cls_branches if self.as_two_stage else None # noqa:E501
)
hs = hs.permute(0, 2, 1, 3)
outputs_classes = []
outputs_coords = []
for lvl in range(hs.shape[0]):
if lvl == 0:
reference = init_reference
else:
reference = inter_references[lvl - 1]
reference = inverse_sigmoid(reference)
outputs_class = self.cls_branches[lvl](hs[lvl])
tmp = self.reg_branches[lvl](hs[lvl])
if reference.shape[-1] == 4:
tmp += reference
else:
assert reference.shape[-1] == 2
tmp[..., :2] += reference
outputs_coord = tmp.sigmoid()
outputs_classes.append(outputs_class)
outputs_coords.append(outputs_coord)
outputs_classes = torch.stack(outputs_classes)
outputs_coords = torch.stack(outputs_coords)
if self.as_two_stage:
return outputs_classes, outputs_coords, \
enc_outputs_class, \
enc_outputs_coord.sigmoid()
else:
return outputs_classes, outputs_coords, \
None, None
@force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
def loss(self,
all_cls_scores,
all_bbox_preds,
enc_cls_scores,
enc_bbox_preds,
gt_bboxes_list,
gt_labels_list,
img_metas,
gt_bboxes_ignore=None):
""""Loss function.
Args:
all_cls_scores (Tensor): Classification score of all
decoder layers, has shape
[nb_dec, bs, num_query, cls_out_channels].
all_bbox_preds (Tensor): Sigmoid regression
outputs of all decode layers. Each is a 4D-tensor with
normalized coordinate format (cx, cy, w, h) and shape
[nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of points on the
                encode feature map, has shape (N, h*w, num_classes). Only
                passed when as_two_stage is True, otherwise None.
            enc_bbox_preds (Tensor): Regression results of each point on
                the encode feature map, has shape (N, h*w, 4). Only passed
                when as_two_stage is True, otherwise None.
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
which can be ignored for each image. Default None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'gt_bboxes_ignore being None.'
num_dec_layers = len(all_cls_scores)
all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
all_gt_bboxes_ignore_list = [
gt_bboxes_ignore for _ in range(num_dec_layers)
]
img_metas_list = [img_metas for _ in range(num_dec_layers)]
losses_cls, losses_bbox, losses_iou = multi_apply(
self.loss_single, all_cls_scores, all_bbox_preds,
all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
all_gt_bboxes_ignore_list)
loss_dict = dict()
# loss of proposal generated from encode feature map.
if enc_cls_scores is not None:
binary_labels_list = [
torch.zeros_like(gt_labels_list[i])
for i in range(len(img_metas))
]
enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
self.loss_single(enc_cls_scores, enc_bbox_preds,
gt_bboxes_list, binary_labels_list,
img_metas, gt_bboxes_ignore)
loss_dict['enc_loss_cls'] = enc_loss_cls
loss_dict['enc_loss_bbox'] = enc_losses_bbox
loss_dict['enc_loss_iou'] = enc_losses_iou
# loss from the last decoder layer
loss_dict['loss_cls'] = losses_cls[-1]
loss_dict['loss_bbox'] = losses_bbox[-1]
loss_dict['loss_iou'] = losses_iou[-1]
# loss from other decoder layers
num_dec_layer = 0
for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
losses_bbox[:-1],
losses_iou[:-1]):
loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
num_dec_layer += 1
return loss_dict
@force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
def get_bboxes(self,
all_cls_scores,
all_bbox_preds,
enc_cls_scores,
enc_bbox_preds,
img_metas,
rescale=False):
"""Transform network outputs for a batch into bbox predictions.
Args:
all_cls_scores (Tensor): Classification score of all
decoder layers, has shape
[nb_dec, bs, num_query, cls_out_channels].
all_bbox_preds (Tensor): Sigmoid regression
outputs of all decode layers. Each is a 4D-tensor with
normalized coordinate format (cx, cy, w, h) and shape
[nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of points on the
                encode feature map, has shape (N, h*w, num_classes). Only
                passed when as_two_stage is True, otherwise None.
            enc_bbox_preds (Tensor): Regression results of each point on
                the encode feature map, has shape (N, h*w, 4). Only passed
                when as_two_stage is True, otherwise None.
img_metas (list[dict]): Meta information of each image.
rescale (bool, optional): If True, return boxes in original
image space. Default False.
Returns:
list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
The first item is an (n, 5) tensor, where the first 4 columns \
are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
5-th column is a score between 0 and 1. The second item is a \
(n,) tensor where each item is the predicted class label of \
the corresponding box.
"""
cls_scores = all_cls_scores[-1]
bbox_preds = all_bbox_preds[-1]
result_list = []
for img_id in range(len(img_metas)):
cls_score = cls_scores[img_id]
bbox_pred = bbox_preds[img_id]
img_shape = img_metas[img_id]['img_shape']
scale_factor = img_metas[img_id]['scale_factor']
proposals = self._get_bboxes_single(cls_score, bbox_pred,
img_shape, scale_factor,
rescale)
result_list.append(proposals)
return result_list
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh, multi_apply,
reduce_mean)
from ..utils import build_dn_generator
from mmdet.models.utils.transformer import inverse_sigmoid
from mmdet.models.builder import HEADS
from .deformable_detr_head import DeformableDETRHead
from mmcv.runner import force_fp32
@HEADS.register_module()
class DINOHead(DeformableDETRHead):
def __init__(self, *args, dn_cfg=None, **kwargs):
super(DINOHead, self).__init__(*args, **kwargs)
self._init_layers()
self.init_denoising(dn_cfg)
assert self.as_two_stage, \
'as_two_stage must be True for DINO'
assert self.with_box_refine, \
'with_box_refine must be True for DINO'
def _init_layers(self):
super()._init_layers()
        # NOTE: The original DINO repo sets num_embeddings to 92 for COCO,
        # where indices 0~90 represent target classes and index 91 denotes
        # the [Unknown] class. However, the unknown-class embedding is not
        # used in the original DINO.
self.label_embedding = nn.Embedding(self.cls_out_channels,
self.embed_dims)
def init_denoising(self, dn_cfg):
if dn_cfg is not None:
dn_cfg['num_classes'] = self.num_classes
dn_cfg['num_queries'] = self.num_query
dn_cfg['hidden_dim'] = self.embed_dims
self.dn_generator = build_dn_generator(dn_cfg)
def forward_train(self,
x,
img_metas,
gt_bboxes,
gt_labels=None,
gt_bboxes_ignore=None,
proposal_cfg=None,
**kwargs):
assert proposal_cfg is None, '"proposal_cfg" must be None'
assert self.dn_generator is not None, '"dn_cfg" must be set'
dn_label_query, dn_bbox_query, attn_mask, dn_meta = \
self.dn_generator(gt_bboxes, gt_labels,
self.label_embedding, img_metas)
outs = self(x, img_metas, dn_label_query, dn_bbox_query, attn_mask)
if gt_labels is None:
loss_inputs = outs + (gt_bboxes, img_metas, dn_meta)
else:
loss_inputs = outs + (gt_bboxes, gt_labels, img_metas, dn_meta)
losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
return losses
def forward(self,
mlvl_feats,
img_metas,
dn_label_query=None,
dn_bbox_query=None,
attn_mask=None):
batch_size = mlvl_feats[0].size(0)
input_img_h, input_img_w = img_metas[0]['batch_input_shape']
img_masks = mlvl_feats[0].new_ones(
(batch_size, input_img_h, input_img_w))
for img_id in range(batch_size):
img_h, img_w, _ = img_metas[img_id]['img_shape']
img_masks[img_id, :img_h, :img_w] = 0
mlvl_masks = []
mlvl_positional_encodings = []
for feat in mlvl_feats:
mlvl_masks.append(
F.interpolate(
img_masks[None],
size=feat.shape[-2:]).to(torch.bool).squeeze(0))
mlvl_positional_encodings.append(
self.positional_encoding(mlvl_masks[-1]))
query_embeds = None
hs, inter_references, topk_score, topk_anchor = \
self.transformer(
mlvl_feats,
mlvl_masks,
query_embeds,
mlvl_positional_encodings,
dn_label_query,
dn_bbox_query,
attn_mask,
reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501
cls_branches=self.cls_branches if self.as_two_stage else None # noqa:E501
)
hs = hs.permute(0, 2, 1, 3)
if dn_label_query is not None and dn_label_query.size(1) == 0:
# NOTE: If there is no target in the image, the parameters of
# label_embedding won't be used in producing loss, which raises
# RuntimeError when using distributed mode.
hs[0] += self.label_embedding.weight[0, 0] * 0.0
outputs_classes = []
outputs_coords = []
for lvl in range(hs.shape[0]):
reference = inter_references[lvl]
reference = inverse_sigmoid(reference, eps=1e-3)
outputs_class = self.cls_branches[lvl](hs[lvl])
tmp = self.reg_branches[lvl](hs[lvl])
if reference.shape[-1] == 4:
tmp += reference
else:
assert reference.shape[-1] == 2
tmp[..., :2] += reference
outputs_coord = tmp.sigmoid()
outputs_classes.append(outputs_class)
outputs_coords.append(outputs_coord)
outputs_classes = torch.stack(outputs_classes)
outputs_coords = torch.stack(outputs_coords)
return outputs_classes, outputs_coords, topk_score, topk_anchor
@force_fp32(apply_to=('all_cls_scores', 'all_bbox_preds'))
def loss(self,
all_cls_scores,
all_bbox_preds,
enc_topk_scores,
enc_topk_anchors,
gt_bboxes_list,
gt_labels_list,
img_metas,
dn_meta=None,
gt_bboxes_ignore=None):
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'gt_bboxes_ignore being None.'
loss_dict = dict()
# extract denoising and matching part of outputs
all_cls_scores, all_bbox_preds, dn_cls_scores, dn_bbox_preds = \
self.extract_dn_outputs(all_cls_scores, all_bbox_preds, dn_meta)
if enc_topk_scores is not None:
# calculate loss from encode feature maps
            # NOTE DeformDETR calculates a binary cls loss for all encoder
            # embeddings, while DINO calculates a multi-class loss for the
            # top-k embeddings.
enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
self.loss_single(enc_topk_scores, enc_topk_anchors,
gt_bboxes_list, gt_labels_list,
img_metas, gt_bboxes_ignore)
# collate loss from encode feature maps
loss_dict['interm_loss_cls'] = enc_loss_cls
loss_dict['interm_loss_bbox'] = enc_losses_bbox
loss_dict['interm_loss_iou'] = enc_losses_iou
# calculate loss from all decoder layers
num_dec_layers = len(all_cls_scores)
all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
all_gt_bboxes_ignore_list = [
gt_bboxes_ignore for _ in range(num_dec_layers)
]
img_metas_list = [img_metas for _ in range(num_dec_layers)]
losses_cls, losses_bbox, losses_iou = multi_apply(
self.loss_single, all_cls_scores, all_bbox_preds,
all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
all_gt_bboxes_ignore_list)
# collate loss from the last decoder layer
loss_dict['loss_cls'] = losses_cls[-1]
loss_dict['loss_bbox'] = losses_bbox[-1]
loss_dict['loss_iou'] = losses_iou[-1]
# collate loss from other decoder layers
num_dec_layer = 0
for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
losses_bbox[:-1],
losses_iou[:-1]):
loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
num_dec_layer += 1
if dn_cls_scores is not None:
# calculate denoising loss from all decoder layers
dn_meta = [dn_meta for _ in img_metas]
dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn(
dn_cls_scores, dn_bbox_preds, gt_bboxes_list, gt_labels_list,
img_metas, dn_meta)
# collate denoising loss
loss_dict['dn_loss_cls'] = dn_losses_cls[-1]
loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1]
loss_dict['dn_loss_iou'] = dn_losses_iou[-1]
num_dec_layer = 0
for loss_cls_i, loss_bbox_i, loss_iou_i in zip(
dn_losses_cls[:-1], dn_losses_bbox[:-1],
dn_losses_iou[:-1]):
loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i
num_dec_layer += 1
return loss_dict
def loss_dn(self, dn_cls_scores, dn_bbox_preds, gt_bboxes_list,
gt_labels_list, img_metas, dn_meta):
num_dec_layers = len(dn_cls_scores)
all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
img_metas_list = [img_metas for _ in range(num_dec_layers)]
dn_meta_list = [dn_meta for _ in range(num_dec_layers)]
return multi_apply(self.loss_dn_single, dn_cls_scores, dn_bbox_preds,
all_gt_bboxes_list, all_gt_labels_list,
img_metas_list, dn_meta_list)
def loss_dn_single(self, dn_cls_scores, dn_bbox_preds, gt_bboxes_list,
gt_labels_list, img_metas, dn_meta):
num_imgs = dn_cls_scores.size(0)
bbox_preds_list = [dn_bbox_preds[i] for i in range(num_imgs)]
cls_reg_targets = self.get_dn_target(bbox_preds_list, gt_bboxes_list,
gt_labels_list, img_metas,
dn_meta)
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg) = cls_reg_targets
labels = torch.cat(labels_list, 0)
label_weights = torch.cat(label_weights_list, 0)
bbox_targets = torch.cat(bbox_targets_list, 0)
bbox_weights = torch.cat(bbox_weights_list, 0)
# classification loss
cls_scores = dn_cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match the official DETR repo
cls_avg_factor = \
num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight
if self.sync_cls_avg_factor:
cls_avg_factor = reduce_mean(
cls_scores.new_tensor([cls_avg_factor]))
cls_avg_factor = max(cls_avg_factor, 1)
if len(cls_scores) > 0:
loss_cls = self.loss_cls(
cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
else:
loss_cls = torch.zeros( # TODO: How to better return zero loss
1,
dtype=cls_scores.dtype,
device=cls_scores.device)
# Compute the average number of gt boxes across all gpus, for
# normalization purposes
num_total_pos = loss_cls.new_tensor([num_total_pos])
num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
# construct factors used for rescale bboxes
factors = []
for img_meta, bbox_pred in zip(img_metas, dn_bbox_preds):
img_h, img_w, _ = img_meta['img_shape']
factor = bbox_pred.new_tensor([img_w, img_h, img_w,
img_h]).unsqueeze(0).repeat(
bbox_pred.size(0), 1)
factors.append(factor)
factors = torch.cat(factors, 0)
        # DETR regresses the relative positions of boxes (cxcywh) in the
        # image, so the learning targets are normalized by the image size.
        # Here we rescale them back for calculating the IoU loss.
bbox_preds = dn_bbox_preds.reshape(-1, 4)
bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors
        # regression IoU loss, GIoU loss by default
loss_iou = self.loss_iou(
bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)
# regression L1 loss
loss_bbox = self.loss_bbox(
bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
return loss_cls, loss_bbox, loss_iou
def get_dn_target(self, dn_bbox_preds_list, gt_bboxes_list, gt_labels_list,
img_metas, dn_meta):
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
pos_inds_list,
neg_inds_list) = multi_apply(self._get_dn_target_single,
dn_bbox_preds_list, gt_bboxes_list,
gt_labels_list, img_metas, dn_meta)
num_total_pos = sum((inds.numel() for inds in pos_inds_list))
num_total_neg = sum((inds.numel() for inds in neg_inds_list))
return (labels_list, label_weights_list, bbox_targets_list,
bbox_weights_list, num_total_pos, num_total_neg)
def _get_dn_target_single(self, dn_bbox_pred, gt_bboxes, gt_labels,
img_meta, dn_meta):
num_groups = dn_meta['num_dn_group']
pad_size = dn_meta['pad_size']
assert pad_size % num_groups == 0
single_pad = pad_size // num_groups
num_bboxes = dn_bbox_pred.size(0)
if len(gt_labels) > 0:
            # torch.range is deprecated; arange gives the same 0..num_gts-1
            t = torch.arange(0, len(gt_labels)).long().cuda()
t = t.unsqueeze(0).repeat(num_groups, 1)
pos_assigned_gt_inds = t.flatten()
pos_inds = (torch.tensor(range(num_groups)) *
single_pad).long().cuda().unsqueeze(1) + t
pos_inds = pos_inds.flatten()
else:
pos_inds = pos_assigned_gt_inds = torch.tensor([]).long().cuda()
neg_inds = pos_inds + single_pad // 2
# label targets
labels = gt_bboxes.new_full((num_bboxes, ),
self.num_classes,
dtype=torch.long)
labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
label_weights = gt_bboxes.new_ones(num_bboxes)
# bbox targets
bbox_targets = torch.zeros_like(dn_bbox_pred)
bbox_weights = torch.zeros_like(dn_bbox_pred)
bbox_weights[pos_inds] = 1.0
img_h, img_w, _ = img_meta['img_shape']
        # DETR regresses the relative positions of boxes (cxcywh) in the
        # image, so the learning target should be normalized by the image
        # size and converted from the default x1y1x2y2 format to cxcywh.
factor = dn_bbox_pred.new_tensor([img_w, img_h, img_w,
img_h]).unsqueeze(0)
gt_bboxes_normalized = gt_bboxes / factor
gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized)
bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1])
return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
neg_inds)
@staticmethod
def extract_dn_outputs(all_cls_scores, all_bbox_preds, dn_meta):
# if dn_meta and dn_meta['pad_size'] > 0:
if dn_meta is not None:
denoising_cls_scores = all_cls_scores[:, :, :
dn_meta['pad_size'], :]
denoising_bbox_preds = all_bbox_preds[:, :, :
dn_meta['pad_size'], :]
matching_cls_scores = all_cls_scores[:, :, dn_meta['pad_size']:, :]
matching_bbox_preds = all_bbox_preds[:, :, dn_meta['pad_size']:, :]
else:
denoising_cls_scores = None
denoising_bbox_preds = None
matching_cls_scores = all_cls_scores
matching_bbox_preds = all_bbox_preds
return (matching_cls_scores, matching_bbox_preds, denoising_cls_scores,
denoising_bbox_preds)
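
As a quick illustration of the `[denoising | matching]` query layout that `extract_dn_outputs` splits apart, here is a shape-only sketch with made-up sizes (6 decoder layers, batch 2, `pad_size=60` denoising slots, 900 matching queries, 80 classes):

import torch

all_cls_scores = torch.randn(6, 2, 60 + 900, 80)
all_bbox_preds = torch.rand(6, 2, 60 + 900, 4)
dn_meta = {'pad_size': 60, 'num_dn_group': 10}
match_cls, match_box, dn_cls, dn_box = DINOHead.extract_dn_outputs(
    all_cls_scores, all_bbox_preds, dn_meta)
assert match_cls.shape == (6, 2, 900, 80) and dn_cls.shape == (6, 2, 60, 80)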
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .dino import DINO
__all__ = ['DINO']
# Copyright (c) OpenMMLab. All rights reserved.
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors.detr import DETR
@DETECTORS.register_module()
class DINO(DETR):
def __init__(self, *args, **kwargs):
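        # Deliberately skip DETR.__init__ and run its parent's constructor
        # instead, mirroring how mmdet's own DeformableDETR detector is defined.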
super(DETR, self).__init__(*args, **kwargs)
from .query_denoising import build_dn_generator
from .transformer import (DinoTransformer, DinoTransformerDecoder)
__all__ = ['build_dn_generator', 'DinoTransformer', 'DinoTransformerDecoder']
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmcv.runner import BaseModule
from mmdet.core import bbox_xyxy_to_cxcywh
from mmdet.models.utils.transformer import inverse_sigmoid
class DnQueryGenerator(BaseModule):
def __init__(self,
num_queries,
hidden_dim,
num_classes,
noise_scale=dict(label=0.5, box=0.4),
group_cfg=dict(
dynamic=True, num_groups=None, num_dn_queries=None)):
super(DnQueryGenerator, self).__init__()
self.num_queries = num_queries
self.hidden_dim = hidden_dim
self.num_classes = num_classes
self.label_noise_scale = noise_scale['label']
self.box_noise_scale = noise_scale['box']
self.dynamic_dn_groups = group_cfg.get('dynamic', False)
if self.dynamic_dn_groups:
assert 'num_dn_queries' in group_cfg, \
'num_dn_queries should be set when using ' \
'dynamic dn groups'
self.num_dn = group_cfg['num_dn_queries']
else:
assert 'num_groups' in group_cfg, \
'num_groups should be set when using ' \
'static dn groups'
self.num_dn = group_cfg['num_groups']
assert isinstance(self.num_dn, int) and self.num_dn >= 1, \
f'Expected the num in group_cfg to have type int. ' \
f'Found {type(self.num_dn)} '
def get_num_groups(self, group_queries=None):
"""
Args:
group_queries (int): Number of dn queries in one group.
"""
if self.dynamic_dn_groups:
assert group_queries is not None, \
'group_queries should be provided when using ' \
'dynamic dn groups'
if group_queries == 0:
num_groups = 1
else:
num_groups = self.num_dn // group_queries
else:
num_groups = self.num_dn
if num_groups < 1: # avoid num_groups < 1 in query generator
num_groups = 1
return int(num_groups)
def forward(self,
gt_bboxes,
gt_labels=None,
label_enc=None,
img_metas=None):
"""
Args:
gt_bboxes (List[Tensor]): List of ground truth bboxes
of the image, shape of each (num_gts, 4).
gt_labels (List[Tensor]): List of ground truth labels
of the image, shape of each (num_gts,), if None,
TODO:noisy_label would be None.
Returns:
TODO
"""
# TODO: temp only support for CDN
# TODO: temp assert gt_labels is not None and label_enc is not None
if self.training:
if gt_labels is not None:
assert len(gt_bboxes) == len(gt_labels), \
f'the length of provided gt_labels ' \
f'{len(gt_labels)} should be equal to' \
f' that of gt_bboxes {len(gt_bboxes)}'
assert gt_labels is not None \
and label_enc is not None \
and img_metas is not None # TODO: adjust args
batch_size = len(gt_bboxes)
# convert bbox
gt_bboxes_list = []
for img_meta, bboxes in zip(img_metas, gt_bboxes):
img_h, img_w, _ = img_meta['img_shape']
factor = bboxes.new_tensor([img_w, img_h, img_w,
img_h]).unsqueeze(0)
bboxes_normalized = bbox_xyxy_to_cxcywh(bboxes) / factor
gt_bboxes_list.append(bboxes_normalized)
gt_bboxes = gt_bboxes_list
known = [torch.ones_like(labels) for labels in gt_labels]
known_num = [sum(k) for k in known]
num_groups = self.get_num_groups(int(max(known_num)))
unmask_bbox = unmask_label = torch.cat(known)
labels = torch.cat(gt_labels)
boxes = torch.cat(gt_bboxes)
batch_idx = torch.cat([
torch.full_like(t.long(), i) for i, t in enumerate(gt_labels)
])
known_indice = torch.nonzero(unmask_label + unmask_bbox)
known_indice = known_indice.view(-1)
known_indice = known_indice.repeat(2 * num_groups, 1).view(-1)
known_labels = labels.repeat(2 * num_groups, 1).view(-1)
known_bid = batch_idx.repeat(2 * num_groups, 1).view(-1)
known_bboxs = boxes.repeat(2 * num_groups, 1)
known_labels_expand = known_labels.clone()
known_bbox_expand = known_bboxs.clone()
if self.label_noise_scale > 0:
p = torch.rand_like(known_labels_expand.float())
chosen_indice = torch.nonzero(
p < (self.label_noise_scale * 0.5)).view(-1)
new_label = torch.randint_like(chosen_indice, 0,
self.num_classes)
known_labels_expand.scatter_(0, chosen_indice, new_label)
single_pad = int(max(known_num)) # TODO
pad_size = int(single_pad * 2 * num_groups)
positive_idx = torch.tensor(range(
len(boxes))).long().cuda().unsqueeze(0).repeat(num_groups, 1)
positive_idx += (torch.tensor(range(num_groups)) * len(boxes) *
2).long().cuda().unsqueeze(1)
positive_idx = positive_idx.flatten()
negative_idx = positive_idx + len(boxes)
if self.box_noise_scale > 0:
known_bbox_ = torch.zeros_like(known_bboxs)
known_bbox_[:, : 2] = \
known_bboxs[:, : 2] - known_bboxs[:, 2:] / 2
known_bbox_[:, 2:] = \
known_bboxs[:, :2] + known_bboxs[:, 2:] / 2
diff = torch.zeros_like(known_bboxs)
diff[:, :2] = known_bboxs[:, 2:] / 2
diff[:, 2:] = known_bboxs[:, 2:] / 2
rand_sign = torch.randint_like(
known_bboxs, low=0, high=2, dtype=torch.float32)
rand_sign = rand_sign * 2.0 - 1.0
rand_part = torch.rand_like(known_bboxs)
rand_part[negative_idx] += 1.0
rand_part *= rand_sign
known_bbox_ += \
torch.mul(rand_part, diff).cuda() * self.box_noise_scale
known_bbox_ = known_bbox_.clamp(min=0.0, max=1.0)
known_bbox_expand[:, :2] = \
(known_bbox_[:, :2] + known_bbox_[:, 2:]) / 2
known_bbox_expand[:, 2:] = \
known_bbox_[:, 2:] - known_bbox_[:, :2]
m = known_labels_expand.long().to('cuda')
input_label_embed = label_enc(m)
input_bbox_embed = inverse_sigmoid(known_bbox_expand, eps=1e-3)
padding_label = torch.zeros(pad_size, self.hidden_dim).cuda()
padding_bbox = torch.zeros(pad_size, 4).cuda()
input_query_label = padding_label.repeat(batch_size, 1, 1)
input_query_bbox = padding_bbox.repeat(batch_size, 1, 1)
map_known_indice = torch.tensor([]).to('cuda')
if len(known_num):
map_known_indice = torch.cat(
[torch.tensor(range(num)) for num in known_num])
map_known_indice = torch.cat([
map_known_indice + single_pad * i
for i in range(2 * num_groups)
]).long()
if len(known_bid):
input_query_label[(known_bid.long(),
map_known_indice)] = input_label_embed
input_query_bbox[(known_bid.long(),
map_known_indice)] = input_bbox_embed
tgt_size = pad_size + self.num_queries
attn_mask = torch.ones(tgt_size, tgt_size).to('cuda') < 0
            # matching queries cannot see the denoising (reconstruction) ones
attn_mask[pad_size:, :pad_size] = True
            # each denoising group cannot see the other groups
for i in range(num_groups):
if i == 0:
attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1),
single_pad * 2 * (i + 1):pad_size] = True
if i == num_groups - 1:
attn_mask[single_pad * 2 * i:single_pad * 2 *
(i + 1), :single_pad * i * 2] = True
else:
attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1),
single_pad * 2 * (i + 1):pad_size] = True
attn_mask[single_pad * 2 * i:single_pad * 2 *
(i + 1), :single_pad * 2 * i] = True
dn_meta = {
'pad_size': pad_size,
'num_dn_group': num_groups,
}
else:
input_query_label = None
input_query_bbox = None
attn_mask = None
dn_meta = None
return input_query_label, input_query_bbox, attn_mask, dn_meta
class CdnQueryGenerator(DnQueryGenerator):
def __init__(self, *args, **kwargs):
super(CdnQueryGenerator, self).__init__(*args, **kwargs)
def build_dn_generator(dn_args):
    """Build a denoising query generator from its config.

    Args:
        dn_args (dict | None): Config dict with a ``type`` key, or ``None``
            to disable denoising.

    Returns:
        DnQueryGenerator | None: The constructed generator, or ``None``.
    """
if dn_args is None:
return None
    generator_type = dn_args.pop('type')  # avoid shadowing the builtin `type`
    if generator_type == 'DnQueryGenerator':
        return DnQueryGenerator(**dn_args)
    elif generator_type == 'CdnQueryGenerator':
        return CdnQueryGenerator(**dn_args)
    else:
        raise NotImplementedError(f'{generator_type} is not supported yet')
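The group-wise attention mask built in `DnQueryGenerator.forward` is the heart of contrastive denoising: matching queries must not attend to denoising queries, and each denoising group must not attend to any other. A standalone sketch with toy, hypothetical sizes (`True` means attention is blocked):

import torch

num_groups, single_pad, num_matching = 2, 3, 4
pad_size = single_pad * 2 * num_groups  # positive + negative half per group
tgt_size = pad_size + num_matching
attn_mask = torch.zeros(tgt_size, tgt_size, dtype=torch.bool)
# matching queries cannot see the denoising (reconstruction) queries
attn_mask[pad_size:, :pad_size] = True
# each denoising group is blinded to every other group
for i in range(num_groups):
    lo, hi = single_pad * 2 * i, single_pad * 2 * (i + 1)
    attn_mask[lo:hi, :lo] = True
    attn_mask[lo:hi, hi:pad_size] = True
print(attn_mask.int())  # block-diagonal over groups; bottom-left block masked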
import math
import torch
import torch.nn as nn
from mmdet.models.utils.builder import TRANSFORMER
from mmcv.cnn.bricks.registry import (
TRANSFORMER_LAYER_SEQUENCE, FEEDFORWARD_NETWORK, DROPOUT_LAYERS)
from mmdet.models.utils.transformer import (inverse_sigmoid,
DeformableDetrTransformerDecoder,
DeformableDetrTransformer)
def build_MLP(input_dim, hidden_dim, output_dim, num_layers):
    # TODO: It can be implemented by adding an out_channel arg to
    # mmcv.cnn.bricks.transformer.FFN
assert num_layers > 1, \
f'num_layers should be greater than 1 but got {num_layers}'
h = [hidden_dim] * (num_layers - 1)
layers = list()
for n, k in zip([input_dim] + h[:-1], h):
layers.extend((nn.Linear(n, k), nn.ReLU()))
    # Note that the ReLU of the MLP in the original DETR repo is created
    # with 'inplace=False', while the ReLU cfg of FFN in mmdet defaults
    # to 'inplace=True'.
layers.append(nn.Linear(hidden_dim, output_dim))
return nn.Sequential(*layers)
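# Example (hypothetical dims): build_MLP(512, 256, 256, 2) produces
# Sequential(Linear(512, 256), ReLU(), Linear(256, 256)); this is the shape
# used by DinoTransformerDecoder.ref_point_head below, which maps the
# 2 * embed_dims sine embedding of a 4-d reference point back to embed_dims.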
@TRANSFORMER_LAYER_SEQUENCE.register_module()
class DinoTransformerDecoder(DeformableDetrTransformerDecoder):
def __init__(self, *args, with_rp_noise=False, **kwargs):
super(DinoTransformerDecoder, self).__init__(*args, **kwargs)
self.with_rp_noise = with_rp_noise
self._init_layers()
def _init_layers(self):
self.ref_point_head = build_MLP(
self.embed_dims * 2,
self.embed_dims,
self.embed_dims,
2)
self.norm = nn.LayerNorm(self.embed_dims)
def gen_sineembed_for_position(self, pos_tensor):
scale = 2 * math.pi
dim_t = torch.arange(
self.embed_dims//2, dtype=torch.float32, device=pos_tensor.device)
dim_t = 10000**(2 * (dim_t // 2) / (self.embed_dims//2))
x_embed = pos_tensor[:, :, 0] * scale
y_embed = pos_tensor[:, :, 1] * scale
pos_x = x_embed[:, :, None] / dim_t
pos_y = y_embed[:, :, None] / dim_t
pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()),
dim=3).flatten(2)
pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()),
dim=3).flatten(2)
if pos_tensor.size(-1) == 2:
pos = torch.cat((pos_y, pos_x), dim=2)
elif pos_tensor.size(-1) == 4:
w_embed = pos_tensor[:, :, 2] * scale
pos_w = w_embed[:, :, None] / dim_t
pos_w = torch.stack(
(pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()),
dim=3).flatten(2)
h_embed = pos_tensor[:, :, 3] * scale
pos_h = h_embed[:, :, None] / dim_t
pos_h = torch.stack(
(pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()),
dim=3).flatten(2)
pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
else:
raise ValueError('Unknown pos_tensor shape(-1):{}'.format(
pos_tensor.size(-1)))
return pos
def forward(self,
query,
*args,
reference_points=None,
valid_ratios=None,
reg_branches=None,
**kwargs):
output = query
intermediate = []
intermediate_reference_points = [reference_points]
for lid, layer in enumerate(self.layers):
if reference_points.shape[-1] == 4:
reference_points_input = \
reference_points[:, :, None] * torch.cat(
[valid_ratios, valid_ratios], -1)[:, None]
else:
assert reference_points.shape[-1] == 2
reference_points_input = \
reference_points[:, :, None] * valid_ratios[:, None]
if self.with_rp_noise and self.training:
device = reference_points.device
b, n, d = reference_points.size()
noise = torch.rand(b, n, d).to(device) * 0.02 - 0.01
reference_points = (reference_points + noise).clamp(0, 1)
query_sine_embed = self.gen_sineembed_for_position(
reference_points_input[:, :, 0, :])
query_pos = self.ref_point_head(query_sine_embed)
query_pos = query_pos.permute(1, 0, 2)
output = layer(
output,
*args,
query_pos=query_pos,
reference_points=reference_points_input,
**kwargs)
output = output.permute(1, 0, 2)
if reg_branches is not None:
tmp = reg_branches[lid](output)
assert reference_points.shape[-1] == 4
new_reference_points = tmp + inverse_sigmoid(
reference_points, eps=1e-3)
new_reference_points = new_reference_points.sigmoid()
reference_points = new_reference_points.detach()
output = output.permute(1, 0, 2)
if self.return_intermediate:
intermediate.append(self.norm(output))
intermediate_reference_points.append(new_reference_points)
                # NOTE: this implements 'Look Forward Twice': the undetached
                # new_reference_points are appended here, whereas DeformDETR
                # appends the detached reference_points.
if self.return_intermediate:
return torch.stack(intermediate), torch.stack(
intermediate_reference_points)
return output, reference_points
@TRANSFORMER.register_module()
class DinoTransformer(DeformableDetrTransformer):
def __init__(self, *args, **kwargs):
super(DinoTransformer, self).__init__(*args, **kwargs)
def init_layers(self):
"""Initialize layers of the DinoTransformer."""
self.level_embeds = nn.Parameter(
torch.Tensor(self.num_feature_levels, self.embed_dims))
self.enc_output = nn.Linear(self.embed_dims, self.embed_dims)
self.enc_output_norm = nn.LayerNorm(self.embed_dims)
self.query_embed = nn.Embedding(self.two_stage_num_proposals,
self.embed_dims)
def init_weights(self):
super().init_weights()
nn.init.normal_(self.query_embed.weight.data)
def forward(self,
mlvl_feats,
mlvl_masks,
query_embed,
mlvl_pos_embeds,
dn_label_query,
dn_bbox_query,
attn_mask,
reg_branches=None,
cls_branches=None,
**kwargs):
        assert self.as_two_stage and query_embed is None, \
            'as_two_stage must be True and query_embed must be None for DINO'
feat_flatten = []
mask_flatten = []
lvl_pos_embed_flatten = []
spatial_shapes = []
for lvl, (feat, mask, pos_embed) in enumerate(
zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)):
bs, c, h, w = feat.shape
spatial_shape = (h, w)
spatial_shapes.append(spatial_shape)
feat = feat.flatten(2).transpose(1, 2)
mask = mask.flatten(1)
pos_embed = pos_embed.flatten(2).transpose(1, 2)
lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1)
lvl_pos_embed_flatten.append(lvl_pos_embed)
feat_flatten.append(feat)
mask_flatten.append(mask)
feat_flatten = torch.cat(feat_flatten, 1)
mask_flatten = torch.cat(mask_flatten, 1)
lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
spatial_shapes = torch.as_tensor(
spatial_shapes, dtype=torch.long, device=feat_flatten.device)
level_start_index = torch.cat((spatial_shapes.new_zeros(
(1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
valid_ratios = torch.stack(
[self.get_valid_ratio(m) for m in mlvl_masks], 1)
reference_points = self.get_reference_points(
spatial_shapes, valid_ratios, device=feat.device)
feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims)
lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute(
1, 0, 2) # (H*W, bs, embed_dims)
memory = self.encoder(
query=feat_flatten,
key=None,
value=None,
query_pos=lvl_pos_embed_flatten,
query_key_padding_mask=mask_flatten,
spatial_shapes=spatial_shapes,
reference_points=reference_points,
level_start_index=level_start_index,
valid_ratios=valid_ratios,
**kwargs)
memory = memory.permute(1, 0, 2)
bs, _, c = memory.shape
output_memory, output_proposals = self.gen_encoder_output_proposals(
memory, mask_flatten, spatial_shapes)
enc_outputs_class = cls_branches[self.decoder.num_layers](
output_memory)
enc_outputs_coord_unact = reg_branches[self.decoder.num_layers](
output_memory) + output_proposals
cls_out_features = cls_branches[self.decoder.num_layers].out_features
topk = self.two_stage_num_proposals
# NOTE In DeformDETR, enc_outputs_class[..., 0] is used for topk TODO
topk_indices = torch.topk(enc_outputs_class.max(-1)[0], topk, dim=1)[1]
# topk_proposal = torch.gather(
# output_proposals, 1,
# topk_indices.unsqueeze(-1).repeat(1, 1, 4)).sigmoid()
# topk_memory = torch.gather(
# output_memory, 1,
# topk_indices.unsqueeze(-1).repeat(1, 1, self.embed_dims))
topk_score = torch.gather(
enc_outputs_class, 1,
topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features))
topk_coords_unact = torch.gather(
enc_outputs_coord_unact, 1,
topk_indices.unsqueeze(-1).repeat(1, 1, 4))
topk_anchor = topk_coords_unact.sigmoid()
        # NOTE In the original DeformDETR, init_reference_out is obtained
        # from the detached topk_coords_unact, which differs from DINO. TODO
topk_coords_unact = topk_coords_unact.detach()
query = self.query_embed.weight[:, None, :].repeat(1, bs,
1).transpose(0, 1)
if dn_label_query is not None:
query = torch.cat([dn_label_query, query], dim=1)
if dn_bbox_query is not None:
reference_points = torch.cat([dn_bbox_query, topk_coords_unact],
dim=1)
else:
reference_points = topk_coords_unact
reference_points = reference_points.sigmoid()
# decoder
query = query.permute(1, 0, 2)
memory = memory.permute(1, 0, 2)
inter_states, inter_references = self.decoder(
query=query,
key=None,
value=memory,
attn_masks=attn_mask,
key_padding_mask=mask_flatten,
reference_points=reference_points,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
valid_ratios=valid_ratios,
reg_branches=reg_branches,
**kwargs)
inter_references_out = inter_references
return inter_states, inter_references_out, topk_score, topk_anchor