Commit 21d9f185 authored by zhe chen's avatar zhe chen
Browse files

Release detection model

parent 4dba489c
...@@ -67,6 +67,7 @@ pip install opencv-python termcolor yacs pyyaml scipy ...@@ -67,6 +67,7 @@ pip install opencv-python termcolor yacs pyyaml scipy
# Please use a version of numpy lower than 2.0
pip install numpy==1.26.4
pip install pydantic==1.10.13
pip install yapf==0.40.1
```
- Compile CUDA operators - Compile CUDA operators
......
...@@ -47,3 +47,9 @@ Based on community feedback, in 2017 the training/validation split was changed f ...@@ -47,3 +47,9 @@ Based on community feedback, in 2017 the training/validation split was changed f
| InternImage-T | layer-wise lr | ImageNet-1K | 1x | 53.9 | 9.5h | 49M | [config](./dino_4scale_internimage_t_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.json) | | InternImage-T | layer-wise lr | ImageNet-1K | 1x | 53.9 | 9.5h | 49M | [config](./dino_4scale_internimage_t_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.json) |
| InternImage-L | layer-wise lr | ImageNet-22K | 1x | 57.5 | 18h | 241M | [config](./dino_4scale_internimage_l_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_layer_wise_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_layer_wise_lr.log.json) | | InternImage-L | layer-wise lr | ImageNet-22K | 1x | 57.5 | 18h | 241M | [config](./dino_4scale_internimage_l_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_layer_wise_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_layer_wise_lr.log.json) |
| InternImage-L | 0.1x backbone lr | ImageNet-22K | 1x | 57.6 | 18h | 241M | [config](./dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.log.json) | | InternImage-L | 0.1x backbone lr | ImageNet-22K | 1x | 57.6 | 18h | 241M | [config](./dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.log.json) |
### DINO + CB-InternImage
| backbone | pretrain | box mAP (ss) | box mAP (ms) | #param | Config | Download |
| :--------------: | :--------: | :----------: | :----------: | :----: | :-----------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------: |
| CB-InternImage-H | Objects365 | 64.5 | 65.0 | 2.18B | [config](./dino_4scale_cbinternimage_h_objects365_coco_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_coco.pth) |
# Detection config: DINO + CB-InternImage-H on COCO, fine-tuned from an
# Objects365-pretrained checkpoint.
_base_ = [
    '../_base_/datasets/coco_detection.py',
    '../_base_/default_runtime.py'
]
# Initialization weights: Objects365-pretrained CB-InternImage-H DINO
# (80-class head) released on HuggingFace.
load_from = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_80classes.pth'
# Model: DINO detector with a dual-branch (composite, CBNet-style)
# InternImage-H backbone.
model = dict(
    type='CBDINO',
    backbone=dict(
        type='CBInternImage',
        core_op='DCNv3',
        channels=320,
        depths=[6, 6, 32, 6],
        groups=[10, 20, 40, 80],
        mlp_ratio=4.,
        drop_path_rate=0.5,
        norm_layer='LN',
        layer_scale=None,
        offset_scale=1.0,
        post_norm=False,
        dw_kernel_size=5,  # for InternImage-H/G
        res_post_norm=True,  # for InternImage-H/G
        level2_post_norm=True,  # for InternImage-H/G
        level2_post_norm_block_ids=[5, 11, 17, 23, 29],  # for InternImage-H/G
        center_feature_scale=True,  # for InternImage-H/G
        with_cp=True,
        # One tuple per branch: branch 0 outputs levels 0-3, branch 1
        # outputs levels 1-3 (the extra level-0 output of branch 0 is
        # dropped again inside CBInternImage.forward).
        out_indices=[(0, 1, 2, 3), (1, 2, 3)],
        init_cfg=None,
    ),
    neck=[dict(
        type='CBChannelMapper',
        in_channels=[640, 1280, 2560],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=4)],
    bbox_head=dict(
        type='CBDINOHead',
        num_query=900,
        num_classes=80,
        in_channels=2048,  # TODO
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        # Contrastive denoising (CDN) query generator config.
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),  # 0.5, 0.4 for DN-DETR
            group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=1000)),
        transformer=dict(
            type='DinoTransformer',
            two_stage_num_proposals=900,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        dropout=0.0),  # 0.1 for DeformDETR
                    feedforward_channels=2048,  # 1024 for DeformDETR
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=256,
                        feedforward_channels=2048,
                        num_fcs=2,
                        ffn_drop=0.,
                        use_checkpoint=True,
                        act_cfg=dict(type='ReLU', inplace=True),),
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),  # 0.1 for DeformDETR
                        dict(
                            type='MultiScaleDeformableAttention',
                            num_levels=4,
                            embed_dims=256,
                            dropout=0.0),  # 0.1 for DeformDETR
                    ],
                    feedforward_channels=2048,  # 1024 for DeformDETR
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=256,
                        feedforward_channels=2048,
                        num_fcs=2,
                        ffn_drop=0.,
                        use_checkpoint=True,
                        act_cfg=dict(type='ReLU', inplace=True),),
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),  # 2.0 in DeformDETR
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)),
        snip_cfg=dict(
            type='v3',
            weight=0.1)),
    test_cfg=dict(max_per_img=300))  # TODO: Originally 100
# ImageNet mean/std normalization; to_rgb converts BGR (cv2) to RGB.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    # Multi-scale training: shorter side sampled from [600, 1800],
    # longer side capped at 2000.
    dict(type='Resize',
         img_scale=[(2000, 600), (2000, 1800)],
         multiscale_mode='range',
         keep_ratio=True),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# Single-scale test at (2000, 1000), no flip augmentation.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2000, 1000),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
# One image per GPU; CBDINO's forward_train relies on this (it splits the
# batch into the two backbone branches).
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    train=dict(filter_empty_gt=True, pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
# optimizer: AdamW with layer-wise learning-rate decay across the 50
# backbone layers (custom constructor) and a reduced lr for DCN offsets.
optimizer = dict(
    type='AdamW', lr=0.0001, weight_decay=0.0001,
    constructor='CustomLayerDecayOptimizerConstructor',
    paramwise_cfg=dict(num_layers=50, layer_decay_rate=0.94,
                       depths=[6, 6, 32, 6], offset_lr_scale=1e-3))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
# learning policy: linear warmup, then constant lr (no decay steps).
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[])
# Short iteration-based fine-tuning schedule with frequent eval/ckpt.
runner = dict(type='IterBasedRunner', max_iters=20000)
checkpoint_config = dict(interval=200, max_keep_ckpts=3)
evaluation = dict(interval=200, save_best='auto')
# resume_from = None
# custom_hooks = [
#     dict(
#         type='ExpMomentumEMAHook',
#         resume_from=resume_from,
#         momentum=0.0003,
#         priority=49)
# ]
...@@ -7,4 +7,5 @@ ...@@ -7,4 +7,5 @@
from .backbones import *  # noqa: F401,F403
from .dense_heads import *  # noqa: F401,F403
from .detectors import *  # noqa: F401,F403
from .necks import *  # noqa: F401,F403
from .utils import *  # noqa: F401,F403
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .cbnet import CBInternImage
from .intern_image import InternImage

__all__ = ['InternImage', 'CBInternImage']
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import constant_init
from mmcv.runner import BaseModule
from mmdet.models.builder import BACKBONES
from torch.nn.modules.batchnorm import _BatchNorm
from .intern_image import InternImage
class LayerScale(nn.Module):
    """Learnable per-channel scaling: ``y = gamma * x``.

    ``gamma`` has shape (dim, 1, 1) so it broadcasts over the spatial
    dimensions of a channels-first feature map.
    """

    def __init__(self, init_values=0., dim=1024):
        super(LayerScale, self).__init__()
        # All channels start at the same value; zero makes the layer a no-op.
        initial = torch.ones((dim, 1, 1)) * init_values
        self.gamma = nn.Parameter(initial, requires_grad=True)

    def forward(self, x):
        """Scale ``x`` channel-wise by the learned gamma."""
        return x * self.gamma
class _InternImage(InternImage):
    """One branch of the composite (CB) InternImage backbone.

    ``cb_idx`` 0 is the leading branch; ``cb_idx`` 1 is the assisting
    branch, which owns zero-initialized per-level ``gamma{i}`` gates used to
    blend in composite features coming from the other branch.
    """

    def __init__(self, cb_idx, **kwargs):
        super(_InternImage, self).__init__(**kwargs)
        self.cb_idx = cb_idx
        # Channel count per pyramid level: channels * 2**level.
        self.num_features_list = [int(self.channels * 2 ** i) for i in range(self.num_levels)]
        if cb_idx == 1:
            # Zero init: the composite-feature injection starts as a no-op.
            self.gamma0 = nn.Parameter(torch.zeros((self.num_features_list[0])), requires_grad=True)
            self.gamma1 = nn.Parameter(torch.zeros((self.num_features_list[1])), requires_grad=True)
            self.gamma2 = nn.Parameter(torch.zeros((self.num_features_list[2])), requires_grad=True)
            self.gamma3 = nn.Parameter(torch.zeros((self.num_features_list[3])), requires_grad=True)

    def del_layers(self, del_stages):
        """Drop the patch-embed stem; the assisting branch reuses the leading
        branch's stem output instead of recomputing it."""
        self.del_stages = del_stages
        if self.del_stages >= 0:
            del self.patch_embed

    def forward(self, x, cb_feats=None, pre_tmps=None):
        """Forward one branch.

        Args:
            x: input image tensor (leading branch) — ignored by the
                assisting branch, which unpacks the stem output from
                ``pre_tmps`` instead.
            cb_feats: optional per-level composite features in [B, H, W, C]
                layout, added to the level input through the gamma gates.
            pre_tmps: stem output ``(x, Wh, Ww)`` from the leading branch.

        Returns:
            (outs, tmps): selected level outputs in NCHW layout, and the
            stem output list (empty for the assisting branch).
        """
        outs, tmps = [], []
        if hasattr(self, 'patch_embed'):
            x = self.patch_embed(x)
            x = self.pos_drop(x)
            Wh, Ww = x.size(1), x.size(2)
            tmps.append((x, Wh, Ww))
        else:
            x, Wh, Ww = pre_tmps[0]
        for i, level in enumerate(self.levels):
            if cb_feats is not None:
                gamma = getattr(self, f'gamma{i}')
                # Match the gate dtype to the composite features so the blend
                # works under both fp16 and fp32. (The original hard-coded
                # ``gamma.half()``, which raises a dtype mismatch in pure-fp32
                # runs; under fp16 autocast this is identical.)
                x = x + gamma.to(cb_feats[i].dtype) * cb_feats[i]  # [B, H, W, C]
            x, x_ = level(x, return_wo_downsample=True)
            if i in self.out_indices:
                outs.append(x_.permute(0, 3, 1, 2).contiguous())
        return tuple(outs), tmps

    def train(self, mode=True):
        """Delegate mode switching to InternImage."""
        super(_InternImage, self).train(mode)
@BACKBONES.register_module()
class CBInternImage(BaseModule):
    """Composite-Backbone (CBNet-style) InternImage.

    Two ``_InternImage`` branches are run in sequence: the per-level outputs
    of the first branch are projected by 1x1 convs (``cb_linears``), resized,
    and injected into the second branch as composite features.
    """

    def __init__(self, channels=96, out_indices=None, cb_zero_init=True, cb_del_stages=1, **kwargs):
        """
        Args:
            channels (int): base channel count of level 0.
            out_indices (list): exactly two entries, one per branch, each a
                tuple of level indices that branch outputs.
            cb_zero_init (bool): NOTE(review): stored but not read anywhere
                in this file — confirm whether it is still needed.
            cb_del_stages (int): stages removed from the assisting branch
                (it reuses the leading branch's stem output).
        """
        super(CBInternImage, self).__init__()
        self.cb_zero_init = cb_zero_init
        self.cb_del_stages = cb_del_stages
        self.out_indices = out_indices
        assert len(out_indices) == 2
        self.cb_modules = nn.ModuleList()
        for cb_idx in range(2):
            cb_module = _InternImage(channels=channels,
                                     out_indices=out_indices[cb_idx],
                                     cb_idx=cb_idx, **kwargs)
            if cb_idx > 0:
                # Assisting branch drops its stem; it consumes the leading
                # branch's stem output instead.
                cb_module.del_layers(cb_del_stages)
            self.cb_modules.append(cb_module)
        self.num_layers = self.cb_modules[0].num_layers

        # 1x1 projections mapping levels i..3 down to level i's channels
        # (Identity where channel counts already match).
        cb_inplanes = [channels * 2 ** i for i in range(self.num_layers)]
        self.cb_linears = nn.ModuleList()
        for i in range(self.num_layers):
            linears = nn.ModuleList()
            if i >= self.cb_del_stages - 1:
                jrange = 4 - i  # assumes a 4-level pyramid
                for j in range(jrange):
                    if cb_inplanes[i + j] != cb_inplanes[i]:
                        layer = nn.Conv2d(cb_inplanes[i + j], cb_inplanes[i], 1)
                    else:
                        layer = nn.Identity()
                    linears.append(layer)
            self.cb_linears.append(linears)

    def init_weights(self):
        """Initialize both branches with InternImage's own scheme."""
        for m in self.cb_modules:
            m.init_weights()

    def spatial_interpolate(self, x, H, W):
        """Nearest-neighbor resize of NCHW ``x`` to (H, W), if needed."""
        if H != x.shape[2] or W != x.shape[3]:
            x = F.interpolate(x, size=(H, W), mode='nearest')
        return x

    def _get_cb_feats(self, feats, tmps):
        """Build per-level composite features from one branch's outputs.

        For level i, sums the projected/resized features of levels i..3 and
        returns them in [B, H, W, C] layout (matching the level inputs of
        ``_InternImage.forward``). ``tmps[0][1:3]`` supplies the level-0
        spatial size; it is halved per level.
        """
        cb_feats = []
        Wh, Ww = tmps[0][1:3]
        for i in range(self.num_layers):
            feed = 0
            if i >= self.cb_del_stages - 1:
                jrange = 4 - i
                for j in range(jrange):
                    tmp = self.cb_linears[i][j](feats[j + i])
                    tmp = self.spatial_interpolate(tmp, Wh, Ww)
                    tmp = tmp.permute(0, 2, 3, 1)  # [B, H, W, C]
                    feed += tmp
            cb_feats.append(feed)
            Wh, Ww = Wh // 2, Ww // 2
        return cb_feats

    def forward(self, x):
        """Run both branches; returns one feature tuple per branch.

        The assisting branch receives the leading branch's composite
        features and stem output. When the first branch emits one extra
        level (see out_indices in the config), that level is dropped so the
        two branch outputs align.
        """
        outs = []
        for i, module in enumerate(self.cb_modules):
            if i == 0:
                feats, tmps = module(x)
            else:
                feats, tmps = module(x, cb_feats, tmps)
            outs.append(feats)
            if i < len(self.cb_modules) - 1:
                cb_feats = self._get_cb_feats(outs[-1], tmps)

        if len(self.out_indices[0]) == len(self.out_indices[1]) + 1:
            outs[0] = outs[0][1:]
        return tuple(outs)

    def train(self, mode=True):
        """Switch mode on both branches; BatchNorms inside ``cb_linears``
        are pinned to eval (frozen statistics)."""
        super(CBInternImage, self).train(mode)
        for m in self.cb_modules:
            m.train(mode=mode)
        for m in self.cb_linears.modules():
            if isinstance(m, _BatchNorm):
                m.eval()
...@@ -4,8 +4,9 @@ ...@@ -4,8 +4,9 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .cbdino_head import CBDINOHead
from .deformable_detr_head import DeformableDETRHead
from .detr_head import DETRHead
from .dino_head import DINOHead

__all__ = ['DeformableDETRHead', 'DETRHead', 'DINOHead', 'CBDINOHead']
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh, multi_apply,
reduce_mean)
from mmdet.models.builder import HEADS
from mmdet.models.utils.transformer import inverse_sigmoid
from ..utils import build_dn_generator
from .deformable_detr_head import DeformableDETRHead
@HEADS.register_module()
class CBDINOHead(DeformableDETRHead):
    """DINO detection head for the composite-backbone (CB) detector.

    Extends ``DeformableDETRHead`` with DINO's contrastive denoising (CDN)
    queries and a two-branch training loss: ``forward_train`` expects the
    batch to stack the two backbone branches along the batch dimension
    (branch 0 at index 0, branch 1 at index 1) and weights branch 0's loss
    by ``cb_first_weight``.
    """

    def __init__(self, cb_first_weight=0.5, *args, dn_cfg=None, **kwargs):
        """
        Args:
            cb_first_weight (float): loss weight applied to the first
                (assisting) branch; the second branch keeps weight 1.
            dn_cfg (dict | None): config for the denoising query generator.
        """
        super(CBDINOHead, self).__init__(*args, **kwargs)
        self._init_layers()
        self.init_denoising(dn_cfg)
        self.cb_first_weight = cb_first_weight
        assert self.as_two_stage, \
            'as_two_stage must be True for DINO'
        assert self.with_box_refine, \
            'with_box_refine must be True for DINO'

    def _init_layers(self):
        """Build parent layers plus the label embedding used for CDN queries."""
        super()._init_layers()
        self.label_embedding = nn.Embedding(self.cls_out_channels,
                                            self.embed_dims)

    def init_denoising(self, dn_cfg):
        """Build the denoising query generator, filling in head-derived
        fields of ``dn_cfg`` in place."""
        if dn_cfg is not None:
            dn_cfg['num_classes'] = self.num_classes
            dn_cfg['num_queries'] = self.num_query
            dn_cfg['hidden_dim'] = self.embed_dims
        self.dn_generator = build_dn_generator(dn_cfg)

    def upd_loss(self, losses, idx, weight):
        """Suffix every loss key with the branch index and scale 'loss'
        entries by ``weight`` (non-loss entries keep the weight suffix off)."""
        new_losses = dict()
        for k, v in losses.items():
            new_k = '{}{}'.format(k, idx)
            if weight != 1 and 'loss' in k:
                new_k = '{}_w{}'.format(new_k, weight)
            if isinstance(v, list) or isinstance(v, tuple):
                new_losses[new_k] = [i * weight for i in v]
            else:
                new_losses[new_k] = v * weight
        return new_losses

    def forward_train(self,
                      x,
                      img_metas,
                      gt_bboxes,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      proposal_cfg=None,
                      **kwargs):
        """One training step: build CDN queries, forward, then compute a
        separately weighted loss per composite branch.

        NOTE(review): predictions are split by batch index 0/1 and ground
        truth by list index 0/1 — this assumes exactly one image per GPU
        with both branches stacked in the batch (see CBDINO.forward_train);
        confirm before raising samples_per_gpu.
        """
        assert proposal_cfg is None, '"proposal_cfg" must be None'
        assert self.dn_generator is not None, '"dn_cfg" must be set'
        # Contrastive denoising queries generated from the ground truth.
        dn_label_query, dn_bbox_query, attn_mask, dn_meta = \
            self.dn_generator(gt_bboxes, gt_labels, self.label_embedding, img_metas)
        outs = self(x, img_metas, dn_label_query, dn_bbox_query, attn_mask)
        # Split per-branch predictions back out of the batch dimension.
        out1 = tuple(out[..., 0:1, :, :] for out in outs)
        out2 = tuple(out[..., 1:2, :, :] for out in outs)
        if gt_labels is None:
            loss_inputs1 = out1 + ([gt_bboxes[0]], [img_metas[0]], dn_meta)
            loss_inputs2 = out2 + ([gt_bboxes[1]], [img_metas[1]], dn_meta)
        else:
            loss_inputs1 = out1 + ([gt_bboxes[0]], [gt_labels[0]], [img_metas[0]], dn_meta)
            loss_inputs2 = out2 + ([gt_bboxes[1]], [gt_labels[1]], [img_metas[1]], dn_meta)
        loss_inputs = (loss_inputs1, loss_inputs2)
        losses = dict()
        # Branch 0 is down-weighted; branch 1 keeps weight 1.
        loss_weights = [self.cb_first_weight] + [1] * 1
        for i in range(2):
            sub_losses = self.loss(*loss_inputs[i], gt_bboxes_ignore=gt_bboxes_ignore)
            sub_losses = self.upd_loss(sub_losses, idx=i, weight=loss_weights[i])
            losses.update(sub_losses)
        return losses

    def forward(self,
                mlvl_feats,
                img_metas,
                dn_label_query=None,
                dn_bbox_query=None,
                attn_mask=None):
        """Forward the DINO transformer over multi-level features.

        Returns:
            (outputs_classes, outputs_coords, topk_score, topk_anchor) where
            the first two are stacked over decoder layers and coordinates
            are normalized cxcywh after sigmoid.
        """
        batch_size = mlvl_feats[0].size(0)
        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
        # Padding mask: 1 marks padded pixels, 0 marks valid image area.
        img_masks = mlvl_feats[0].new_ones(
            (batch_size, input_img_h, input_img_w))
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            img_masks[img_id, :img_h, :img_w] = 0

        mlvl_masks = []
        mlvl_positional_encodings = []
        for feat in mlvl_feats:
            mlvl_masks.append(
                F.interpolate(img_masks[None],
                              size=feat.shape[-2:]).to(torch.bool).squeeze(0))
            mlvl_positional_encodings.append(
                self.positional_encoding(mlvl_masks[-1]))

        query_embeds = None
        hs, inter_references, topk_score, topk_anchor = \
            self.transformer(
                mlvl_feats,
                mlvl_masks,
                query_embeds,
                mlvl_positional_encodings,
                dn_label_query,
                dn_bbox_query,
                attn_mask,
                reg_branches=self.reg_branches if self.with_box_refine else None,  # noqa:E501
                cls_branches=self.cls_branches if self.as_two_stage else None  # noqa:E501
            )
        hs = hs.permute(0, 2, 1, 3)

        if dn_label_query is not None and dn_label_query.size(1) == 0:
            # NOTE: If there is no target in the image, the parameters of
            # label_embedding won't be used in producing loss, which raises
            # RuntimeError when using distributed mode.
            hs[0] += self.label_embedding.weight[0, 0] * 0.0

        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            # Iterative box refinement: each layer predicts a delta on the
            # previous layer's (inverse-sigmoid) reference.
            reference = inter_references[lvl]
            reference = inverse_sigmoid(reference, eps=1e-3)
            outputs_class = self.cls_branches[lvl](hs[lvl])
            tmp = self.reg_branches[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_classes = torch.stack(outputs_classes)
        outputs_coords = torch.stack(outputs_coords)

        return outputs_classes, outputs_coords, topk_score, topk_anchor

    def loss(self,
             all_cls_scores,
             all_bbox_preds,
             enc_topk_scores,
             enc_topk_anchors,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             dn_meta=None,
             gt_bboxes_ignore=None):
        """Compute matching, encoder ('interm') and denoising losses.

        Returns a dict with 'loss_cls/bbox/iou' for the last decoder layer,
        'd{i}.*' for earlier layers, 'interm_*' for the encoder top-k
        proposals, and 'dn_*' / 'd{i}.dn_*' for the denoising queries.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'for gt_bboxes_ignore setting to None.'

        loss_dict = dict()

        # extract denoising and matching part of outputs
        all_cls_scores, all_bbox_preds, dn_cls_scores, dn_bbox_preds = \
            self.extract_dn_outputs(all_cls_scores, all_bbox_preds, dn_meta)

        if enc_topk_scores is not None:
            # calculate loss from encode feature maps
            # NOTE The DeformDETR calculate binary cls loss
            # for all encoder embeddings, while DINO calculate
            # multi-class loss for topk embeddings.
            enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
                self.loss_single(enc_topk_scores, enc_topk_anchors,
                                 gt_bboxes_list, gt_labels_list,
                                 img_metas, gt_bboxes_ignore)

            # collate loss from encode feature maps
            loss_dict['interm_loss_cls'] = enc_loss_cls
            loss_dict['interm_loss_bbox'] = enc_losses_bbox
            loss_dict['interm_loss_iou'] = enc_losses_iou

        # calculate loss from all decoder layers
        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]
        losses_cls, losses_bbox, losses_iou = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        # collate loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]

        # collate loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
                                                       losses_bbox[:-1],
                                                       losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1

        if dn_cls_scores is not None:
            # calculate denoising loss from all decoder layers
            # Replicate dn_meta per image for multi_apply.
            dn_meta = [dn_meta for _ in img_metas]
            tmp = self.loss_dn(dn_cls_scores, dn_bbox_preds, gt_bboxes_list, gt_labels_list, img_metas, dn_meta)
            # print(tmp)
            # NOTE(review): debug leftover — dumps raw scores if loss_dn
            # unexpectedly returns nothing; consider removing.
            if len(tmp) == 0:
                print(dn_cls_scores)
            dn_losses_cls, dn_losses_bbox, dn_losses_iou = tmp
            # collate denoising loss
            loss_dict['dn_loss_cls'] = dn_losses_cls[-1]
            loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1]
            loss_dict['dn_loss_iou'] = dn_losses_iou[-1]
            num_dec_layer = 0
            for loss_cls_i, loss_bbox_i, loss_iou_i in zip(
                    dn_losses_cls[:-1], dn_losses_bbox[:-1],
                    dn_losses_iou[:-1]):
                loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
                loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
                loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i
                num_dec_layer += 1
        # assert len(loss_dict.keys()) == 39, "number of keys must be 39!"
        return loss_dict

    def loss_dn(self, dn_cls_scores, dn_bbox_preds, gt_bboxes_list,
                gt_labels_list, img_metas, dn_meta):
        """Apply ``loss_dn_single`` over every decoder layer."""
        num_dec_layers = len(dn_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]
        dn_meta_list = [dn_meta for _ in range(num_dec_layers)]
        return multi_apply(self.loss_dn_single, dn_cls_scores, dn_bbox_preds,
                           all_gt_bboxes_list, all_gt_labels_list,
                           img_metas_list, dn_meta_list)

    def loss_dn_single(self, dn_cls_scores, dn_bbox_preds, gt_bboxes_list,
                       gt_labels_list, img_metas, dn_meta):
        """Denoising loss (cls, bbox, iou) for a single decoder layer."""
        num_imgs = dn_cls_scores.size(0)
        bbox_preds_list = [dn_bbox_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_dn_target(bbox_preds_list, gt_bboxes_list,
                                             gt_labels_list, img_metas,
                                             dn_meta)
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         num_total_pos, num_total_neg) = cls_reg_targets
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = dn_cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = \
            num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                cls_scores.new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)

        if len(cls_scores) > 0:
            loss_cls = self.loss_cls(
                cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
        else:
            loss_cls = torch.zeros(  # TODO: How to better return zero loss
                1,
                dtype=cls_scores.dtype,
                device=cls_scores.device)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # construct factors used for rescale bboxes
        factors = []
        for img_meta, bbox_pred in zip(img_metas, dn_bbox_preds):
            img_h, img_w, _ = img_meta['img_shape']
            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                           img_h]).unsqueeze(0).repeat(
                                               bbox_pred.size(0), 1)
            factors.append(factor)
        factors = torch.cat(factors, 0)

        # DETR regress the relative position of boxes (cxcywh) in the image,
        # thus the learning target is normalized by the image size. So here
        # we need to re-scale them for calculating IoU loss
        bbox_preds = dn_bbox_preds.reshape(-1, 4)
        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

        # regression IoU loss, defaultly GIoU loss
        loss_iou = self.loss_iou(
            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

        # regression L1 loss
        loss_bbox = self.loss_bbox(
            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
        return loss_cls, loss_bbox, loss_iou

    def get_dn_target(self, dn_bbox_preds_list, gt_bboxes_list, gt_labels_list,
                      img_metas, dn_meta):
        """Gather per-image denoising targets and total pos/neg counts."""
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         pos_inds_list,
         neg_inds_list) = multi_apply(self._get_dn_target_single,
                                      dn_bbox_preds_list, gt_bboxes_list,
                                      gt_labels_list, img_metas, dn_meta)
        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos, num_total_neg)

    def _get_dn_target_single(self, dn_bbox_pred, gt_bboxes, gt_labels,
                              img_meta, dn_meta):
        """Build denoising targets for one image.

        Each of the ``num_groups`` CDN groups holds ``single_pad`` query
        slots: positives (lightly noised gt) in the first half, negatives
        (heavily noised) offset by ``single_pad // 2``.

        NOTE(review): ``torch.range`` is deprecated in favor of
        ``torch.arange`` (same values here since the end is len-1), and
        ``.cuda()`` hard-codes the device — confirm on a CPU/other-device run.
        """
        num_groups = dn_meta['num_dn_group']
        pad_size = dn_meta['pad_size']
        assert pad_size % num_groups == 0
        single_pad = pad_size // num_groups
        num_bboxes = dn_bbox_pred.size(0)
        if len(gt_labels) > 0:
            t = torch.range(0, len(gt_labels) - 1).long().cuda()
            t = t.unsqueeze(0).repeat(num_groups, 1)
            pos_assigned_gt_inds = t.flatten()
            pos_inds = (torch.tensor(range(num_groups)) *
                        single_pad).long().cuda().unsqueeze(1) + t
            pos_inds = pos_inds.flatten()
        else:
            pos_inds = pos_assigned_gt_inds = torch.tensor([]).long().cuda()
        neg_inds = pos_inds + single_pad // 2

        # label targets
        labels = gt_bboxes.new_full((num_bboxes, ),
                                    self.num_classes,
                                    dtype=torch.long)
        labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
        label_weights = gt_bboxes.new_ones(num_bboxes)

        # bbox targets
        bbox_targets = torch.zeros_like(dn_bbox_pred)
        bbox_weights = torch.zeros_like(dn_bbox_pred)
        bbox_weights[pos_inds] = 1.0
        img_h, img_w, _ = img_meta['img_shape']

        # DETR regress the relative position of boxes (cxcywh) in the image.
        # Thus the learning target should be normalized by the image size, also
        # the box format should be converted from defaultly x1y1x2y2 to cxcywh.
        factor = dn_bbox_pred.new_tensor([img_w, img_h, img_w,
                                          img_h]).unsqueeze(0)
        gt_bboxes_normalized = gt_bboxes / factor
        gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized)
        bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1])

        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
                neg_inds)

    @staticmethod
    def extract_dn_outputs(all_cls_scores, all_bbox_preds, dn_meta):
        """Split predictions into (matching, denoising) parts along the
        query dimension; the first ``pad_size`` queries are denoising."""
        # if dn_meta and dn_meta['pad_size'] > 0:
        if dn_meta is not None:
            denoising_cls_scores = all_cls_scores[:, :, :
                                                  dn_meta['pad_size'], :]
            denoising_bbox_preds = all_bbox_preds[:, :, :
                                                  dn_meta['pad_size'], :]
            matching_cls_scores = all_cls_scores[:, :, dn_meta['pad_size']:, :]
            matching_bbox_preds = all_bbox_preds[:, :, dn_meta['pad_size']:, :]
        else:
            denoising_cls_scores = None
            denoising_bbox_preds = None
            matching_cls_scores = all_cls_scores
            matching_bbox_preds = all_bbox_preds
        return (matching_cls_scores, matching_bbox_preds, denoising_cls_scores,
                denoising_bbox_preds)

    def tta_test_bboxes(self, feats, img_metas, rescale=False):
        """Test det bboxes without test-time augmentation.

        Args:
            feats (tuple[torch.Tensor]): Multi-level features from the
                upstream network, each is a 4D-tensor.
            img_metas (list[dict]): List of image information.
            rescale (bool, optional): Whether to rescale the results.
                Defaults to False.

        Returns:
            tuple: last-decoder-layer (bbox_preds, cls_scores).
        """
        # forward of this head requires img_metas
        outs = self.forward(feats, img_metas)
        all_cls_scores, all_bbox_preds, enc_cls_scores, enc_bbox_preds = outs
        cls_scores = all_cls_scores[-1]
        bbox_preds = all_bbox_preds[-1]
        return bbox_preds, cls_scores
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .cbnet_dino import CBDINO
from .dino import DINO

__all__ = ['DINO', 'CBDINO']
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet.core import bbox2result
from mmdet.models.builder import DETECTORS
from .dino import DINO
@DETECTORS.register_module()
class CBDINO(DINO):
    """DINO detector with a composite (CBNet-style) dual-branch backbone.

    ``rule`` selects a parameter-freezing scheme applied once at
    construction by substring-matching parameter names.
    NOTE(review): the 'expect' in several rule names appears to mean
    'except'; the strings are kept verbatim because configs reference them.
    """

    def __init__(self, rule=None, **kwargs):
        super(CBDINO, self).__init__(**kwargs)
        # Freeze parameters according to the selected rule (no-op when
        # rule is None or unrecognized).
        for k, v in self.named_parameters():
            if rule == 'freeze_backbone_expect_level_4':
                # Freeze the backbone except branch-1 level 4.
                if 'backbone' in k and 'backbone.cb_modules.1.levels.3' not in k:
                    v.requires_grad = False
            if rule == 'freeze_backbone_expect_0_level_1_2_3':
                if 'backbone' in k and 'backbone.cb_modules.0.levels.0' not in k \
                        and 'backbone.cb_modules.1.levels.0' not in k \
                        and 'backbone.cb_modules.2.levels.0' not in k:
                    v.requires_grad = False
            if rule == 'freeze_backbone_expect_level_3_4':
                if 'backbone' in k and 'backbone.cb_modules.1.levels.2' not in k \
                        and 'backbone.cb_modules.1.levels.3' not in k:
                    v.requires_grad = False
            if rule == 'freeze_cb_first_backbone':
                if 'backbone.cb_modules.0' in k:
                    v.requires_grad = False
            if rule == 'freeze_cb_first_backbone_expect_level_4':
                if 'backbone.cb_modules.0' in k and 'levels.3' not in k:
                    v.requires_grad = False
            if rule == 'freeze_backbone':
                if 'backbone' in k:
                    v.requires_grad = False
            if rule == 'freeze_backbone_encoder':
                if 'backbone' in k or 'encoder' in k:
                    v.requires_grad = False
            if rule == 'freeze_backbone_neck':
                if 'backbone' in k or 'neck' in k:
                    v.requires_grad = False
            if rule == 'freeze_stage_1_2':
                if 'patch_embed' in k:
                    v.requires_grad = False
                if 'levels.0.' in k or 'levels.1.' in k:
                    v.requires_grad = False
            if rule == 'freeze_stage_1':
                if 'patch_embed' in k:
                    v.requires_grad = False
                if 'levels.0.' in k:
                    v.requires_grad = False

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None,
                      loss_weights=None,
                      **kwargs):
        """Training forward: run the CB backbone, then feed both branches
        through a shared head.

        The branch feature pyramids are concatenated along the batch
        dimension and the ground truth is duplicated to match; the head
        (CBDINOHead) splits and weights them per branch.
        """
        batch_input_shape = tuple(img[0].size()[-2:])
        for img_meta in img_metas:
            img_meta['batch_input_shape'] = batch_input_shape
        xs = self.extract_feat(img)
        # x0: x01, x02, x03, x04, x05
        # x1: x11, x12, x13, x14, x15
        if not isinstance(xs[0], (list, tuple)):
            # Single feature pyramid (non-CB backbone): wrap for uniformity.
            xs = [xs]
            loss_weights = None
        elif loss_weights is None:
            loss_weights = [0.5] + [1] * (len(xs) - 1)
            # [0.5, 1]
        # NOTE(review): loss_weights is computed but never consumed below —
        # branch weighting happens in the head via cb_first_weight; confirm.
        losses = dict()
        # NOTE(review): assumes exactly two branches; xs[1] raises IndexError
        # for a single-branch backbone despite the wrapping above.
        new_x = [torch.cat((xs[0][i], xs[1][i])) for i in range(len(xs[0]))]
        img_metas = img_metas + img_metas
        gt_bboxes = gt_bboxes + gt_bboxes
        gt_labels = gt_labels + gt_labels
        gt_bboxes_ignore = gt_bboxes_ignore + \
            gt_bboxes_ignore if gt_bboxes_ignore is not None else None
        losses = self.bbox_head.forward_train(
            new_x, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore)
        return losses

    def simple_test(self, img, img_metas, rescale=False):
        """Test without augmentation; returns per-class bbox arrays."""
        feat = self.extract_feat(img)
        results_list = self.bbox_head.simple_test_bboxes(
            feat, img_metas, rescale=rescale)
        bbox_results = [
            bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)
            for det_bboxes, det_labels in results_list
        ]
        return bbox_results
from .cbnet_channel_mapper import CBChannelMapper
__all__ = ['CBChannelMapper']
from mmdet.models.builder import NECKS
from mmdet.models.necks import ChannelMapper
@NECKS.register_module()
class CBChannelMapper(ChannelMapper):
    """ChannelMapper for composite-backbone (CB) feature lists.

    In training mode every branch's feature pyramid is mapped; in eval mode
    only the branch selected by ``cb_idx`` is mapped.
    """

    def __init__(self, cb_idx=1, **kwargs):
        super(CBChannelMapper, self).__init__(**kwargs)
        # Index of the branch used at inference time.
        self.cb_idx = cb_idx

    def forward(self, inputs):
        """Map one pyramid (eval) or every branch pyramid (train)."""
        # A plain pyramid arrives as a tuple of tensors; wrap it so the
        # code below always sees a list of pyramids.
        if not isinstance(inputs[0], (list, tuple)):
            inputs = [inputs]
        if not self.training:
            # Inference: only the selected branch is mapped.
            return super().forward(inputs[self.cb_idx])
        # Training: map each branch, preserving order. Explicit super()
        # arguments are required inside the comprehension scope.
        return [
            super(CBChannelMapper, self).forward(branch) for branch in inputs
        ]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment