Commit 00af501a authored by zhe chen's avatar zhe chen
Browse files

Release detection models

parent 0dec0215
...@@ -112,8 +112,12 @@ Prepare datasets according to the guidelines in [MMDetection v2.28.1](https://gi ...@@ -112,8 +112,12 @@ Prepare datasets according to the guidelines in [MMDetection v2.28.1](https://gi
| :--------: | :--------------: | :--: | :-----: | :----: | :----------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | :--------: | :--------------: | :--: | :-----: | :----: | :----------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| DINO | InternImage-T | 1x | 53.9 | 49M | [config](./configs/coco/dino_4scale_internimage_t_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.json) | | DINO | InternImage-T | 1x | 53.9 | 49M | [config](./configs/coco/dino_4scale_internimage_t_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.json) |
| DINO | InternImage-L | 1x | 57.6 | 241M | [config](./configs/coco/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.log.json) | | DINO | InternImage-L | 1x | 57.6 | 241M | [config](./configs/coco/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.log.json) |
| DINO | CB-InternImage-H | 1x | 64.5 | 2.18B | [config](./configs/coco/dino_4scale_cbinternimage_h_objects365_coco_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_coco.pth) | | DINO | InternImage-H | 1x | 63.4 | 1.1B | [config](./configs/coco/dino_4scale_internimage_h_objects365_coco_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_h_objects365_coco.pth) |
| DINO (TTA) | CB-InternImage-H | 1x | 65.0 | 2.18B | - | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_coco.pth) | | DINO | CB-InternImage-H | 1x | 64.5 | 2.2B | [config](./configs/coco/dino_4scale_cbinternimage_h_objects365_coco_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_coco.pth) |
| DINO (TTA) | CB-InternImage-H | 1x | 65.0 | 2.2B | TODO | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_coco.pth) |
| DINO | InternImage-G | 1x | 64.2 | 3.1B | [config](./configs/coco/dino_4scale_internimage_g_objects365_coco_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_g_objects365_coco.pth) |
| DINO | CB-InternImage-G | 1x | 65.1 | 6B | TODO | TODO |
| DINO (TTA) | CB-InternImage-G | 1x | 65.3 | 6B | TODO | TODO |
</div> </div>
......
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
_base_ = [ _base_ = [
'../_base_/datasets/coco_detection.py', '../_base_/datasets/coco_detection.py',
'../_base_/default_runtime.py' '../_base_/default_runtime.py'
...@@ -122,7 +127,7 @@ model = dict( ...@@ -122,7 +127,7 @@ model = dict(
snip_cfg=dict( snip_cfg=dict(
type='v3', type='v3',
weight=0.1)), weight=0.1)),
test_cfg=dict(max_per_img=300)) # TODO: Originally 100 test_cfg=dict(max_per_img=300))
img_norm_cfg = dict( img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different # train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
......
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
_base_ = [
'../_base_/datasets/coco_detection.py',
'../_base_/default_runtime.py'
]
# DINO detector with an InternImage backbone.  The backbone widths here
# (channels=512 -> stage outputs 1024/2048/4096, matching the neck's
# in_channels) correspond to the largest released scale — presumably
# InternImage-G; confirm against the released config name.
model = dict(
    type='DINO',
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',  # deformable convolution v3 as the core operator
        channels=512,
        depths=[2, 2, 48, 4],
        groups=[16, 32, 64, 128],
        mlp_ratio=4.,
        drop_path_rate=0.5,
        norm_layer='LN',
        layer_scale=None,
        offset_scale=1.0,
        post_norm=True,
        dw_kernel_size=5,  # for InternImage-H/G
        res_post_norm=False,  # for InternImage-H/G
        level2_post_norm=True,  # for InternImage-H/G
        level2_post_norm_block_ids=[5, 11, 17, 23, 29, 35, 41, 47],  # for InternImage-H/G
        center_feature_scale=True,  # for InternImage-H/G
        with_cp=True,  # activation checkpointing to reduce memory
        out_indices=(1, 2, 3),  # feed the last three stages to the neck
        init_cfg=None  # dict(type='Pretrained', checkpoint=pretrained)
    ),
    neck=dict(
        type='ChannelMapper',
        # Channel counts of backbone stages 1/2/3 (512 * 2, * 4, * 8).
        in_channels=[1024, 2048, 4096],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=4),
    bbox_head=dict(
        type='DINOHead',
        num_query=900,
        num_classes=80,  # COCO
        in_channels=2048,  # TODO
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        # Contrastive denoising query settings (DINO's CDN branch).
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),  # 0.5, 0.4 for DN-DETR
            group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=1000)),
        transformer=dict(
            type='DinoTransformer',
            two_stage_num_proposals=900,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        dropout=0.0),  # 0.1 for DeformDETR
                    feedforward_channels=2048,  # 1024 for DeformDETR
                    # Split FFN with activation checkpointing (see
                    # efficient_ffn.py in this release).
                    ffn_cfgs=dict(
                        type='EfficientFFN',
                        embed_dims=256,
                        feedforward_channels=2048,
                        num_fcs=2,
                        ffn_drop=0.,
                        use_checkpoint=True,
                        act_cfg=dict(type='ReLU', inplace=True),),
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),  # 0.1 for DeformDETR
                        dict(
                            type='MultiScaleDeformableAttention',
                            num_levels=4,
                            embed_dims=256,
                            dropout=0.0),  # 0.1 for DeformDETR
                    ],
                    feedforward_channels=2048,  # 1024 for DeformDETR
                    ffn_cfgs=dict(
                        type='EfficientFFN',
                        embed_dims=256,
                        feedforward_channels=2048,
                        num_fcs=2,
                        ffn_drop=0.,
                        use_checkpoint=True,
                        act_cfg=dict(type='ReLU', inplace=True),),
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),  # 2.0 in DeformDETR
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)),
        snip_cfg=dict(
            type='v3',
            weight=0.1)),
    test_cfg=dict(max_per_img=300))  # keep up to 300 detections per image
# Normalisation statistics (ImageNet mean/std); images are converted to RGB.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}
# NOTE: img_scale and the Pad size_divisor differ from mmdet's defaults.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'with_bbox': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    # Multi-scale training in 'range' mode between (2000, 600) and
    # (2000, 1800), keeping the aspect ratio.
    {'type': 'Resize',
     'img_scale': [(2000, 600), (2000, 1800)],
     'multiscale_mode': 'range',
     'keep_ratio': True},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_bboxes', 'gt_labels']},
]
# Single-scale testing at (2000, 1000) without flip augmentation.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (2000, 1000),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]
# One image per GPU; training samples without GT boxes are filtered out.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {'filter_empty_gt': True, 'pipeline': train_pipeline},
    'val': {'pipeline': test_pipeline},
    'test': {'pipeline': test_pipeline},
}
# Optimizer: AdamW with layer-wise LR decay over the 56 backbone layers
# (2 + 2 + 48 + 4) and a scaled-down LR for the DCN offset parameters.
optimizer = {
    'type': 'AdamW',
    'lr': 0.0001,
    'weight_decay': 0.0001,
    'constructor': 'CustomLayerDecayOptimizerConstructor',
    'paramwise_cfg': {
        'num_layers': 56,
        'layer_decay_rate': 0.94,
        'depths': [2, 2, 48, 4],
        'offset_lr_scale': 1e-3,
    },
}
# Gradient clipping for DETR-style training stability.
optimizer_config = {'grad_clip': {'max_norm': 0.1, 'norm_type': 2}}
# learning policy: step schedule with linear warmup; no decay steps listed.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 500,
    'warmup_ratio': 0.001,
    'step': [],
}
runner = {'type': 'IterBasedRunner', 'max_iters': 20000}
checkpoint_config = {'interval': 200, 'max_keep_ckpts': 3}
evaluation = {'interval': 200, 'save_best': 'auto'}
# resume_from = None
# custom_hooks = [
# dict(
# type='ExpMomentumEMAHook',
# resume_from=resume_from,
# momentum=0.0003,
# priority=49)
# ]
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
_base_ = [
'../_base_/datasets/coco_detection.py',
'../_base_/default_runtime.py'
]
# DINO detector with an InternImage backbone.  The backbone widths here
# (channels=320 -> stage outputs 640/1280/2560, matching the neck's
# in_channels) correspond to a larger H-style scale — confirm against the
# released config name.
model = dict(
    type='DINO',
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',  # deformable convolution v3 as the core operator
        channels=320,
        depths=[6, 6, 32, 6],
        groups=[10, 20, 40, 80],
        mlp_ratio=4.,
        drop_path_rate=0.5,
        norm_layer='LN',
        layer_scale=None,
        offset_scale=1.0,
        post_norm=False,
        dw_kernel_size=5,  # for InternImage-H/G
        res_post_norm=True,  # for InternImage-H/G
        level2_post_norm=True,  # for InternImage-H/G
        level2_post_norm_block_ids=[5, 11, 17, 23, 29],  # for InternImage-H/G
        center_feature_scale=True,  # for InternImage-H/G
        with_cp=True,  # activation checkpointing to reduce memory
        out_indices=(1, 2, 3),  # feed the last three stages to the neck
        init_cfg=None  # dict(type='Pretrained', checkpoint=pretrained)
    ),
    neck=dict(
        type='ChannelMapper',
        # Channel counts of backbone stages 1/2/3 (320 * 2, * 4, * 8).
        in_channels=[640, 1280, 2560],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=4),
    bbox_head=dict(
        type='DINOHead',
        num_query=900,
        num_classes=80,  # COCO
        in_channels=2048,  # TODO
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        # Contrastive denoising query settings (DINO's CDN branch).
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),  # 0.5, 0.4 for DN-DETR
            group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=1000)),
        transformer=dict(
            type='DinoTransformer',
            two_stage_num_proposals=900,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        dropout=0.0),  # 0.1 for DeformDETR
                    feedforward_channels=2048,  # 1024 for DeformDETR
                    # NOTE(review): plain 'FFN' here (vs 'EfficientFFN' in the
                    # sibling config) may silently ignore use_checkpoint —
                    # confirm which FFN class is intended.
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=256,
                        feedforward_channels=2048,
                        num_fcs=2,
                        ffn_drop=0.,
                        use_checkpoint=True,
                        act_cfg=dict(type='ReLU', inplace=True),),
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),  # 0.1 for DeformDETR
                        dict(
                            type='MultiScaleDeformableAttention',
                            num_levels=4,
                            embed_dims=256,
                            dropout=0.0),  # 0.1 for DeformDETR
                    ],
                    feedforward_channels=2048,  # 1024 for DeformDETR
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=256,
                        feedforward_channels=2048,
                        num_fcs=2,
                        ffn_drop=0.,
                        use_checkpoint=True,
                        act_cfg=dict(type='ReLU', inplace=True),),
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),  # 2.0 in DeformDETR
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)),
        snip_cfg=dict(
            type='v3',
            weight=0.1)),
    test_cfg=dict(max_per_img=300))  # keep up to 300 detections per image
# Normalisation statistics (ImageNet mean/std); images are converted to RGB.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}
# NOTE: img_scale and the Pad size_divisor differ from mmdet's defaults.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'LoadAnnotations', 'with_bbox': True},
    {'type': 'RandomFlip', 'flip_ratio': 0.5},
    # Multi-scale training in 'range' mode between (2000, 600) and
    # (2000, 1800), keeping the aspect ratio.
    {'type': 'Resize',
     'img_scale': [(2000, 600), (2000, 1800)],
     'multiscale_mode': 'range',
     'keep_ratio': True},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size_divisor': 32},
    {'type': 'DefaultFormatBundle'},
    {'type': 'Collect', 'keys': ['img', 'gt_bboxes', 'gt_labels']},
]
# Single-scale testing at (2000, 1000) without flip augmentation.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'MultiScaleFlipAug',
        'img_scale': (2000, 1000),
        'flip': False,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'RandomFlip'},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 32},
            {'type': 'ImageToTensor', 'keys': ['img']},
            {'type': 'Collect', 'keys': ['img']},
        ],
    },
]
# One image per GPU; training samples without GT boxes are filtered out.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 2,
    'train': {'filter_empty_gt': True, 'pipeline': train_pipeline},
    'val': {'pipeline': test_pipeline},
    'test': {'pipeline': test_pipeline},
}
# Optimizer: AdamW with layer-wise LR decay over the 50 backbone layers
# (6 + 6 + 32 + 6) and a scaled-down LR for the DCN offset parameters.
optimizer = {
    'type': 'AdamW',
    'lr': 0.0001,
    'weight_decay': 0.0001,
    'constructor': 'CustomLayerDecayOptimizerConstructor',
    'paramwise_cfg': {
        'num_layers': 50,
        'layer_decay_rate': 0.94,
        'depths': [6, 6, 32, 6],
        'offset_lr_scale': 1e-3,
    },
}
# Gradient clipping for DETR-style training stability.
optimizer_config = {'grad_clip': {'max_norm': 0.1, 'norm_type': 2}}
# learning policy: step schedule with linear warmup; no decay steps listed.
lr_config = {
    'policy': 'step',
    'warmup': 'linear',
    'warmup_iters': 500,
    'warmup_ratio': 0.001,
    'step': [],
}
runner = {'type': 'IterBasedRunner', 'max_iters': 20000}
checkpoint_config = {'interval': 200, 'max_keep_ckpts': 3}
evaluation = {'interval': 200, 'save_best': 'auto'}
# resume_from = None
# custom_hooks = [
# dict(
# type='ExpMomentumEMAHook',
# resume_from=resume_from,
# momentum=0.0003,
# priority=49)
# ]
...@@ -4,11 +4,14 @@ ...@@ -4,11 +4,14 @@
# Licensed under The MIT License [see LICENSE for details] # Licensed under The MIT License [see LICENSE for details]
# -------------------------------------------------------- # --------------------------------------------------------
import torch
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from .custom_layer_decay_optimizer_constructor import \ from .custom_layer_decay_optimizer_constructor import \
CustomLayerDecayOptimizerConstructor CustomLayerDecayOptimizerConstructor
from .efficient_ffn import EfficientFFN
__all__ = ['CustomLayerDecayOptimizerConstructor'] __all__ = ['CustomLayerDecayOptimizerConstructor', 'EfficientFFN']
if torch.__version__.startswith('1.11'): if torch.__version__.startswith('1.11'):
......
# Copyright (c) OpenMMLab. All rights reserved.
import math
import warnings
from typing import Sequence

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch.nn.init import normal_

from mmcv.cnn import (build_activation_layer, build_conv_layer,
                      build_norm_layer, xavier_init)
from mmcv.cnn.bricks.drop import build_dropout
from mmcv.cnn.bricks.registry import (FEEDFORWARD_NETWORK, TRANSFORMER_LAYER,
                                      TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.transformer import (BaseTransformerLayer,
                                         TransformerLayerSequence,
                                         build_transformer_layer_sequence)
from mmcv.runner import force_fp32
from mmcv.runner.base_module import BaseModule
from mmcv.utils import deprecated_api_warning, to_2tuple

from mmdet.models.utils.builder import TRANSFORMER
@FEEDFORWARD_NETWORK.register_module()
class EfficientFFN(BaseModule):
    """Memory-efficient FFN whose hidden layer is split into parallel branches.

    A vanilla FFN projects ``embed_dims -> feedforward_channels ->
    embed_dims``. Here the hidden dimension is partitioned into ``split``
    branches of width ``feedforward_channels // split``: each branch has its
    own input projection ``fc1_i`` (with bias) and bias-free output
    projection ``fc2_i``; the branch outputs are summed and a single shared
    output bias ``fc2_bias`` is added, which is the same parameterization as
    the unsplit FFN. With ``use_checkpoint=True`` each branch is recomputed
    during backward via activation checkpointing, so only one branch's
    activations need to be held in memory at a time.

    Args:
        embed_dims (int): Input/output feature dimension. Default: 256.
        feedforward_channels (int): Total hidden dimension summed over all
            branches; should be divisible by ``split``. Default: 1024.
        num_fcs (int): Kept for API compatibility with mmcv's ``FFN``; must
            be at least 2 (two linear layers per branch). Default: 2.
        act_cfg (dict): Config of the hidden activation layer.
        ffn_drop (float): Dropout rate applied after the activation and after
            each branch's output projection. Default: 0.
        dropout_layer (dict | None): Config of the dropout/drop-path applied
            to the summed output; ``None`` means identity.
        add_identity (bool): Whether to add the residual identity to the
            output. Default: True.
        init_cfg (dict | None): Initialization config for ``BaseModule``.
        split (int): Number of parallel hidden branches. Default: 4.
        use_checkpoint (bool): Wrap each branch in
            ``torch.utils.checkpoint.checkpoint``. Default: False.
    """

    @deprecated_api_warning(
        {
            'dropout': 'ffn_drop',
            'add_residual': 'add_identity'
        },
        cls_name='EfficientFFN')
    def __init__(self,
                 embed_dims=256,
                 feedforward_channels=1024,
                 num_fcs=2,
                 act_cfg=dict(type='ReLU', inplace=True),
                 ffn_drop=0.,
                 dropout_layer=None,
                 add_identity=True,
                 init_cfg=None,
                 split=4,
                 use_checkpoint=False,
                 **kwargs):
        super(EfficientFFN, self).__init__(init_cfg)
        assert num_fcs >= 2, 'num_fcs should be no less ' \
            f'than 2. got {num_fcs}.'
        self.embed_dims = embed_dims
        self.feedforward_channels = feedforward_channels
        self.num_fcs = num_fcs
        self.act_cfg = act_cfg
        self.activate = build_activation_layer(act_cfg)
        self.drop = nn.Dropout(ffn_drop)
        self.use_checkpoint = use_checkpoint
        self.split = split
        # Per-branch input projections: embed_dims -> hidden // split.
        for i in range(split):
            fc1 = nn.Linear(embed_dims, feedforward_channels // self.split,
                            bias=True)
            setattr(self, f'fc1_{i}', fc1)
        # Per-branch output projections are bias-free; the shared fc2_bias
        # below plays the role of the single output bias of an unsplit FFN.
        for i in range(split):
            fc2 = nn.Linear(feedforward_channels // self.split, embed_dims,
                            bias=False)
            setattr(self, f'fc2_{i}', fc2)
        self.fc2_bias = nn.Parameter(torch.zeros(
            (embed_dims)), requires_grad=True)
        # NOTE(review): fc2_bias is zero-initialized; the fan-in uniform init
        # from the original release is kept for reference:
        # fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.fc2_0.weight)
        # bound = 1 / math.sqrt(fan_in)
        # torch.nn.init.uniform_(self.fc2_bias, -bound, bound)
        self.dropout_layer = build_dropout(
            dropout_layer) if dropout_layer else torch.nn.Identity()
        self.add_identity = add_identity

    # Fixed: the deprecation warning previously reported cls_name='FFN',
    # which mislabeled warnings emitted for this class.
    @deprecated_api_warning({'residual': 'identity'}, cls_name='EfficientFFN')
    def forward(self, x, identity=None):
        """Run the split FFN.

        Args:
            x (Tensor): Input features of shape ``(..., embed_dims)``.
            identity (Tensor, optional): Residual tensor; defaults to ``x``
                when ``add_identity`` is set.

        Returns:
            Tensor: Output features of shape ``(..., embed_dims)``.
        """

        def _inner_forward(x, i):
            # One branch: fc1_i -> activation -> dropout -> fc2_i -> dropout.
            fc1 = getattr(self, f'fc1_{i}')
            x = fc1(x)
            x = self.activate(x)
            x = self.drop(x)
            fc2 = getattr(self, f'fc2_{i}')
            x = fc2(x)
            x = self.drop(x)
            return x

        out = 0
        for i in range(self.split):
            # checkpoint() only helps (and only works) when grads are needed.
            if self.use_checkpoint and x.requires_grad:
                out = out + checkpoint.checkpoint(_inner_forward, x, i)
            else:
                out = out + _inner_forward(x, i)
        out = out + self.fc2_bias
        if not self.add_identity:
            return self.dropout_layer(out)
        if identity is None:
            identity = x
        return identity + self.dropout_layer(out)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment