Commit 63f0666a authored by zhe chen's avatar zhe chen
Browse files

Release OpenImages model

parent ecd555e9
......@@ -35,7 +35,7 @@ data = dict(
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/oidv6-train-annotations-bbox.csv',
img_prefix=data_root + 'OpenImages/train/',
img_prefix=data_root + 'train/',
label_file=data_root + 'annotations/class-descriptions-boxable.csv',
hierarchy_file=data_root +
'annotations/bbox_labels_600_hierarchy.json',
......@@ -43,7 +43,7 @@ data = dict(
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/validation-annotations-bbox.csv',
img_prefix=data_root + 'OpenImages/validation/',
img_prefix=data_root + 'validation/',
label_file=data_root + 'annotations/class-descriptions-boxable.csv',
hierarchy_file=data_root +
'annotations/bbox_labels_600_hierarchy.json',
......@@ -54,7 +54,7 @@ data = dict(
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/validation-annotations-bbox.csv',
img_prefix=data_root + 'OpenImages/validation/',
img_prefix=data_root + 'validation/',
label_file=data_root + 'annotations/class-descriptions-boxable.csv',
hierarchy_file=data_root +
'annotations/bbox_labels_600_hierarchy.json',
......
# OpenImages
## Introduction
OpenImages V6 is a large-scale dataset , consists of 9 million training images, 41,620 validation samples, and 125,456 test samples. It is a partially annotated dataset, with 9,600 trainable classes.
## Model Zoo
### DINO + CB-InternImage
| backbone | pretrain | mAP (ss) | #param | Config | Download |
| :--------------: | :--------: | :------: | :----: | :-----------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------: |
| CB-InternImage-H | Objects365 | 74.1 | 2.18B | [config](./dino_4scale_cbinternimage_h_objects365_openimages_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_openimages.pth) |
_base_ = [
'../_base_/datasets/openimages_detection.py',
'../_base_/default_runtime.py'
]
load_from = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_cbinternimage_h_objects365_80classes.pth'
model = dict(
type='CBDINO',
backbone=dict(
type='CBInternImage',
core_op='DCNv3',
channels=320,
depths=[6, 6, 32, 6],
groups=[10, 20, 40, 80],
mlp_ratio=4.,
drop_path_rate=0.5,
norm_layer='LN',
layer_scale=None,
offset_scale=1.0,
post_norm=False,
dw_kernel_size=5, # for InternImage-H/G
res_post_norm=True, # for InternImage-H/G
level2_post_norm=True, # for InternImage-H/G
level2_post_norm_block_ids=[5, 11, 17, 23, 29], # for InternImage-H/G
center_feature_scale=True, # for InternImage-H/G
with_cp=True,
out_indices=[(0, 1, 2, 3), (1, 2, 3)],
init_cfg=None,
),
neck=[dict(
type='CBChannelMapper',
in_channels=[640, 1280, 2560],
kernel_size=1,
out_channels=256,
act_cfg=None,
norm_cfg=dict(type='GN', num_groups=32),
num_outs=4)],
bbox_head=dict(
type='CBDINOHead',
num_query=900,
num_classes=601,
in_channels=2048, # TODO
sync_cls_avg_factor=True,
as_two_stage=True,
with_box_refine=True,
dn_cfg=dict(
type='CdnQueryGenerator',
noise_scale=dict(label=0.5, box=1.0), # 0.5, 0.4 for DN-DETR
group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=1000)),
transformer=dict(
type='DinoTransformer',
two_stage_num_proposals=900,
encoder=dict(
type='DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='BaseTransformerLayer',
attn_cfgs=dict(
type='MultiScaleDeformableAttention',
embed_dims=256,
dropout=0.0), # 0.1 for DeformDETR
feedforward_channels=2048, # 1024 for DeformDETR
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
ffn_drop=0.,
use_checkpoint=True,
act_cfg=dict(type='ReLU', inplace=True),),
ffn_dropout=0.0, # 0.1 for DeformDETR
operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
decoder=dict(
type='DinoTransformerDecoder',
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
type='DetrTransformerDecoderLayer',
attn_cfgs=[
dict(
type='MultiheadAttention',
embed_dims=256,
num_heads=8,
dropout=0.0), # 0.1 for DeformDETR
dict(
type='MultiScaleDeformableAttention',
num_levels=4,
embed_dims=256,
dropout=0.0), # 0.1 for DeformDETR
],
feedforward_channels=2048, # 1024 for DeformDETR
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
ffn_drop=0.,
use_checkpoint=True,
act_cfg=dict(type='ReLU', inplace=True),),
ffn_dropout=0.0, # 0.1 for DeformDETR
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')))),
positional_encoding=dict(
type='SinePositionalEncoding',
num_feats=128,
temperature=20,
normalize=True),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0), # 2.0 in DeformDETR
loss_bbox=dict(type='L1Loss', loss_weight=5.0),
loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
# training and testing settings
train_cfg=dict(
assigner=dict(
type='HungarianAssigner',
cls_cost=dict(type='FocalLossCost', weight=2.0),
reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)),
snip_cfg=dict(
type='v3',
weight=0.1)),
test_cfg=dict(max_per_img=300)) # TODO: Originally 100
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, denorm_bbox=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Resize',
img_scale=[(2000, 600), (2000, 1200)],
multiscale_mode='range',
keep_ratio=True),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(2000, 1000),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
])
]
data = dict(
samples_per_gpu=1,
workers_per_gpu=2,
train_dataloader=dict(class_aware_sampler=dict(num_sample_class=1)),
train=dict(filter_empty_gt=True, pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='AdamW', lr=0.0001, weight_decay=0.0001,
constructor='CustomLayerDecayOptimizerConstructor',
paramwise_cfg=dict(num_layers=50, layer_decay_rate=0.94,
depths=[6, 6, 32, 6], offset_lr_scale=1e-3))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[])
runner = dict(type='IterBasedRunner', max_iters=100000)
checkpoint_config = dict(interval=2000, max_keep_ckpts=3)
evaluation = dict(
interval=2000, metric='mAP', save_best='auto')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment