Commit c570a7eb authored by zhe chen

Release Cityscapes model

parent 477b5ed3
......@@ -115,7 +115,7 @@ Prepare datasets according to the [guidelines](https://github.com/open-mmlab/mms
<div>
| method | backbone | resolution | mIoU (ss/ms) | #params | FLOPs | Config | Download |
| :---------: | :------------: | :--------: | :-----------: | :-----: | :---: | :-------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| :---------: | :------------: | :--------: | :-----------: | :-----: | :---: | :-----------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| UperNet | InternImage-T | 512x1024 | 82.58 / 83.40 | 59M | 1889G | [config](./configs/cityscapes/upernet_internimage_t_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_t_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_t_512x1024_160k_cityscapes.log.json) |
| UperNet | InternImage-S | 512x1024 | 82.74 / 83.45 | 80M | 2035G | [config](./configs/cityscapes/upernet_internimage_s_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_s_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_s_512x1024_160k_cityscapes.log.json) |
| UperNet | InternImage-B | 512x1024 | 83.18 / 83.97 | 128M | 2369G | [config](./configs/cityscapes/upernet_internimage_b_512x1024_160k_cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_b_512x1024_160k_cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_b_512x1024_160k_cityscapes.log.json) |
......@@ -125,6 +125,7 @@ Prepare datasets according to the [guidelines](https://github.com/open-mmlab/mms
| UperNet\* | InternImage-XL | 512x1024 | 86.20 / 86.42 | 368M | 4022G | [config](./configs/cityscapes/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.log.json) |
| SegFormer\* | InternImage-L | 512x1024 | 85.16 / 85.67 | 220M | 1580G | [config](./configs/cityscapes/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.log.json) |
| SegFormer\* | InternImage-XL | 512x1024 | 85.41 / 85.93 | 330M | 2364G | [config](./configs/cityscapes/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.log.json) |
| Mask2Former | InternImage-H | 1024x1024 | 86.37 / 86.96 | 1094M | 7878G | [config](./configs/cityscapes/mask2former_internimage_h_1024x1024_80k_mapillary2cityscapes_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/mask2former_internimage_h_1024x1024_80k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/mask2former_internimage_h_1024x1024_80k_mapillary2cityscapes.log.json) |
\* denotes that the model is trained with the additional Mapillary dataset.
......@@ -145,6 +146,19 @@ Prepare datasets according to the [guidelines](https://github.com/open-mmlab/mms
</details>
<details>
<summary> Dataset: COCO-Stuff-10K </summary>
<br>
<div>
| method | backbone | resolution | mIoU (ss) | #params | FLOPs | Config | Download |
| :---------: | :-----------: | :--------: | :-------: | :-----: | :---: | :------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| Mask2Former | InternImage-H | 896x896 | 52.6 | 1.31B | 4635G | [config](./configs/coco_stuff10k/mask2former_internimage_h_896_80k_cocostuff10k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/mask2former_internimage_h_896_80k_cocostuff10k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/mask2former_internimage_h_896_80k_cocostuff10k.log.json) |
</div>
</details>
## Evaluation
To evaluate our `InternImage` on ADE20K val, run:
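For example, a minimal sketch of a multi-GPU evaluation command, assuming the repository's MMSegmentation-style `dist_test.sh` launcher (the config file, checkpoint path, and GPU count are placeholders to fill in from the tables above):

```sh
# evaluate a released checkpoint with mIoU as the metric
sh dist_test.sh <config-file> <checkpoint> 8 --eval mIoU
```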
......
_base_ = './cityscapes_extra.py'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (1024, 1024)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations'),
dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(2048, 1024),
# img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
# dataset settings
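# Train on Mapillary Vistas (training + validation splits); validate and test on Cityscapes val.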
dataset_type = 'MapillaryDataset'
data_root = 'data/Mapillary/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (896, 896)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations'),
dict(type='MapillaryHack'),
dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 1.0)),
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(2048, 1024),
# img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
data_root='data/Mapillary/',
img_dir=['training/images', 'validation/images'],
ann_dir=['training/labels', 'validation/labels'],
pipeline=train_pipeline),
val=dict(
type='CityscapesDataset',
data_root='data/cityscapes/',
img_dir='leftImg8bit/val',
ann_dir='gtFine/val',
pipeline=test_pipeline),
test=dict(
type='CityscapesDataset',
data_root='data/cityscapes/',
img_dir='leftImg8bit/val',
ann_dir='gtFine/val',
pipeline=test_pipeline))
......@@ -23,6 +23,7 @@ model = dict(
offset_scale=1.0,
post_norm=True,
with_cp=False,
out_indices=(0, 1, 2, 3),
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(num_classes=150, in_channels=[80, 160, 320, 640]),
auxiliary_head=dict(num_classes=150, in_channels=320),
......
......@@ -36,3 +36,11 @@ Mapillary 80k + Cityscapes (w/ coarse data) 160k
| :------------: | :--------: | :-----------: | :----------: | :--------: | :-----: | :---: | :------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| InternImage-L | 512x1024 | 85.16 / 85.67 | 0.37s / iter | 17h | 220M | 1580G | [config](./segformer_internimage_l_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.log.json) |
| InternImage-XL | 512x1024 | 85.41 / 85.93 | 0.43s / iter | 19.5h | 330M | 2364G | [config](./segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.log.json) |
### Mask2Former + InternImage (with additional data)
Mapillary 80k + Cityscapes (w/ coarse data) 80k
| backbone | resolution | mIoU (ss/ms) | #params | FLOPs | Config | Download |
| :-----------: | :--------: | :-----------: | :-----: | :---: | :----------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| InternImage-H | 1024x1024 | 86.37 / 86.96 | 1094M | 7878G | [config](./mask2former_internimage_h_1024x1024_80k_mapillary2cityscapes_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/mask2former_internimage_h_1024x1024_80k_mapillary2cityscapes.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/mask2former_internimage_h_1024x1024_80k_mapillary2cityscapes.log.json) |
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
_base_ = [
'../_base_/models/mask2former_beit.py', '../_base_/datasets/cityscapes_1024x1024.py',
'../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py'
]
num_classes = 19
crop_size = (1024, 1024)
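# initialise from the Mapillary-pretrained Mask2Former checkpoint below, then fine-tune on Cityscapes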
load_from = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/mask2former_internimage_h_896x896_80k_mapillary.pth'
model = dict(
type='EncoderDecoderMask2Former',
backbone=dict(
_delete_=True,
type='InternImage',
core_op='DCNv3',
channels=320,
depths=[6, 6, 32, 6],
groups=[10, 20, 40, 80],
mlp_ratio=4.,
drop_path_rate=0.5,
norm_layer='LN',
layer_scale=None,
offset_scale=1.0,
post_norm=False,
dw_kernel_size=5, # for InternImage-H/G
res_post_norm=True, # for InternImage-H/G
level2_post_norm=True, # for InternImage-H/G
level2_post_norm_block_ids=[5, 11, 17, 23, 29], # for InternImage-H/G
center_feature_scale=True, # for InternImage-H/G
with_cp=False,
out_indices=(0, 1, 2, 3),
init_cfg=None),
decode_head=dict(
in_channels=[320, 640, 1280, 2560],
feat_channels=256,
out_channels=256,
num_classes=num_classes,
num_queries=100,
pixel_decoder=dict(
type='MSDeformAttnPixelDecoder',
num_outs=3,
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU'),
encoder=dict(
type='DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='BaseTransformerLayer',
attn_cfgs=dict(
type='MultiScaleDeformableAttention',
embed_dims=256,
num_heads=8,
num_levels=3,
num_points=4,
im2col_step=64,
dropout=0.0,
batch_first=False,
norm_cfg=None,
init_cfg=None),
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.0,
with_cp=False, # set with_cp=True to save memory
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'ffn', 'norm')),
init_cfg=None),
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True),
init_cfg=None),
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True),
transformer_decoder=dict(
type='DetrTransformerDecoder',
return_intermediate=True,
num_layers=9,
transformerlayers=dict(
type='DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.0,
proj_drop=0.0,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.0,
dropout_layer=None,
with_cp=False, # set with_cp=True to save memory
add_identity=True),
feedforward_channels=2048,
operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=2.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1])
),
test_cfg=dict(mode='slide', crop_size=crop_size, stride=(512, 512)))
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations'),
dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
dict(type='ToMask'),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_semantic_seg', 'gt_masks', 'gt_labels'])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(2048, 1024),
img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
flip=True,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='ResizeToMultiple', size_divisor=32),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
optimizer = dict(
_delete_=True, type='AdamW', lr=1e-5, betas=(0.9, 0.999), weight_decay=0.05,
constructor='CustomLayerDecayOptimizerConstructor',
paramwise_cfg=dict(num_layers=50, layer_decay_rate=0.95,
depths=[6, 6, 32, 6], offset_lr_scale=1.0))
lr_config = dict(_delete_=True, policy='poly',
warmup='linear',
warmup_iters=1500,
warmup_ratio=1e-6,
power=1.0, min_lr=0.0, by_epoch=False)
# By default, models are trained on 8 GPUs with 2 images per GPU
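# e.g. a launch sketch, assuming the repo's MMSegmentation-style launcher (paths are placeholders):
#   sh dist_train.sh <path/to/this/config> 8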
data = dict(samples_per_gpu=2,
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
runner = dict(type='IterBasedRunner')
optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
evaluation = dict(interval=2000, metric='mIoU', save_best='mIoU')
# fp16 = dict(loss_scale=dict(init_scale=512))
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
_base_ = [
'../_base_/models/mask2former_beit.py', '../_base_/datasets/cityscapes_extra_1024x1024.py',
'../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py'
]
num_classes = 19
crop_size = (1024, 1024)
load_from = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/mask2former_internimage_h_896x896_80k_mapillary.pth'
model = dict(
type='EncoderDecoderMask2Former',
backbone=dict(
_delete_=True,
type='InternImage',
core_op='DCNv3',
channels=320,
depths=[6, 6, 32, 6],
groups=[10, 20, 40, 80],
mlp_ratio=4.,
drop_path_rate=0.5,
norm_layer='LN',
layer_scale=None,
offset_scale=1.0,
post_norm=False,
dw_kernel_size=5, # for InternImage-H/G
res_post_norm=True, # for InternImage-H/G
level2_post_norm=True, # for InternImage-H/G
level2_post_norm_block_ids=[5, 11, 17, 23, 29], # for InternImage-H/G
center_feature_scale=True, # for InternImage-H/G
with_cp=False,
out_indices=(0, 1, 2, 3),
init_cfg=None),
decode_head=dict(
in_channels=[320, 640, 1280, 2560],
feat_channels=256,
out_channels=256,
num_classes=num_classes,
num_queries=100,
pixel_decoder=dict(
type='MSDeformAttnPixelDecoder',
num_outs=3,
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU'),
encoder=dict(
type='DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='BaseTransformerLayer',
attn_cfgs=dict(
type='MultiScaleDeformableAttention',
embed_dims=256,
num_heads=8,
num_levels=3,
num_points=4,
im2col_step=64,
dropout=0.0,
batch_first=False,
norm_cfg=None,
init_cfg=None),
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.0,
with_cp=False, # set with_cp=True to save memory
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'ffn', 'norm')),
init_cfg=None),
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True),
init_cfg=None),
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True),
transformer_decoder=dict(
type='DetrTransformerDecoder',
return_intermediate=True,
num_layers=9,
transformerlayers=dict(
type='DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.0,
proj_drop=0.0,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.0,
dropout_layer=None,
with_cp=False, # set with_cp=True to save memory
add_identity=True),
feedforward_channels=2048,
operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=2.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1])
),
test_cfg=dict(mode='slide', crop_size=crop_size, stride=(512, 512)))
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations'),
dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
dict(type='ToMask'),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_semantic_seg', 'gt_masks', 'gt_labels'])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(2048, 1024),
# img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='ResizeToMultiple', size_divisor=32),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
optimizer = dict(
_delete_=True, type='AdamW', lr=1e-5, betas=(0.9, 0.999), weight_decay=0.05,
constructor='CustomLayerDecayOptimizerConstructor',
paramwise_cfg=dict(num_layers=50, layer_decay_rate=0.95,
depths=[6, 6, 32, 6], offset_lr_scale=1.0))
lr_config = dict(_delete_=True, policy='poly',
warmup='linear',
warmup_iters=1500,
warmup_ratio=1e-6,
power=1.0, min_lr=0.0, by_epoch=False)
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2,
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
runner = dict(type='IterBasedRunner')
optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
evaluation = dict(interval=2000, metric='mIoU', save_best='mIoU')
# fp16 = dict(loss_scale=dict(init_scale=512))
......@@ -21,3 +21,9 @@ We first pretrain our models on the Mapillary Vistas dataset, then finetune them
| :------------: | :--------: | :--: | :----------: | :--------: | :-----: | :---: | :------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------: |
| InternImage-L | 512x1024 | 80k | 0.37s / iter | 9h | 220M | 1580G | [config](./segformer_internimage_l_512x1024_80k_mapillary.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_l_512x1024_80k_mapillary.pth) |
| InternImage-XL | 512x1024 | 80k | 0.43s / iter | 10h | 330M | 2364G | [config](./segformer_internimage_xl_512x1024_80k_mapillary.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/segformer_internimage_xl_512x1024_80k_mapillary.pth) |
### Mask2Former + InternImage
| backbone | resolution | schd | #params | FLOPs | Config | Download |
| :-----------: | :--------: | :--: | :-----: | :---: | :------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------: |
| InternImage-H | 896x896 | 80k | 1094M | 7878G | [config](./mask2former_internimage_h_896x896_80k_mapillary.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/mask2former_internimage_h_896x896_80k_mapillary.pth) |
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
_base_ = [
'../_base_/models/mask2former_beit.py', '../_base_/datasets/mapillary_896x896.py',
'../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py'
]
num_classes = 19
crop_size = (896, 896)
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_h_jointto22k_384.pth'
model = dict(
type='EncoderDecoderMask2Former',
backbone=dict(
_delete_=True,
type='InternImage',
core_op='DCNv3',
channels=320,
depths=[6, 6, 32, 6],
groups=[10, 20, 40, 80],
mlp_ratio=4.,
drop_path_rate=0.5,
norm_layer='LN',
layer_scale=None,
offset_scale=1.0,
post_norm=False,
dw_kernel_size=5, # for InternImage-H/G
res_post_norm=True, # for InternImage-H/G
level2_post_norm=True, # for InternImage-H/G
level2_post_norm_block_ids=[5, 11, 17, 23, 29], # for InternImage-H/G
center_feature_scale=True, # for InternImage-H/G
with_cp=False,
out_indices=(0, 1, 2, 3),
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(
in_channels=[320, 640, 1280, 2560],
feat_channels=256,
out_channels=256,
num_classes=num_classes,
num_queries=100,
pixel_decoder=dict(
type='MSDeformAttnPixelDecoder',
num_outs=3,
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU'),
encoder=dict(
type='DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='BaseTransformerLayer',
attn_cfgs=dict(
type='MultiScaleDeformableAttention',
embed_dims=256,
num_heads=8,
num_levels=3,
num_points=4,
im2col_step=64,
dropout=0.0,
batch_first=False,
norm_cfg=None,
init_cfg=None),
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.0,
with_cp=False, # set with_cp=True to save memory
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'ffn', 'norm')),
init_cfg=None),
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True),
init_cfg=None),
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True),
transformer_decoder=dict(
type='DetrTransformerDecoder',
return_intermediate=True,
num_layers=9,
transformerlayers=dict(
type='DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.0,
proj_drop=0.0,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.0,
dropout_layer=None,
with_cp=False, # set with_cp=True to save memory
add_identity=True),
feedforward_channels=2048,
operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=2.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1])
),
test_cfg=dict(mode='slide', crop_size=crop_size, stride=(512, 512)))
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations'),
dict(type='MapillaryHack'),
dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
dict(type='ToMask'),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_semantic_seg', 'gt_masks', 'gt_labels'])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(2048, 1024),
# img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='ResizeToMultiple', size_divisor=32),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
optimizer = dict(
_delete_=True, type='AdamW', lr=2e-5, betas=(0.9, 0.999), weight_decay=0.05,
constructor='CustomLayerDecayOptimizerConstructor',
paramwise_cfg=dict(num_layers=50, layer_decay_rate=0.95,
depths=[6, 6, 32, 6], offset_lr_scale=1.0))
lr_config = dict(_delete_=True, policy='poly',
warmup='linear',
warmup_iters=1500,
warmup_ratio=1e-6,
power=1.0, min_lr=0.0, by_epoch=False)
# By default, models are trained on 16 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2,
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
runner = dict(type='IterBasedRunner')
optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
evaluation = dict(interval=2000, metric='mIoU', save_best='mIoU')
# fp16 = dict(loss_scale=dict(init_scale=512))
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
from functools import partial
import mmcv_custom # noqa: F401,F403
import mmseg_custom # noqa: F401,F403
......
......@@ -462,7 +462,10 @@ class Mask2FormerHead(BaseDecodeHead):
decoder layer. Each with shape (batch_size, num_queries, \
h, w).
"""
try:
    batch_size = len(img_metas)
except TypeError:
    # img_metas may not be a sized sequence here; fall back to a single image
    batch_size = 1
mask_features, multi_scale_memorys = self.pixel_decoder(feats)
# multi_scale_memorys (from low resolution to high resolution)
decoder_inputs = []
......@@ -570,7 +573,7 @@ class Mask2FormerHead(BaseDecodeHead):
"""
all_cls_scores, all_mask_preds = self(inputs, img_metas)
cls_score, mask_pred = all_cls_scores[-1], all_mask_preds[-1]
# ori_h, ori_w, _ = img_metas[0]['ori_shape']
# semantic inference
cls_score = F.softmax(cls_score, dim=-1)[..., :-1]
......