Commit 0f94318a authored by Zhe Chen's avatar Zhe Chen Committed by zhe chen
Browse files

[release] Add configs and models for cityscapes (#31)

* add _base_ files

* support mapillary dataset

* reorganize ade20k configs

* move figs to docs

* release configs and models of cityscapes

* update README.md

* update load_from urls
parent 0f982e7a
# dataset settings
# Base dataset config for STARE retinal-vessel segmentation
# (mmsegmentation exec-style config: top-level variables form the config dict).
dataset_type = 'STAREDataset'
data_root = 'data/STARE'
# ImageNet mean/std; to_rgb converts the loaded BGR image to RGB first.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# STARE images are 605x700; training uses small 128x128 crops.
img_scale = (605, 700)
crop_size = (128, 128)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    # Random rescale between 0.5x and 2.0x of img_scale before cropping.
    dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
    # cat_max_ratio caps the fraction of a crop one category may occupy.
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    # seg_pad_val=255 marks padded label pixels as ignore-index for the loss.
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=img_scale,
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    # RepeatDataset repeats the tiny STARE train split 40000x so one pass
    # supplies enough samples for iteration-based training.
    train=dict(
        type='RepeatDataset',
        times=40000,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            img_dir='images/training',
            ann_dir='annotations/training',
            pipeline=train_pipeline)),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline))
# model settings
# SegFormer with a MiT-B0 (MixVisionTransformer) backbone; base model config
# that dataset-specific experiment configs override.
norm_cfg = dict(type='SyncBN', requires_grad=True)  # SyncBN for multi-GPU training
model = dict(
    type='EncoderDecoder',
    pretrained=None,
    backbone=dict(
        type='MixVisionTransformer',
        in_channels=3,
        embed_dims=32,  # MiT-B0: stage widths 32/64/160/256 (embed_dims * num_heads)
        num_stages=4,
        num_layers=[2, 2, 2, 2],
        num_heads=[1, 2, 5, 8],
        patch_sizes=[7, 3, 3, 3],
        sr_ratios=[8, 4, 2, 1],  # spatial-reduction ratios of the efficient attention
        out_indices=(0, 1, 2, 3),
        mlp_ratio=4,
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.1),
    decode_head=dict(
        type='SegformerHead',
        in_channels=[32, 64, 160, 256],  # one input per backbone stage
        in_index=[0, 1, 2, 3],
        channels=256,
        dropout_ratio=0.1,
        num_classes=19,  # Cityscapes default; overridden by downstream configs
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
\ No newline at end of file
# optimizer
# SGD with polynomial learning-rate decay; shared 20k-iteration schedule.
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
# Iteration-based training: checkpoint and evaluate every 2k of 20k iters.
runner = dict(type='IterBasedRunner', max_iters=20000)
checkpoint_config = dict(by_epoch=False, interval=2000)
evaluation = dict(interval=2000, metric='mIoU', pre_eval=True)
# optimizer
# SGD with polynomial learning-rate decay; shared 320k-iteration schedule.
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
# Iteration-based training: checkpoint and evaluate every 32k of 320k iters.
runner = dict(type='IterBasedRunner', max_iters=320000)
checkpoint_config = dict(by_epoch=False, interval=32000)
# pre_eval=True added for consistency with the 20k/40k/80k schedules; it
# enables mmseg's memory-efficient progressive evaluation (same metrics).
evaluation = dict(interval=32000, metric='mIoU', pre_eval=True)
# optimizer
# SGD with polynomial learning-rate decay; shared 40k-iteration schedule.
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
# Iteration-based training: checkpoint and evaluate every 4k of 40k iters.
runner = dict(type='IterBasedRunner', max_iters=40000)
checkpoint_config = dict(by_epoch=False, interval=4000)
evaluation = dict(interval=4000, metric='mIoU', pre_eval=True)
# optimizer
# SGD with polynomial learning-rate decay; shared 80k-iteration schedule.
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
# Iteration-based training: checkpoint and evaluate every 8k of 80k iters.
runner = dict(type='IterBasedRunner', max_iters=80000)
checkpoint_config = dict(by_epoch=False, interval=8000)
evaluation = dict(interval=8000, metric='mIoU', pre_eval=True)
# ADE20K
Introduced by Zhou et al. in [Scene Parsing Through ADE20K Dataset](https://paperswithcode.com/paper/scene-parsing-through-ade20k-dataset).
The ADE20K semantic segmentation dataset contains more than 20K scene-centric images exhaustively annotated with pixel-level object and object-part labels. There are 150 semantic categories in total, covering stuff classes such as sky, road, and grass, as well as discrete objects such as person, car, and bed.
## Model Zoo
### UperNet + InternImage
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
|:--------------:|:----------:|:-----------:|:-----------:|:----------:|:-------:|:-----:|:-----:|:-------------------:|
| InternImage-T | 512x512 | 47.9 / 48.1 | 0.23s / iter | 10.5h | 59M | 944G | [config](./upernet_internimage_t_512_160k_ade20k.py) | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_t_512_160k_ade20k.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_t_512_160k_ade20k.log.json) |
| InternImage-S | 512x512 | 50.1 / 50.9 | 0.25s / iter | 11.5h | 80M | 1017G | [config](./upernet_internimage_s_512_160k_ade20k.py) | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_s_512_160k_ade20k.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_s_512_160k_ade20k.log.json) |
| InternImage-B | 512x512 | 50.8 / 51.3 | 0.26s / iter | 12h | 128M | 1185G | [config](./upernet_internimage_b_512_160k_ade20k.py) | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_b_512_160k_ade20k.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_b_512_160k_ade20k.log.json) |
| InternImage-L | 640x640 | 53.9 / 54.1 | 0.42s / iter | 19h | 256M | 2526G | [config](./upernet_internimage_l_640_160k_ade20k.py)| [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_l_640_160k_ade20k.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_l_640_160k_ade20k.log.json) |
| InternImage-XL | 640x640 | 55.0 / 55.3 | 0.47s / iter | 22h | 368M | 3142G | [config](./upernet_internimage_xl_640_160k_ade20k.py) | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_xl_640_160k_ade20k.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_xl_640_160k_ade20k.log.json) |
- Training speed is measured with A100 GPU.
- Please set `with_cp=True` to save memory if you meet `out-of-memory` issues.
- The logs are our recent newly trained ones. There are slight differences between the results in logs and our paper.
......@@ -50,8 +50,8 @@ test_pipeline = [
optimizer = dict(
_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
constructor='CustomLayerDecayOptimizerConstructor',
paramwise_cfg=dict(num_layers=30, layer_decay_rate=1.0,
depths=[4, 4, 18, 4]))
paramwise_cfg=dict(num_layers=33, layer_decay_rate=1.0,
depths=[4, 4, 21, 4]))
lr_config = dict(_delete_=True, policy='poly',
warmup='linear',
warmup_iters=1500,
......
# Cityscapes
Introduced by Cordts et al. in [The Cityscapes Dataset for Semantic Urban Scene Understanding](https://paperswithcode.com/paper/the-cityscapes-dataset-for-semantic-urban).
Cityscapes is a large-scale database which focuses on semantic understanding of urban street scenes. It provides semantic, instance-wise, and dense pixel annotations for 30 classes grouped into 8 categories (flat surfaces, humans, vehicles, constructions, objects, nature, sky, and void). The dataset consists of around 5000 finely annotated images and 20000 coarsely annotated ones. Data was captured in 50 cities over several months, at different times of day, and in good weather conditions. It was originally recorded as video, so the frames were manually selected to have the following features: a large number of dynamic objects, varying scene layout, and varying background.
## Model Zoo
### UperNet + InternImage
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
|:--------------:|:----------:|:------------:|:-----------:|:----------:|:-------:|:-----:|:----:|:----:|
| InternImage-T | 512x1024 | 82.58 / 83.40 | 0.32s / iter | 14.5h | 59M | 1889G | [config](./upernet_internimage_t_512x1024_160k_cityscapes.py) | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_t_512x1024_160k_cityscapes.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_t_512x1024_160k_cityscapes.log.json) |
| InternImage-S | 512x1024 | 82.74 / 83.45 | 0.36s / iter | 16.5h | 80M | 2035G | [config](./upernet_internimage_s_512x1024_160k_cityscapes.py) |[ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_s_512x1024_160k_cityscapes.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_s_512x1024_160k_cityscapes.log.json) |
| InternImage-B | 512x1024 | 83.18 / 83.97 | 0.39s / iter | 17h | 128M | 2369G | [config](./upernet_internimage_b_512x1024_160k_cityscapes.py) |[ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_b_512x1024_160k_cityscapes.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_b_512x1024_160k_cityscapes.log.json) |
| InternImage-L | 512x1024 | 83.68 / 84.41 | 0.50s / iter | 23h | 256M | 3234G | [config](./upernet_internimage_l_512x1024_160k_cityscapes.py) |[ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_l_512x1024_160k_cityscapes.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_l_512x1024_160k_cityscapes.log.json) |
| InternImage-XL | 512x1024 | 83.62 / 84.28 | 0.56s / iter | 26h | 368M | 4022G | [config](./upernet_internimage_xl_512x1024_160k_cityscapes.py) |[ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_xl_512x1024_160k_cityscapes.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_xl_512x1024_160k_cityscapes.log.json) |
- Training speed is measured with A100 GPU.
- Please set `with_cp=True` to save memory if you meet `out-of-memory` issues.
### UperNet + InternImage (with additional data)
Mapillary 80k + Cityscapes (w/ coarse data) 160k
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
|:--------------:|:----------:|:------------:|:-----------:|:-----------:|:-------:|:-----:|:------:|:------------:|
| InternImage-L | 512x1024 | 85.94 / 86.22 | 0.50s / iter | 23h | 256M | 3234G | [config](./upernet_internimage_l_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_l_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_l_512x1024_160k_mapillary2cityscapes.log.json) |
| InternImage-XL | 512x1024 | 86.20 / 86.42 | 0.56s / iter | 26h | 368M | 4022G | [config](./upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.log.json) |
### SegFormerHead + InternImage (with additional data)
Mapillary 80k + Cityscapes (w/ coarse data) 160k
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #params | FLOPs | Config | Download |
|:--------------:|:----------:|:------------:|:-----------:|:-----------:|:-------:|:-----:|:-----:|:---------:|
| InternImage-L | 512x1024 | 85.16 / 85.67 | 0.37s / iter | 17h | 220M | 1580G | [config](./segformer_internimage_l_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/segformer_internimage_l_512x1024_160k_mapillary2cityscapes.log.json) |
| InternImage-XL | 512x1024 | 85.41 / 85.93 | 0.43s / iter | 19.5h | 330M | 2364G | [config](./segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.py) | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.pth) \| [log](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/segformer_internimage_xl_512x1024_160k_mapillary2cityscapes.log.json) |
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# SegFormer head + InternImage-L backbone, fine-tuned for 160k iterations on
# Cityscapes with extra data, starting from a Mapillary-pretrained checkpoint.
_base_ = [
    '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/cityscapes_extra.py',
    '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
]
# Whole-model initialization from the 80k Mapillary run (not backbone-only).
load_from = 'https://github.com/OpenGVLab/InternImage/releases/download/seg_models/segformer_internimage_l_512x1024_80k_mapillary.pth'
model = dict(
    backbone=dict(
        _delete_=True,  # fully replace the MiT-B0 backbone from the base config
        type='InternImage',
        core_op='DCNv3',
        channels=160,
        depths=[5, 5, 22, 5],  # InternImage-L stage depths
        groups=[10, 20, 40, 80],
        mlp_ratio=4.,
        drop_path_rate=0.4,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=2.0,
        post_norm=True,
        with_cp=False,  # set True to save memory via activation checkpointing
        out_indices=(0, 1, 2, 3),
        init_cfg=None),  # weights come from load_from instead
    # NOTE(review): num_classes=150 while Cityscapes has 19 classes — it does
    # match the Mapillary checkpoint's head (also 150 in that config), so the
    # head weights load, but confirm this is intentional.
    decode_head=dict(num_classes=150, in_channels=[160, 320, 640, 1280]),
    test_cfg=dict(mode='whole'))
optimizer = dict(
    _delete_=True, type='AdamW', lr=0.00002, betas=(0.9, 0.999), weight_decay=0.05,
    constructor='CustomLayerDecayOptimizerConstructor',
    # num_layers = sum(depths); layer-wise lr decay of 0.94 toward the stem.
    paramwise_cfg=dict(num_layers=37, layer_decay_rate=0.94,
                       depths=[5, 5, 22, 5], offset_lr_scale=1.0))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
runner = dict(type='IterBasedRunner')
optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
evaluation = dict(interval=4000, metric='mIoU', save_best='mIoU')
# fp16 = dict(loss_scale=dict(init_scale=512))
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# SegFormer head + InternImage-L backbone, trained on Mapillary for 80k
# iterations; backbone initialized from ImageNet-22k classification weights.
_base_ = [
    '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/mapillary.py',
    '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py'
]
pretrained = 'https://github.com/OpenGVLab/InternImage/releases/download/cls_model/internimage_l_22k_192to384.pth'
model = dict(
    backbone=dict(
        _delete_=True,  # fully replace the MiT-B0 backbone from the base config
        type='InternImage',
        core_op='DCNv3',
        channels=160,
        depths=[5, 5, 22, 5],  # InternImage-L stage depths
        groups=[10, 20, 40, 80],
        mlp_ratio=4.,
        drop_path_rate=0.4,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=2.0,
        post_norm=True,
        with_cp=False,  # set True to save memory via activation checkpointing
        out_indices=(0, 1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    # NOTE(review): confirm num_classes=150 matches the label set produced by
    # '../_base_/datasets/mapillary.py' (not visible from here).
    decode_head=dict(num_classes=150, in_channels=[160, 320, 640, 1280]),
    test_cfg=dict(mode='whole'))
optimizer = dict(
    _delete_=True, type='AdamW', lr=0.00002, betas=(0.9, 0.999), weight_decay=0.05,
    constructor='CustomLayerDecayOptimizerConstructor',
    # num_layers = sum(depths); layer-wise lr decay of 0.94 toward the stem.
    paramwise_cfg=dict(num_layers=37, layer_decay_rate=0.94,
                       depths=[5, 5, 22, 5], offset_lr_scale=1.0))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
runner = dict(type='IterBasedRunner')
optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
evaluation = dict(interval=8000, metric='mIoU', save_best='mIoU')
# fp16 = dict(loss_scale=dict(init_scale=512))
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# SegFormer head + InternImage-XL backbone, fine-tuned for 160k iterations on
# Cityscapes with extra data, starting from a Mapillary-pretrained checkpoint.
_base_ = [
    '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/cityscapes_extra.py',
    '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
]
# Whole-model initialization from the 80k Mapillary run (not backbone-only).
load_from = 'https://github.com/OpenGVLab/InternImage/releases/download/seg_models/segformer_internimage_xl_512x1024_80k_mapillary.pth'
model = dict(
    backbone=dict(
        _delete_=True,  # fully replace the MiT-B0 backbone from the base config
        type='InternImage',
        core_op='DCNv3',
        channels=192,
        depths=[5, 5, 24, 5],  # InternImage-XL stage depths
        groups=[12, 24, 48, 96],
        mlp_ratio=4.,
        drop_path_rate=0.4,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=2.0,
        post_norm=True,
        with_cp=False,  # set True to save memory via activation checkpointing
        out_indices=(0, 1, 2, 3),
        init_cfg=None),  # weights come from load_from instead
    # NOTE(review): num_classes=150 while Cityscapes has 19 classes — it does
    # match the Mapillary checkpoint's head (also 150 in that config), so the
    # head weights load, but confirm this is intentional.
    decode_head=dict(num_classes=150, in_channels=[192, 384, 768, 1536]),
    test_cfg=dict(mode='whole'))
optimizer = dict(
    _delete_=True, type='AdamW', lr=0.00002, betas=(0.9, 0.999), weight_decay=0.05,
    constructor='CustomLayerDecayOptimizerConstructor',
    # num_layers = sum(depths); layer-wise lr decay of 0.94 toward the stem.
    paramwise_cfg=dict(num_layers=39, layer_decay_rate=0.94,
                       depths=[5, 5, 24, 5], offset_lr_scale=1.0))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
runner = dict(type='IterBasedRunner')
optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
evaluation = dict(interval=4000, metric='mIoU', save_best='mIoU')
# fp16 = dict(loss_scale=dict(init_scale=512))
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# SegFormer head + InternImage-XL backbone, trained on Mapillary for 80k
# iterations; backbone initialized from ImageNet-22k classification weights.
_base_ = [
    '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/mapillary.py',
    '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py'
]
pretrained = 'https://github.com/OpenGVLab/InternImage/releases/download/cls_model/internimage_xl_22k_192to384.pth'
model = dict(
    backbone=dict(
        _delete_=True,  # fully replace the MiT-B0 backbone from the base config
        type='InternImage',
        core_op='DCNv3',
        channels=192,
        depths=[5, 5, 24, 5],  # InternImage-XL stage depths
        groups=[12, 24, 48, 96],
        mlp_ratio=4.,
        drop_path_rate=0.4,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=2.0,
        post_norm=True,
        with_cp=False,  # set True to save memory via activation checkpointing
        out_indices=(0, 1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    # NOTE(review): confirm num_classes=150 matches the label set produced by
    # '../_base_/datasets/mapillary.py' (not visible from here).
    decode_head=dict(num_classes=150, in_channels=[192, 384, 768, 1536]),
    test_cfg=dict(mode='whole'))
optimizer = dict(
    _delete_=True, type='AdamW', lr=0.00002, betas=(0.9, 0.999), weight_decay=0.05,
    constructor='CustomLayerDecayOptimizerConstructor',
    # num_layers = sum(depths); layer-wise lr decay of 0.94 toward the stem.
    paramwise_cfg=dict(num_layers=39, layer_decay_rate=0.94,
                       depths=[5, 5, 24, 5], offset_lr_scale=1.0))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
runner = dict(type='IterBasedRunner')
optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
evaluation = dict(interval=8000, metric='mIoU', save_best='mIoU')
# fp16 = dict(loss_scale=dict(init_scale=512))
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# UperNet + InternImage-B backbone, trained on Cityscapes for 160k iterations;
# backbone initialized from ImageNet-1k classification weights.
_base_ = [
    '../_base_/models/upernet_r50.py', '../_base_/datasets/cityscapes.py',
    '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
]
pretrained = 'https://github.com/OpenGVLab/InternImage/releases/download/cls_model/internimage_b_1k_224.pth'
model = dict(
    backbone=dict(
        _delete_=True,  # fully replace the ResNet-50 backbone from the base config
        type='InternImage',
        core_op='DCNv3',
        channels=112,
        depths=[4, 4, 21, 4],  # InternImage-B stage depths
        groups=[7, 14, 28, 56],
        mlp_ratio=4.,
        drop_path_rate=0.4,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=1.0,
        post_norm=True,
        with_cp=False,  # set True to save memory via activation checkpointing
        out_indices=(0, 1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    # NOTE(review): num_classes=150 is the ADE20K class count; Cityscapes has
    # 19 classes — confirm this is intentional.
    decode_head=dict(num_classes=150, in_channels=[112, 224, 448, 896]),
    auxiliary_head=dict(num_classes=150, in_channels=448),  # taps stage 3 (448 ch)
    test_cfg=dict(mode='whole')
)
optimizer = dict(
    _delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
    constructor='CustomLayerDecayOptimizerConstructor',
    # num_layers = sum(depths); layer_decay_rate=1.0 disables layer-wise decay.
    paramwise_cfg=dict(num_layers=33, layer_decay_rate=1.0,
                       depths=[4, 4, 21, 4]))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)
# By default, models are trained on 8 GPUs with 2 images per GPU
data=dict(samples_per_gpu=2)
runner = dict(type='IterBasedRunner')
# NOTE(review): unlike the sibling configs, no optimizer_config with
# grad_clip is set here — confirm gradient clipping is intentionally omitted.
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
evaluation = dict(interval=16000, metric='mIoU', save_best='mIoU')
# fp16 = dict(loss_scale=dict(init_scale=512))
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# UperNet + InternImage-L backbone, trained on Cityscapes for 160k iterations;
# backbone initialized from ImageNet-22k classification weights.
_base_ = [
    '../_base_/models/upernet_r50.py', '../_base_/datasets/cityscapes.py',
    '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
]
pretrained = 'https://github.com/OpenGVLab/InternImage/releases/download/cls_model/internimage_l_22k_192to384.pth'
model = dict(
    backbone=dict(
        _delete_=True,  # fully replace the ResNet-50 backbone from the base config
        type='InternImage',
        core_op='DCNv3',
        channels=160,
        depths=[5, 5, 22, 5],  # InternImage-L stage depths
        groups=[10, 20, 40, 80],
        mlp_ratio=4.,
        drop_path_rate=0.4,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=2.0,
        post_norm=True,
        with_cp=False,  # set True to save memory via activation checkpointing
        out_indices=(0, 1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    # NOTE(review): num_classes=150 is the ADE20K class count; Cityscapes has
    # 19 classes — confirm this is intentional.
    decode_head=dict(num_classes=150, in_channels=[160, 320, 640, 1280]),
    auxiliary_head=dict(num_classes=150, in_channels=640),  # taps stage 3 (640 ch)
    test_cfg=dict(mode='whole'))
optimizer = dict(
    _delete_=True, type='AdamW', lr=0.00002, betas=(0.9, 0.999), weight_decay=0.05,
    constructor='CustomLayerDecayOptimizerConstructor',
    # num_layers = sum(depths); layer-wise lr decay of 0.94 toward the stem.
    paramwise_cfg=dict(num_layers=37, layer_decay_rate=0.94,
                       depths=[5, 5, 22, 5], offset_lr_scale=1.0))
lr_config = dict(_delete_=True, policy='poly',
                 warmup='linear',
                 warmup_iters=1500,
                 warmup_ratio=1e-6,
                 power=1.0, min_lr=0.0, by_epoch=False)
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2)
runner = dict(type='IterBasedRunner')
optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
evaluation = dict(interval=16000, metric='mIoU', save_best='mIoU')
# fp16 = dict(loss_scale=dict(init_scale=512))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment