[release] Add configs and models for cityscapes (#31)

* add _base_ files * support mapillary dataset * reorganize ade20k configs * move figs to docs * release configs and models of cityscapes * update README.md * update load_from urls

[release] Add configs and models for cityscapes (#31)
* add _base_ files * support mapillary dataset * reorganize ade20k configs * move figs to docs * release configs and models of cityscapes * update README.md * update load_from urls
0f94318a · Zhe Chen · zhe chen · 0f982e7a · 0f94318a · 0f94318a
Commit 0f94318a authored Mar 14, 2023 by Zhe Chen Committed by zhe chen Mar 14, 2023
15 changed files
--- a/segmentation/configs/cityscapes/upernet_internimage_l_512x1024_160k_mapillary2cityscapes.py
+++ b/segmentation/configs/cityscapes/upernet_internimage_l_512x1024_160k_mapillary2cityscapes.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+# Base configs to inherit from; every key assigned below overrides the
+# inherited value.
+_base_ = [
+    '../_base_/models/upernet_r50.py',
+    '../_base_/datasets/cityscapes_extra.py',
+    '../_base_/default_runtime.py',
+    '../_base_/schedules/schedule_160k.py',
+]
+# Initialise from the released 80k Mapillary checkpoint of the same model.
+# The backbone sets ``init_cfg=None`` below, presumably because this
+# checkpoint already supplies all weights.
+load_from = 'https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_l_512x1024_80k_mapillary.pth'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='InternImage',
+        core_op='DCNv3',
+        channels=160,
+        depths=[5, 5, 22, 5],
+        groups=[10, 20, 40, 80],
+        mlp_ratio=4.0,
+        drop_path_rate=0.4,
+        norm_layer='LN',
+        layer_scale=1.0,
+        offset_scale=2.0,
+        post_norm=True,
+        with_cp=False,
+        out_indices=(0, 1, 2, 3),
+        init_cfg=None,
+    ),
+    # Head input widths are channels * (1, 2, 4, 8).
+    decode_head=dict(
+        num_classes=150,
+        in_channels=[160, 320, 640, 1280],
+    ),
+    auxiliary_head=dict(
+        num_classes=150,
+        in_channels=640,
+    ),
+    test_cfg=dict(mode='whole'),
+)
+optimizer = dict(
+    _delete_=True,
+    type='AdamW',
+    lr=2e-5,
+    betas=(0.9, 0.999),
+    weight_decay=0.05,
+    constructor='CustomLayerDecayOptimizerConstructor',
+    paramwise_cfg=dict(
+        num_layers=37,  # equals sum(depths)
+        layer_decay_rate=0.94,
+        depths=[5, 5, 22, 5],
+        offset_lr_scale=1.0,
+    ),
+)
+lr_config = dict(
+    _delete_=True,
+    policy='poly',
+    warmup='linear',
+    warmup_iters=1500,
+    warmup_ratio=1e-6,
+    power=1.0,
+    min_lr=0.0,
+    by_epoch=False,
+)
+# 2 images per GPU; the reference setup trains on 8 GPUs.
+data = dict(samples_per_gpu=2)
+runner = dict(type='IterBasedRunner')
+optimizer_config = dict(
+    _delete_=True,
+    grad_clip=dict(max_norm=0.1, norm_type=2),
+)
+checkpoint_config = dict(
+    by_epoch=False,
+    interval=1000,
+    max_keep_ckpts=1,
+)
+evaluation = dict(
+    interval=4000,
+    metric='mIoU',
+    save_best='mIoU',
+)
+# fp16 = dict(loss_scale=dict(init_scale=512))
--- a/segmentation/configs/cityscapes/upernet_internimage_l_512x1024_80k_mapillary.py
+++ b/segmentation/configs/cityscapes/upernet_internimage_l_512x1024_80k_mapillary.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+# Base configs to inherit from; every key assigned below overrides the
+# inherited value.
+_base_ = [
+    '../_base_/models/upernet_r50.py',
+    '../_base_/datasets/mapillary.py',
+    '../_base_/default_runtime.py',
+    '../_base_/schedules/schedule_80k.py',
+]
+# Classification checkpoint used to initialise the backbone (see init_cfg).
+pretrained = 'https://github.com/OpenGVLab/InternImage/releases/download/cls_model/internimage_l_22k_192to384.pth'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='InternImage',
+        core_op='DCNv3',
+        channels=160,
+        depths=[5, 5, 22, 5],
+        groups=[10, 20, 40, 80],
+        mlp_ratio=4.0,
+        drop_path_rate=0.4,
+        norm_layer='LN',
+        layer_scale=1.0,
+        offset_scale=2.0,
+        post_norm=True,
+        with_cp=False,
+        out_indices=(0, 1, 2, 3),
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained),
+    ),
+    # Head input widths are channels * (1, 2, 4, 8).
+    decode_head=dict(
+        num_classes=150,
+        in_channels=[160, 320, 640, 1280],
+    ),
+    auxiliary_head=dict(
+        num_classes=150,
+        in_channels=640,
+    ),
+    test_cfg=dict(mode='whole'),
+)
+optimizer = dict(
+    _delete_=True,
+    type='AdamW',
+    lr=2e-5,
+    betas=(0.9, 0.999),
+    weight_decay=0.05,
+    constructor='CustomLayerDecayOptimizerConstructor',
+    paramwise_cfg=dict(
+        num_layers=37,  # equals sum(depths)
+        layer_decay_rate=0.94,
+        depths=[5, 5, 22, 5],
+        offset_lr_scale=1.0,
+    ),
+)
+lr_config = dict(
+    _delete_=True,
+    policy='poly',
+    warmup='linear',
+    warmup_iters=1500,
+    warmup_ratio=1e-6,
+    power=1.0,
+    min_lr=0.0,
+    by_epoch=False,
+)
+# 2 images per GPU; the reference setup trains on 8 GPUs.
+data = dict(samples_per_gpu=2)
+runner = dict(type='IterBasedRunner')
+optimizer_config = dict(
+    _delete_=True,
+    grad_clip=dict(max_norm=0.1, norm_type=2),
+)
+checkpoint_config = dict(
+    by_epoch=False,
+    interval=1000,
+    max_keep_ckpts=1,
+)
+evaluation = dict(
+    interval=8000,
+    metric='mIoU',
+    save_best='mIoU',
+)
+# fp16 = dict(loss_scale=dict(init_scale=512))
--- a/segmentation/configs/cityscapes/upernet_internimage_s_512x1024_160k_cityscapes.py
+++ b/segmentation/configs/cityscapes/upernet_internimage_s_512x1024_160k_cityscapes.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+# Base configs to inherit from; every key assigned below overrides the
+# inherited value.
+_base_ = [
+    '../_base_/models/upernet_r50.py', '../_base_/datasets/cityscapes.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
+]
+# Classification checkpoint used to initialise the backbone (see init_cfg).
+pretrained = 'https://github.com/OpenGVLab/InternImage/releases/download/cls_model/internimage_s_1k_224.pth'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='InternImage',
+        core_op='DCNv3',
+        channels=80,
+        depths=[4, 4, 21, 4],
+        groups=[5, 10, 20, 40],
+        mlp_ratio=4.,
+        drop_path_rate=0.3,
+        norm_layer='LN',
+        layer_scale=1.0,
+        offset_scale=1.0,
+        post_norm=True,
+        with_cp=False,
+        # Stated explicitly, as in the sibling configs: the decode head below
+        # lists four input widths, so all four stage outputs are required.
+        out_indices=(0, 1, 2, 3),
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    # Head input widths are channels * (1, 2, 4, 8).
+    decode_head=dict(num_classes=150, in_channels=[80, 160, 320, 640]),
+    auxiliary_head=dict(num_classes=150, in_channels=320),
+    test_cfg=dict(mode='whole')
+)
+optimizer = dict(
+    _delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.05,
+    constructor='CustomLayerDecayOptimizerConstructor',
+    # num_layers equals sum(depths); layer_decay_rate=1.0 applies no decay.
+    paramwise_cfg=dict(num_layers=33, layer_decay_rate=1.0,
+                       depths=[4, 4, 21, 4]))
+lr_config = dict(_delete_=True, policy='poly',
+                 warmup='linear',
+                 warmup_iters=1500,
+                 warmup_ratio=1e-6,
+                 power=1.0, min_lr=0.0, by_epoch=False)
+# By default, models are trained on 8 GPUs with 2 images per GPU
+data = dict(samples_per_gpu=2)
+runner = dict(type='IterBasedRunner')
+checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
+evaluation = dict(interval=16000, metric='mIoU', save_best='mIoU')
+# fp16 = dict(loss_scale=dict(init_scale=512))
--- a/segmentation/configs/cityscapes/upernet_internimage_t_512x1024_160k_cityscapes.py
+++ b/segmentation/configs/cityscapes/upernet_internimage_t_512x1024_160k_cityscapes.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+# Base configs to inherit from; every key assigned below overrides the
+# inherited value.
+_base_ = [
+    '../_base_/models/upernet_r50.py',
+    '../_base_/datasets/cityscapes.py',
+    '../_base_/default_runtime.py',
+    '../_base_/schedules/schedule_160k.py',
+]
+# Classification checkpoint used to initialise the backbone (see init_cfg).
+pretrained = 'https://github.com/OpenGVLab/InternImage/releases/download/cls_model/internimage_t_1k_224.pth'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='InternImage',
+        core_op='DCNv3',
+        channels=64,
+        depths=[4, 4, 18, 4],
+        groups=[4, 8, 16, 32],
+        mlp_ratio=4.0,
+        drop_path_rate=0.2,
+        norm_layer='LN',
+        layer_scale=1.0,
+        offset_scale=1.0,
+        post_norm=False,
+        with_cp=False,
+        out_indices=(0, 1, 2, 3),
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained),
+    ),
+    # Head input widths are channels * (1, 2, 4, 8).
+    decode_head=dict(
+        num_classes=150,
+        in_channels=[64, 128, 256, 512],
+    ),
+    auxiliary_head=dict(
+        num_classes=150,
+        in_channels=256,
+    ),
+    test_cfg=dict(mode='whole'),
+)
+optimizer = dict(
+    _delete_=True,
+    type='AdamW',
+    lr=6e-5,
+    betas=(0.9, 0.999),
+    weight_decay=0.05,
+    constructor='CustomLayerDecayOptimizerConstructor',
+    paramwise_cfg=dict(
+        num_layers=30,  # equals sum(depths)
+        layer_decay_rate=1.0,
+        depths=[4, 4, 18, 4],
+    ),
+)
+lr_config = dict(
+    _delete_=True,
+    policy='poly',
+    warmup='linear',
+    warmup_iters=1500,
+    warmup_ratio=1e-6,
+    power=1.0,
+    min_lr=0.0,
+    by_epoch=False,
+)
+# 2 images per GPU; the reference setup trains on 8 GPUs.
+data = dict(samples_per_gpu=2)
+runner = dict(type='IterBasedRunner')
+checkpoint_config = dict(
+    by_epoch=False,
+    interval=1000,
+    max_keep_ckpts=1,
+)
+evaluation = dict(
+    interval=16000,
+    metric='mIoU',
+    save_best='mIoU',
+)
+# fp16 = dict(loss_scale=dict(init_scale=512))
--- a/segmentation/configs/cityscapes/upernet_internimage_xl_512x1024_160k_cityscapes.py
+++ b/segmentation/configs/cityscapes/upernet_internimage_xl_512x1024_160k_cityscapes.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+# Base configs to inherit from; every key assigned below overrides the
+# inherited value.
+_base_ = [
+    '../_base_/models/upernet_r50.py',
+    '../_base_/datasets/cityscapes.py',
+    '../_base_/default_runtime.py',
+    '../_base_/schedules/schedule_160k.py',
+]
+# Classification checkpoint used to initialise the backbone (see init_cfg).
+pretrained = 'https://github.com/OpenGVLab/InternImage/releases/download/cls_model/internimage_xl_22k_192to384.pth'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='InternImage',
+        core_op='DCNv3',
+        channels=192,
+        depths=[5, 5, 24, 5],
+        groups=[12, 24, 48, 96],
+        mlp_ratio=4.0,
+        drop_path_rate=0.4,
+        norm_layer='LN',
+        layer_scale=1.0,
+        offset_scale=2.0,
+        post_norm=True,
+        with_cp=False,
+        out_indices=(0, 1, 2, 3),
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained),
+    ),
+    # Head input widths are channels * (1, 2, 4, 8).
+    decode_head=dict(
+        num_classes=150,
+        in_channels=[192, 384, 768, 1536],
+    ),
+    auxiliary_head=dict(
+        num_classes=150,
+        in_channels=768,
+    ),
+    test_cfg=dict(mode='whole'),
+)
+optimizer = dict(
+    _delete_=True,
+    type='AdamW',
+    lr=2e-5,
+    betas=(0.9, 0.999),
+    weight_decay=0.05,
+    constructor='CustomLayerDecayOptimizerConstructor',
+    paramwise_cfg=dict(
+        num_layers=39,  # equals sum(depths)
+        layer_decay_rate=0.94,
+        depths=[5, 5, 24, 5],
+        offset_lr_scale=1.0,
+    ),
+)
+lr_config = dict(
+    _delete_=True,
+    policy='poly',
+    warmup='linear',
+    warmup_iters=1500,
+    warmup_ratio=1e-6,
+    power=1.0,
+    min_lr=0.0,
+    by_epoch=False,
+)
+# 2 images per GPU; the reference setup trains on 8 GPUs.
+data = dict(samples_per_gpu=2)
+runner = dict(type='IterBasedRunner')
+optimizer_config = dict(
+    _delete_=True,
+    grad_clip=dict(max_norm=0.1, norm_type=2),
+)
+checkpoint_config = dict(
+    by_epoch=False,
+    interval=1000,
+    max_keep_ckpts=1,
+)
+evaluation = dict(
+    interval=16000,
+    metric='mIoU',
+    save_best='mIoU',
+)
+# fp16 = dict(loss_scale=dict(init_scale=512))
--- a/segmentation/configs/cityscapes/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.py
+++ b/segmentation/configs/cityscapes/upernet_internimage_xl_512x1024_160k_mapillary2cityscapes.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+# Base configs to inherit from; every key assigned below overrides the
+# inherited value.
+_base_ = [
+    '../_base_/models/upernet_r50.py',
+    '../_base_/datasets/cityscapes_extra.py',
+    '../_base_/default_runtime.py',
+    '../_base_/schedules/schedule_160k.py',
+]
+# Initialise from the released 80k Mapillary checkpoint of the same model.
+# The backbone sets ``init_cfg=None`` below, presumably because this
+# checkpoint already supplies all weights.
+load_from = 'https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_xl_512x1024_80k_mapillary.pth'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='InternImage',
+        core_op='DCNv3',
+        channels=192,
+        depths=[5, 5, 24, 5],
+        groups=[12, 24, 48, 96],
+        mlp_ratio=4.0,
+        drop_path_rate=0.4,
+        norm_layer='LN',
+        layer_scale=1.0,
+        offset_scale=2.0,
+        post_norm=True,
+        with_cp=False,
+        out_indices=(0, 1, 2, 3),
+        init_cfg=None,
+    ),
+    # Head input widths are channels * (1, 2, 4, 8).
+    decode_head=dict(
+        num_classes=150,
+        in_channels=[192, 384, 768, 1536],
+    ),
+    auxiliary_head=dict(
+        num_classes=150,
+        in_channels=768,
+    ),
+    test_cfg=dict(mode='whole'),
+)
+optimizer = dict(
+    _delete_=True,
+    type='AdamW',
+    lr=2e-5,
+    betas=(0.9, 0.999),
+    weight_decay=0.05,
+    constructor='CustomLayerDecayOptimizerConstructor',
+    paramwise_cfg=dict(
+        num_layers=39,  # equals sum(depths)
+        layer_decay_rate=0.94,
+        depths=[5, 5, 24, 5],
+        offset_lr_scale=1.0,
+    ),
+)
+lr_config = dict(
+    _delete_=True,
+    policy='poly',
+    warmup='linear',
+    warmup_iters=1500,
+    warmup_ratio=1e-6,
+    power=1.0,
+    min_lr=0.0,
+    by_epoch=False,
+)
+# 2 images per GPU; the reference setup trains on 8 GPUs.
+data = dict(samples_per_gpu=2)
+runner = dict(type='IterBasedRunner')
+optimizer_config = dict(
+    _delete_=True,
+    grad_clip=dict(max_norm=0.1, norm_type=2),
+)
+checkpoint_config = dict(
+    by_epoch=False,
+    interval=1000,
+    max_keep_ckpts=1,
+)
+evaluation = dict(
+    interval=4000,
+    metric='mIoU',
+    save_best='mIoU',
+)
+# fp16 = dict(loss_scale=dict(init_scale=512))
--- a/segmentation/configs/cityscapes/upernet_internimage_xl_512x1024_80k_mapillary.py
+++ b/segmentation/configs/cityscapes/upernet_internimage_xl_512x1024_80k_mapillary.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+# Base configs to inherit from; every key assigned below overrides the
+# inherited value.
+_base_ = [
+    '../_base_/models/upernet_r50.py',
+    '../_base_/datasets/mapillary.py',
+    '../_base_/default_runtime.py',
+    '../_base_/schedules/schedule_80k.py',
+]
+# Classification checkpoint used to initialise the backbone (see init_cfg).
+pretrained = 'https://github.com/OpenGVLab/InternImage/releases/download/cls_model/internimage_xl_22k_192to384.pth'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='InternImage',
+        core_op='DCNv3',
+        channels=192,
+        depths=[5, 5, 24, 5],
+        groups=[12, 24, 48, 96],
+        mlp_ratio=4.0,
+        drop_path_rate=0.4,
+        norm_layer='LN',
+        layer_scale=1.0,
+        offset_scale=2.0,
+        post_norm=True,
+        with_cp=False,
+        out_indices=(0, 1, 2, 3),
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained),
+    ),
+    # Head input widths are channels * (1, 2, 4, 8).
+    decode_head=dict(
+        num_classes=150,
+        in_channels=[192, 384, 768, 1536],
+    ),
+    auxiliary_head=dict(
+        num_classes=150,
+        in_channels=768,
+    ),
+    test_cfg=dict(mode='whole'),
+)
+optimizer = dict(
+    _delete_=True,
+    type='AdamW',
+    lr=2e-5,
+    betas=(0.9, 0.999),
+    weight_decay=0.05,
+    constructor='CustomLayerDecayOptimizerConstructor',
+    paramwise_cfg=dict(
+        num_layers=39,  # equals sum(depths)
+        layer_decay_rate=0.94,
+        depths=[5, 5, 24, 5],
+        offset_lr_scale=1.0,
+    ),
+)
+lr_config = dict(
+    _delete_=True,
+    policy='poly',
+    warmup='linear',
+    warmup_iters=1500,
+    warmup_ratio=1e-6,
+    power=1.0,
+    min_lr=0.0,
+    by_epoch=False,
+)
+# 2 images per GPU; the reference setup trains on 8 GPUs.
+data = dict(samples_per_gpu=2)
+runner = dict(type='IterBasedRunner')
+optimizer_config = dict(
+    _delete_=True,
+    grad_clip=dict(max_norm=0.1, norm_type=2),
+)
+checkpoint_config = dict(
+    by_epoch=False,
+    interval=1000,
+    max_keep_ckpts=1,
+)
+evaluation = dict(
+    interval=8000,
+    metric='mIoU',
+    save_best='mIoU',
+)
+# fp16 = dict(loss_scale=dict(init_scale=512))
--- a/segmentation/configs/upernet/README.md
+++ b/segmentation/configs/upernet/README.md
-# UPerNet
-[Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/pdf/1807.10221.pdf)
-## Introduction
-<!-- [ABSTRACT] -->
-Humans recognize the visual world at multiple levels: we effortlessly categorize scenes and detect objects inside, while also identifying the textures and surfaces of the objects along with their different compositional parts. In this paper, we study a new task called Unified Perceptual Parsing, which requires the machine vision systems to recognize as many visual concepts as possible from a given image. A multi-task framework called UPerNet and a training strategy are developed to learn from heterogeneous image annotations. We benchmark our framework on Unified Perceptual Parsing and show that it is able to effectively segment a wide range of concepts from images. The trained networks are further applied to discover visual knowledge in natural scenes. Models are available at [this https URL](https://github.com/CSAILVision/unifiedparsing).
-<!-- [IMAGE] -->
-<div align=center>
-<img src="https://user-images.githubusercontent.com/24582831/142903077-44e8e0da-7276-4bda-bd2b-0df1680ca845.png" width="70%"/>
-</div>
-## Results and models
-### ADE20K
-**ADE20K Semantic Segmentation**
-|    backbone    | resolution | single scale | multi scale | #params | FLOPs | Download | 
-| :------------: | :--------: | :----------: | :---------: | :-----: | :---: |   :---:  |
-| InternImage-T  |  512x512   |     47.9     |    48.1     |   59M   | 944G  | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_t_512_160k_ade20k.pth) \| [cfg](./upernet_internimage_t_512_160k_ade20k.py) |
-| InternImage-S  |  512x512   |     50.1     |    50.9     |   80M   | 1017G | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_s_512_160k_ade20k.pth) \| [cfg](./upernet_internimage_s_512_160k_ade20k.py) |
-| InternImage-B  |  512x512   |     50.8     |    51.3     |  128M   | 1185G | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_b_512_160k_ade20k.pth) \| [cfg](./upernet_internimage_b_512_160k_ade20k.py) |
-| InternImage-L  |  640x640   |     53.9     |    54.1     |  256M   | 2526G | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_l_640_160k_ade20k.pth) \| [cfg](./upernet_internimage_l_640_160k_ade20k.py) |
-| InternImage-XL |  640x640   |     55.0     |    55.3     |  368M   | 3142G | [ckpt](https://github.com/OpenGVLab/InternImage/releases/download/seg_models/upernet_internimage_xl_640_160k_ade20k.pth) \| [cfg](./upernet_internimage_xl_640_160k_ade20k.py) |
--- a/segmentation/mmseg_custom/__init__.py
+++ b/segmentation/mmseg_custom/__init__.py
@@ -5,3 +5,4 @@
 # --------------------------------------------------------
 from .models import *  # noqa: F401,F403
+from .datasets import *  # noqa: F401,F403
\ No newline at end of file
--- a/segmentation/mmseg_custom/datasets/__init__.py
+++ b/segmentation/mmseg_custom/datasets/__init__.py
+# Copyright (c) OpenMMLab. All rights reserved.
+from .mapillary import MapillaryDataset  # noqa: F401,F403
+from .nyu_depth_v2 import NYUDepthV2Dataset  # noqa: F401,F403
+from .pipelines import *  # noqa: F401,F403
--- a/segmentation/mmseg_custom/datasets/mapillary.py
+++ b/segmentation/mmseg_custom/datasets/mapillary.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from mmseg.datasets.builder import DATASETS
+from mmseg.datasets.custom import CustomDataset
+@DATASETS.register_module()
+class MapillaryDataset(CustomDataset):
+    """Mapillary dataset.
+
+    Declares 66 class names with a matching 66-colour palette and fixes the
+    file conventions handed to ``CustomDataset``: ``.jpg`` images, ``.png``
+    segmentation maps and ``reduce_zero_label=False``. All remaining
+    keyword arguments are forwarded unchanged.
+    """
+    # Class names; the position in the tuple pairs each name with the colour
+    # at the same position in PALETTE.
+    CLASSES = ('Bird', 'Ground Animal', 'Curb', 'Fence', 'Guard Rail', 'Barrier',
+               'Wall', 'Bike Lane', 'Crosswalk - Plain', 'Curb Cut', 'Parking', 'Pedestrian Area',
+               'Rail Track', 'Road', 'Service Lane', 'Sidewalk', 'Bridge', 'Building', 'Tunnel',
+               'Person', 'Bicyclist', 'Motorcyclist', 'Other Rider', 'Lane Marking - Crosswalk',
+               'Lane Marking - General', 'Mountain', 'Sand', 'Sky', 'Snow', 'Terrain', 'Vegetation',
+               'Water', 'Banner', 'Bench', 'Bike Rack', 'Billboard', 'Catch Basin', 'CCTV Camera',
+               'Fire Hydrant', 'Junction Box', 'Mailbox', 'Manhole', 'Phone Booth', 'Pothole',
+               'Street Light', 'Pole', 'Traffic Sign Frame', 'Utility Pole', 'Traffic Light',
+               'Traffic Sign (Back)', 'Traffic Sign (Front)', 'Trash Can', 'Bicycle', 'Boat',
+               'Bus', 'Car', 'Caravan', 'Motorcycle', 'On Rails', 'Other Vehicle', 'Trailer',
+               'Truck', 'Wheeled Slow', 'Car Mount', 'Ego Vehicle', 'Unlabeled')
+    # One three-component colour per class, in CLASSES order.
+    PALETTE = [[165, 42, 42], [0, 192, 0], [196, 196, 196], [190, 153, 153],
+               [180, 165, 180], [90, 120, 150], [102, 102, 156], [128, 64, 255],
+               [140, 140, 200], [170, 170, 170], [250, 170, 160], [96, 96, 96],
+               [230, 150, 140], [128, 64, 128], [110, 110, 110], [244, 35, 232],
+               [150, 100, 100], [70, 70, 70], [150, 120, 90], [220, 20, 60],
+               [255, 0, 0], [255, 0, 100], [255, 0, 200], [200, 128, 128],
+               [255, 255, 255], [64, 170, 64], [230, 160, 50], [70, 130, 180],
+               [190, 255, 255], [152, 251, 152], [107, 142, 35], [0, 170, 30],
+               [255, 255, 128], [250, 0, 30], [100, 140, 180], [220, 220, 220],
+               [220, 128, 128], [222, 40, 40], [100, 170, 30], [40, 40, 40],
+               [33, 33, 33], [100, 128, 160], [142, 0, 0], [70, 100, 150],
+               [210, 170, 100], [153, 153, 153], [128, 128, 128], [0, 0, 80],
+               [250, 170, 30], [192, 192, 192], [220, 220, 0], [140, 140, 20],
+               [119, 11, 32], [150, 0, 255], [0, 60, 100], [0, 0, 142], [0, 0, 90],
+               [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], [0, 0, 70],
+               [0, 0, 192], [32, 32, 32], [120, 10, 10], [0, 0, 0]]
+    def __init__(self, **kwargs):
+        """Build the dataset with this dataset's fixed suffixes.
+
+        Args:
+            **kwargs: Forwarded to ``CustomDataset.__init__``. Passing
+                ``img_suffix``, ``seg_map_suffix`` or ``reduce_zero_label``
+                here collides with the fixed values and raises ``TypeError``.
+        """
+        super().__init__(
+            img_suffix='.jpg',
+            seg_map_suffix='.png',
+            reduce_zero_label=False,
+            **kwargs)
\ No newline at end of file
--- a/segmentation/mmseg_custom/datasets/nyu_depth_v2.py
+++ b/segmentation/mmseg_custom/datasets/nyu_depth_v2.py
+# --------------------------------------------------------
+# InternImage
+# Copyright (c) 2022 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from mmseg.datasets.builder import DATASETS
+from mmseg.datasets.custom import CustomDataset
+@DATASETS.register_module()
+class NYUDepthV2Dataset(CustomDataset):
+    """NYU Depth V2 dataset.
+
+    Declares 40 class names with a matching 40-colour palette and fixes the
+    file conventions handed to ``CustomDataset``: ``.png`` images, ``.png``
+    segmentation maps and ``reduce_zero_label=True``.
+    """
+    # Class names; the position in the tuple pairs each name with the colour
+    # at the same position in PALETTE. The names are runtime labels and are
+    # kept exactly as released (including the 'refridgerator' spelling).
+    CLASSES = ('wall', 'floor', 'cabinet', 'bed', 'chair',
+               'sofa', 'table', 'door', 'window', 'bookshelf',
+               'picture', 'counter', 'blinds', 'desk', 'shelves',
+               'curtain', 'dresser', 'pillow', 'mirror', 'floor mat',
+               'clothes', 'ceiling', 'books', 'refridgerator', 'television',
+               'paper', 'towel', 'shower curtain', 'box', 'whiteboard',
+               'person', 'night stand', 'toilet', 'sink', 'lamp',
+               'bathtub', 'bag', 'otherstructure', 'otherfurniture', 'otherprop')
+    # One three-component colour per class, in CLASSES order.
+    PALETTE = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
+               [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
+               [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
+               [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
+               [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
+               [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
+               [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
+               [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
+               [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
+               [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],]
+    def __init__(self, split, **kwargs):
+        """Build the dataset with this dataset's fixed suffixes.
+
+        Args:
+            split: Required; forwarded as ``split`` to
+                ``CustomDataset.__init__``.
+            **kwargs: Forwarded to ``CustomDataset.__init__``. Passing
+                ``img_suffix``, ``seg_map_suffix`` or ``reduce_zero_label``
+                here collides with the fixed values and raises ``TypeError``.
+        """
+        super().__init__(
+            img_suffix='.png',
+            seg_map_suffix='.png',
+            split=split,
+            reduce_zero_label=True,
+            **kwargs)
\ No newline at end of file
--- a/segmentation/mmseg_custom/datasets/pipelines/__init__.py
+++ b/segmentation/mmseg_custom/datasets/pipelines/__init__.py
+# Copyright (c) OpenMMLab. All rights reserved.
+from .formatting import DefaultFormatBundle, ToMask
+from .transform import MapillaryHack, PadShortSide, SETR_Resize
+__all__ = [
+    'DefaultFormatBundle', 'ToMask', 'SETR_Resize',
+    'PadShortSide', 'MapillaryHack'
+]
--- a/segmentation/mmseg_custom/datasets/pipelines/formatting.py
+++ b/segmentation/mmseg_custom/datasets/pipelines/formatting.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from mmcv.parallel import DataContainer as DC
+from mmseg.datasets.builder import PIPELINES
+from mmseg.datasets.pipelines.formatting import to_tensor
+# force=True is passed so that registration succeeds even if a pipeline with
+# this name is already registered (presumably mmseg's own bundle).
+@PIPELINES.register_module(force=True)
+class DefaultFormatBundle(object):
+    """Format the common result fields as tensors in DataContainers.
+
+    Fields that are present in ``results`` are converted as follows; missing
+    fields are skipped.
+
+    - img: add a trailing channel axis if it has fewer than 3 dims,
+      transpose to channel-first, to tensor, to DataContainer (stack=True)
+    - gt_semantic_seg: prepend an axis, cast to int64, to tensor,
+      to DataContainer (stack=True)
+    - gt_masks / gt_labels: to tensor, to DataContainer (no stacking)
+    """
+    def __call__(self, results):
+        """Convert the known fields of ``results`` in place.
+
+        Args:
+            results (dict): Result dict contains the data to convert.
+
+        Returns:
+            dict: The same dict, with the fields listed in the class
+                docstring replaced by their formatted versions.
+        """
+        if 'img' in results:
+            image = results['img']
+            if image.ndim < 3:
+                # Grayscale input: give it an explicit channel axis.
+                image = np.expand_dims(image, -1)
+            channel_first = np.ascontiguousarray(image.transpose(2, 0, 1))
+            results['img'] = DC(to_tensor(channel_first), stack=True)
+        if 'gt_semantic_seg' in results:
+            # Prepend an axis and convert to long.
+            seg_map = results['gt_semantic_seg'][None, ...].astype(np.int64)
+            results['gt_semantic_seg'] = DC(to_tensor(seg_map), stack=True)
+        for key in ('gt_masks', 'gt_labels'):
+            if key in results:
+                results[key] = DC(to_tensor(results[key]))
+        return results
+    def __repr__(self):
+        return type(self).__name__
+@PIPELINES.register_module()
+class ToMask(object):
+    """Transfer gt_semantic_seg to binary masks and generate gt_labels.
+
+    Args:
+        ignore_index (int): Label value excluded from the generated
+            masks and labels. Default: 255.
+    """
+    def __init__(self, ignore_index=255):
+        self.ignore_index = ignore_index
+    def __call__(self, results):
+        """Add ``gt_labels`` and ``gt_masks`` to ``results``.
+
+        ``gt_labels`` holds the distinct values of ``gt_semantic_seg`` other
+        than ``ignore_index``; ``gt_masks`` holds one int64 mask per label,
+        in the same order.
+        """
+        seg_map = results['gt_semantic_seg']
+        class_ids = np.unique(seg_map)
+        # remove ignored region
+        class_ids = class_ids[class_ids != self.ignore_index]
+        binary_masks = [seg_map == class_id for class_id in class_ids]
+        if binary_masks:
+            gt_masks = np.asarray(binary_masks, dtype=np.int64)
+            gt_labels = np.asarray(class_ids, dtype=np.int64)
+        else:
+            # Some image does not have annotation (all ignored): emit empty
+            # arrays. The mask shape drops the last entry of pad_shape,
+            # presumably the channel dimension.
+            gt_masks = np.empty((0, ) + results['pad_shape'][:-1], dtype=np.int64)
+            gt_labels = np.empty((0, ), dtype=np.int64)
+        results['gt_labels'] = gt_labels
+        results['gt_masks'] = gt_masks
+        return results
+    def __repr__(self):
+        return f'{self.__class__.__name__}(ignore_index={self.ignore_index})'
--- a/segmentation/mmseg_custom/datasets/pipelines/transform.py
+++ b/segmentation/mmseg_custom/datasets/pipelines/transform.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+from mmseg.datasets.builder import PIPELINES
+@PIPELINES.register_module()
+class SETR_Resize(object):
+    """Resize images & seg.
+    This transform resizes the input image to some scale. If the input dict
+    contains the key "scale", then the scale in the input dict is used,
+    otherwise the specified scale in the init method is used.
+    ``img_scale`` can either be a tuple (single-scale) or a list of tuple
+    (multi-scale). There are 3 multiscale modes:
+    - ``ratio_range is not None``: randomly sample a ratio from the ratio range
+    and multiply it with the image scale.
+    - ``ratio_range is None and multiscale_mode == "range"``: randomly sample a
+    scale from a range.
+    - ``ratio_range is None and multiscale_mode == "value"``: randomly sample a
+    scale from multiple scales.
+    Args:
+        img_scale (tuple or list[tuple]): Images scales for resizing.
+        multiscale_mode (str): Either "range" or "value".
+        ratio_range (tuple[float]): (min_ratio, max_ratio)
+        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+            image.
+    """
+    def __init__(self,
+                 img_scale=None,
+                 multiscale_mode='range',
+                 ratio_range=None,
+                 keep_ratio=True,
+                 crop_size=None,
+                 setr_multi_scale=False):
+        """Store the resize settings and validate the sampling mode.
+
+        Args:
+            img_scale (tuple or list[tuple] or None): A single scale is
+                wrapped into a one-element list; a list is stored as given.
+            multiscale_mode (str): Either "range" or "value"; only checked
+                when ``ratio_range`` is None.
+            ratio_range (tuple[float] or None): (min_ratio, max_ratio).
+                When given, exactly one ``img_scale`` must be supplied.
+            keep_ratio (bool): Stored as ``self.keep_ratio``.
+            crop_size: Stored as ``self.crop_size``.
+            setr_multi_scale (bool): Stored as ``self.setr_multi_scale``.
+
+        Raises:
+            AssertionError: If ``ratio_range`` is given without exactly one
+                ``img_scale`` (including ``img_scale=None``, which used to
+                fail with a ``TypeError`` from ``len(None)``), or if
+                ``multiscale_mode`` is not "value" or "range".
+        """
+        if img_scale is None:
+            self.img_scale = None
+        elif isinstance(img_scale, list):
+            self.img_scale = img_scale
+        else:
+            self.img_scale = [img_scale]
+        # NOTE: the element-type check below is disabled in the original code.
+        # assert mmcv.is_list_of(self.img_scale, tuple)
+        if ratio_range is not None:
+            # mode 1: given a scale and a range of image ratio
+            assert self.img_scale is not None and len(self.img_scale) == 1, \
+                'ratio_range requires exactly one img_scale'
+        else:
+            # mode 2: given multiple scales or a range of scales
+            assert multiscale_mode in ['value', 'range']
+        self.multiscale_mode = multiscale_mode
+        self.ratio_range = ratio_range
+        self.keep_ratio = keep_ratio
+        self.crop_size = crop_size
+        self.setr_multi_scale = setr_multi_scale
+    @staticmethod
+    def random_select(img_scales):
+        """Pick one entry of ``img_scales`` at a random index.
+
+        Args:
+            img_scales (list[tuple]): Images scales for selection.
+
+        Returns:
+            (tuple, int): Returns a tuple ``(img_scale, scale_idx)``,
+                where ``img_scale`` is the selected image scale and
+                ``scale_idx`` is the selected index in the given candidates.
+        """
+        assert mmcv.is_list_of(img_scales, tuple)
+        chosen = np.random.randint(len(img_scales))
+        return img_scales[chosen], chosen
+    @staticmethod
+    def random_sample(img_scales):
+        """Randomly sample an img_scale when ``multiscale_mode=='range'``.
+
+        The long edge is drawn between the smaller and larger of the two
+        scales' long sides (inclusive), and the short edge likewise from
+        their short sides.
+
+        Args:
+            img_scales (list[tuple]): Images scale range for sampling.
+                There must be two tuples in img_scales, which specify the lower
+                and upper bound of image scales.
+
+        Returns:
+            (tuple, None): Returns a tuple ``(img_scale, None)``, where
+                ``img_scale`` is sampled scale and None is just a placeholder
+                to be consistent with :func:`random_select`.
+        """
+        assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
+        long_sides = [max(scale) for scale in img_scales]
+        short_sides = [min(scale) for scale in img_scales]
+        # Long edge is drawn before short edge; randint's upper bound is
+        # exclusive, hence the + 1.
+        long_edge = np.random.randint(min(long_sides), max(long_sides) + 1)
+        short_edge = np.random.randint(min(short_sides), max(short_sides) + 1)
+        return (long_edge, short_edge), None
+    @staticmethod
+    def random_sample_ratio(img_scale, ratio_range):
+        """Randomly sample an img_scale when ``ratio_range`` is specified.
+        A ratio will be randomly sampled from the range specified by
+        ``ratio_range``. Then it would be multiplied with ``img_scale`` to
+        generate sampled scale.
+        Args:
+            img_scale (tuple): Images scale base to multiply with ratio.
+            ratio_range (tuple[float]): The minimum and maximum ratio to scale
+                the ``img_scale``.
+        Returns:
+            (tuple, None): Returns a tuple ``(scale, None)``, where
+                ``scale`` is sampled ratio multiplied with ``img_scale`` and
+                None is just a placeholder to be consistent with
+                :func:`random_select`.
+        """
+        assert isinstance(img_scale, tuple) and len(img_scale) == 2
+        min_ratio, max_ratio = ratio_range
+        assert min_ratio <= max_ratio
+        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
+        scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
+        return scale, None
+    def _random_scale(self, results):
+        """Randomly sample an img_scale according to ``ratio_range`` and
+        ``multiscale_mode``.
+        If ``ratio_range`` is specified, a ratio will be sampled and be
+        multiplied with ``img_scale``.
+        If multiple scales are specified by ``img_scale``, a scale will be
+        sampled according to ``multiscale_mode``.
+        Otherwise, single scale will be used.
+        Args:
+            results (dict): Result dict from :obj:`dataset`.
+        Returns:
+            dict: Two new keys 'scale` and 'scale_idx` are added into
+                ``results``, which would be used by subsequent pipelines.
+        """
+        if self.ratio_range is not None:
+            scale, scale_idx = self.random_sample_ratio(
+                self.img_scale[0], self.ratio_range)
+        elif len(self.img_scale) == 1:
+            scale, scale_idx = self.img_scale[0], 0
+        elif self.multiscale_mode == 'range':
+            scale, scale_idx = self.random_sample(self.img_scale)
+        elif self.multiscale_mode == 'value':
+            scale, scale_idx = self.random_select(self.img_scale)
+        else:
+            raise NotImplementedError
+        results['scale'] = scale
+        results['scale_idx'] = scale_idx
+    def _resize_img(self, results):
+        """Resize images with ``results['scale']``."""
+        if self.keep_ratio:
+            if self.setr_multi_scale:
+                if min(results['scale']) < self.crop_size[0]:
+                    new_short = self.crop_size[0]
+                else:
+                    new_short = min(results['scale'])
+                h, w = results['img'].shape[:2]
+                if h > w:
+                    new_h, new_w = new_short * h / w, new_short
+                else:
+                    new_h, new_w = new_short, new_short * w / h
+                results['scale'] = (new_h, new_w)
+            img, scale_factor = mmcv.imrescale(results['img'],
+                                               results['scale'],
+                                               return_scale=True)
+            # the w_scale and h_scale has minor difference
+            # a real fix should be done in the mmcv.imrescale in the future
+            new_h, new_w = img.shape[:2]
+            h, w = results['img'].shape[:2]
+            w_scale = new_w / w
+            h_scale = new_h / h
+        else:
+            img, w_scale, h_scale = mmcv.imresize(results['img'],
+                                                  results['scale'],
+                                                  return_scale=True)
+        scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
+                                dtype=np.float32)
+        results['img'] = img
+        results['img_shape'] = img.shape
+        results['pad_shape'] = img.shape  # in case that there is no padding
+        results['scale_factor'] = scale_factor
+        results['keep_ratio'] = self.keep_ratio
+    def _resize_seg(self, results):
+        """Resize semantic segmentation map with ``results['scale']``."""
+        for key in results.get('seg_fields', []):
+            if self.keep_ratio:
+                gt_seg = mmcv.imrescale(results[key],
+                                        results['scale'],
+                                        interpolation='nearest')
+            else:
+                gt_seg = mmcv.imresize(results[key],
+                                       results['scale'],
+                                       interpolation='nearest')
+            results['gt_semantic_seg'] = gt_seg
+    def __call__(self, results):
+        """Call function to resize images, bounding boxes, masks, semantic
+        segmentation map.
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor',
+                'keep_ratio' keys are added into result dict.
+        """
+        if 'scale' not in results:
+            self._random_scale(results)
+        self._resize_img(results)
+        self._resize_seg(results)
+        return results
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += (f'(img_scale={self.img_scale}, '
+                     f'multiscale_mode={self.multiscale_mode}, '
+                     f'ratio_range={self.ratio_range}, '
+                     f'keep_ratio={self.keep_ratio})')
+        return repr_str
+@PIPELINES.register_module()
+class PadShortSide(object):
+    """Pad the image & mask.
+    Pad to the minimum size that is equal or larger than a number.
+    Added keys are "pad_shape", "pad_fixed_size",
+    Args:
+        size (int, optional): Fixed padding size.
+        pad_val (float, optional): Padding value. Default: 0.
+        seg_pad_val (float, optional): Padding value of segmentation map.
+            Default: 255.
+    """
+    def __init__(self, size=None, pad_val=0, seg_pad_val=255):
+        self.size = size
+        self.pad_val = pad_val
+        self.seg_pad_val = seg_pad_val
+        # only one of size and size_divisor should be valid
+        assert size is not None
+    def _pad_img(self, results):
+        """Pad images according to ``self.size``."""
+        h, w = results['img'].shape[:2]
+        new_h = max(h, self.size)
+        new_w = max(w, self.size)
+        padded_img = mmcv.impad(results['img'],
+                                shape=(new_h, new_w),
+                                pad_val=self.pad_val)
+        results['img'] = padded_img
+        results['pad_shape'] = padded_img.shape
+        # results['unpad_shape'] = (h, w)
+    def _pad_seg(self, results):
+        """Pad masks according to ``results['pad_shape']``."""
+        for key in results.get('seg_fields', []):
+            results[key] = mmcv.impad(results[key],
+                                      shape=results['pad_shape'][:2],
+                                      pad_val=self.seg_pad_val)
+    def __call__(self, results):
+        """Call function to pad images, masks, semantic segmentation maps.
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Updated result dict.
+        """
+        h, w = results['img'].shape[:2]
+        if h >= self.size and w >= self.size:  # 短边比窗口大，跳过
+            pass
+        else:
+            self._pad_img(results)
+            self._pad_seg(results)
+        return results
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(size={self.size}, pad_val={self.pad_val})'
+        return repr_str
+@PIPELINES.register_module()
+class MapillaryHack(object):
+    """map MV 65 class to 19 class like Cityscapes."""
+    def __init__(self):
+        self.map = [[13, 24, 41], [2, 15], [17], [6], [3],
+                    [45, 47], [48], [50], [30], [29], [27], [19], [20, 21, 22],
+                    [55], [61], [54], [58], [57], [52]]
+        self.others = [i for i in range(66)]
+        for i in self.map:
+            for j in i:
+                if j in self.others:
+                    self.others.remove(j)
+    def __call__(self, results):
+        """Call function to process the image with gamma correction.
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Processed results.
+        """
+        gt_map = results['gt_semantic_seg']
+        # others -> 255
+        new_gt_map = np.zeros_like(gt_map)
+        for value in self.others:
+            new_gt_map[gt_map == value] = 255
+        for index, map in enumerate(self.map):
+            for value in map:
+                new_gt_map[gt_map == value] = index
+        results['gt_semantic_seg'] = new_gt_map
+        return results
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        return repr_str