Unverified Commit 7142c933 authored by Zhe Chen, committed by GitHub

Release checkpoint of InternImage-H+Mask2Former (#207)

* update configs

* update readme

* upload mask2former_beit.py
parent 305e110f
# model_cfg
num_things_classes = 100
num_stuff_classes = 50
num_classes = num_things_classes + num_stuff_classes
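# default class split of this base config; configs that inherit this file
# override num_classes (and the decode_head) as needed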
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
type='EncoderDecoderMask2Former',
pretrained=None,
backbone=dict(
type='XCiT',
patch_size=16,
embed_dim=384,
depth=12,
num_heads=8,
mlp_ratio=4,
qkv_bias=True,
use_abs_pos_emb=True,
use_rel_pos_bias=False,
),
decode_head=dict(
type='Mask2FormerHead',
in_channels=[256, 512, 1024, 2048],  # passed to the pixel_decoder inside the head
# strides=[4, 8, 16, 32],
feat_channels=256,
out_channels=256,
in_index=[0, 1, 2, 3],
num_things_classes=num_things_classes,
num_stuff_classes=num_stuff_classes,
num_queries=100,
num_transformer_feat_level=3,
pixel_decoder=dict(
type='MSDeformAttnPixelDecoder',
num_outs=3,
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU'),
encoder=dict(
type='DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='BaseTransformerLayer',
attn_cfgs=dict(
type='MultiScaleDeformableAttention',
embed_dims=256,
num_heads=8,
num_levels=3,
num_points=4,
im2col_step=64,
dropout=0.0,
batch_first=False,
norm_cfg=None,
init_cfg=None),
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.0,
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'ffn', 'norm')),
init_cfg=None),
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True),
init_cfg=None),
enforce_decoder_input_project=False,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True),
transformer_decoder=dict(
type='DetrTransformerDecoder',
return_intermediate=True,
num_layers=9,
transformerlayers=dict(
type='DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.0,
proj_drop=0.0,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.0,
dropout_layer=None,
add_identity=True),
feedforward_channels=2048,
operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
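# the classification head predicts num_classes + 1 logits; the trailing 0.1 in
# class_weight down-weights the extra 'no object' class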
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=2.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1]),
loss_mask=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
reduction='mean',
loss_weight=5.0),
loss_dice=dict(
type='DiceLoss',
use_sigmoid=True,
activate=True,
reduction='mean',
naive_dice=True,
eps=1.0,
loss_weight=5.0)),
train_cfg=dict(
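# point-based mask losses: 12544 = 112 * 112 points are sampled per mask,
# with 3x oversampling and a 0.75 importance-sampling ratio (as in Mask2Former)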
num_points=12544,
oversample_ratio=3.0,
importance_sample_ratio=0.75,
assigner=dict(
type='MaskHungarianAssigner',
cls_cost=dict(type='ClassificationCost', weight=2.0),
mask_cost=dict(
type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True),
dice_cost=dict(
type='DiceCost', weight=5.0, pred_act=True, eps=1.0)),
sampler=dict(type='MaskPseudoSampler')),
test_cfg=dict(
panoptic_on=True,
# For now, the dataset does not support
# evaluating the semantic segmentation metric.
semantic_on=False,
instance_on=True,
# max_per_image is for instance segmentation.
max_per_image=100,
iou_thr=0.8,
# In Mask2Former's panoptic post-processing,
# masks whose score is below 0.5 are filtered out.
filter_low_score=True),
init_cfg=None)
# find_unused_parameters = True
@@ -28,4 +28,4 @@ The ADE20K semantic segmentation dataset contains more than 20K scene-centric images …
| backbone | resolution | mIoU (ss/ms) | train speed | train time | #param | FLOPs | Config | Download |
|:--------------:|:----------:|:------------:|:------------:|:----------:|:------:|:-----:|:------:|:--------:|
| InternImage-H | 896x896 | 62.6 / 62.9 | 1.21s / iter | 1.5d (2n) | 1.31B | 4635G | [config](./mask2former_internimage_h_896_80k_cocostuff2ade20k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/mask2former_internimage_h_896_80k_cocostuff2ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/mask2former_internimage_h_896_80k_cocostuff2ade20k.log.json) |
@@ -161,4 +161,3 @@ optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
evaluation = dict(interval=2000, metric='mIoU', save_best='mIoU')
# fp16 = dict(loss_scale=dict(init_scale=512))
# COCO-Stuff-164K
<!-- [ALGORITHM] -->
## Introduction
The Common Objects in COntext-stuff (COCO-Stuff) dataset is a dataset for scene understanding tasks such as semantic segmentation, object detection, and image captioning. It is constructed by adding stuff annotations to the original COCO dataset, which annotated things but neglected stuff. The COCO-Stuff-164K dataset contains 164k images spanning 172 categories: 80 thing classes, 91 stuff classes, and 1 unlabeled class.
## Model Zoo
### Mask2Former + InternImage
| backbone | resolution | mIoU (ss) | train speed | train time | #param | FLOPs | Config | Download |
|:--------------:|:----------:|:-----------:|:-----------:|:----------:|:-------:|:-----:|:-----:|:-------------------:|
| InternImage-H | 896x896 | 52.6 | 1.6s / iter | 1.5d (2n) | 1.31B | 4635G | [config](./mask2former_internimage_h_896_80k_cocostuff164k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/mask2former_internimage_h_896_80k_cocostuff164k.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/raw/main/mask2former_internimage_h_896_80k_cocostuff164k.log.json) |
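To try the checkpoint above locally, the standard mmsegmentation 0.x inference API can be used. The snippet below is a minimal sketch, not part of this release: it assumes the repo's custom modules (e.g. the `mmseg_custom` package and the compiled DCNv3 op) are importable from the working directory, and that the config and checkpoint linked in the table have been downloaded.

```python
from mmseg.apis import inference_segmentor, init_segmentor

# This import registers the custom classes referenced by the config
# (EncoderDecoderMask2Former, InternImage, ...); the package name is the one
# shipped in this repo's segmentation folder and is assumed to be on PYTHONPATH.
import mmseg_custom  # noqa: F401

config_file = 'mask2former_internimage_h_896_80k_cocostuff164k_ss.py'
checkpoint_file = 'mask2former_internimage_h_896_80k_cocostuff164k.pth'

# Build the model and load the released weights on a single GPU.
model = init_segmentor(config_file, checkpoint_file, device='cuda:0')

# Single-scale inference on one image; the result is a list containing one
# (H, W) array of predicted COCO-Stuff class indices per input image.
result = inference_segmentor(model, 'demo.jpg')
print(result[0].shape)
```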
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
_base_ = [
'../_base_/models/mask2former_beit.py', '../_base_/datasets/coco-stuff164k.py',
'../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py'
]
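# 171 trainable classes in COCO-Stuff-164K (80 things + 91 stuff); the extra 'unlabeled' class is ignored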
num_classes = 171
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_h_jointto22k_384.pth'
model = dict(
type='EncoderDecoderMask2Former',
backbone=dict(
_delete_=True,
type='InternImage',
core_op='DCNv3',
channels=320,
depths=[6, 6, 32, 6],
groups=[10, 20, 40, 80],
mlp_ratio=4.,
drop_path_rate=0.5,
norm_layer='LN',
layer_scale=None,
offset_scale=1.0,
post_norm=False,
dw_kernel_size=5, # for InternImage-H/G
res_post_norm=True, # for InternImage-H/G
level2_post_norm=True, # for InternImage-H/G
level2_post_norm_block_ids=[5, 11, 17, 23, 29], # for InternImage-H/G
center_feature_scale=True, # for InternImage-H/G
with_cp=False,
out_indices=(0, 1, 2, 3),
init_cfg=None),
decode_head=dict(
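# InternImage-H stage widths: 320 * 2**i for stages 0-3 -> [320, 640, 1280, 2560]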
in_channels=[320, 640, 1280, 2560],
feat_channels=1024,
out_channels=1024,
num_classes=num_classes,
num_queries=200,
pixel_decoder=dict(
type='MSDeformAttnPixelDecoder',
num_outs=3,
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU'),
encoder=dict(
type='DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='BaseTransformerLayer',
attn_cfgs=dict(
type='MultiScaleDeformableAttention',
embed_dims=1024,
num_heads=32,
num_levels=3,
num_points=4,
im2col_step=64,
dropout=0.0,
batch_first=False,
norm_cfg=None,
init_cfg=None),
ffn_cfgs=dict(
type='FFN',
embed_dims=1024,
feedforward_channels=4096,
num_fcs=2,
ffn_drop=0.0,
with_cp=False, # set with_cp=True to save memory
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'ffn', 'norm')),
init_cfg=None),
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=512, normalize=True),
init_cfg=None),
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=512, normalize=True),
transformer_decoder=dict(
type='DetrTransformerDecoder',
return_intermediate=True,
num_layers=9,
transformerlayers=dict(
type='DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='MultiheadAttention',
embed_dims=1024,
num_heads=32,
attn_drop=0.0,
proj_drop=0.0,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=1024,
feedforward_channels=4096,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.0,
dropout_layer=None,
with_cp=False, # set with_cp=True to save memory
add_identity=True),
feedforward_channels=4096,
operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=2.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1])
),
test_cfg=dict(mode='whole'))
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (896, 896)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations'),
dict(type='Resize', img_scale=(3584, 896), ratio_range=(0.5, 2.0)),
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
dict(type='ToMask'),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_semantic_seg', 'gt_masks', 'gt_labels'])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(3584, 896),
# img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='ResizeToMultiple', size_divisor=32),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
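# For multi-scale ('ms') testing one would typically uncomment the img_ratios
# line above and set flip=True; the single-scale ('ss') results use this pipeline as-is.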
optimizer = dict(
_delete_=True, type='AdamW', lr=0.00002, betas=(0.9, 0.999), weight_decay=0.05,
constructor='CustomLayerDecayOptimizerConstructor',
paramwise_cfg=dict(num_layers=50, layer_decay_rate=0.95,
depths=[6, 6, 32, 6], offset_lr_scale=1.0))
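# CustomLayerDecayOptimizerConstructor applies layer-wise lr decay over the 50
# backbone blocks: blocks closer to the input have their lr scaled down by a
# factor of roughly 0.95 per layer relative to blocks closer to the head.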
lr_config = dict(_delete_=True, policy='poly',
warmup='linear',
warmup_iters=1500,
warmup_ratio=1e-6,
power=1.0, min_lr=0.0, by_epoch=False)
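# poly schedule with power=1.0 and min_lr=0: lr(t) = base_lr * (1 - t / max_iters),
# preceded by a 1500-iter linear warmup.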
# By default, models are trained on 16 GPUs with 1 image per GPU
data = dict(samples_per_gpu=1,
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
runner = dict(type='IterBasedRunner')
optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
evaluation = dict(interval=8000, metric='mIoU', save_best='mIoU')
# fp16 = dict(loss_scale=dict(init_scale=512))