Commit dff2c686 authored by renzhc

first commit

parent 8f9dd0ed
# Deploy-mode configs (one file per block): setting `deploy=True` switches the
# RIFormer backbone to its reparameterized inference form.

# deploy variant of riformer-s24_8xb128_in1k-384px.py
_base_ = '../riformer-s24_8xb128_in1k-384px.py'
model = dict(backbone=dict(deploy=True))

# deploy variant of riformer-s24_8xb128_in1k.py
_base_ = '../riformer-s24_8xb128_in1k.py'
model = dict(backbone=dict(deploy=True))

# deploy variant of riformer-s36_8xb128_in1k.py
_base_ = '../riformer-s36_8xb128_in1k.py'
model = dict(backbone=dict(deploy=True))

# deploy variant of riformer-s36_8xb64_in1k-384px.py
_base_ = '../riformer-s36_8xb64_in1k-384px.py'
model = dict(backbone=dict(deploy=True))
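As a quick illustration, a deploy config is consumed like any other mmpretrain config; the sketch below follows the `get_model` API used elsewhere in this repo, but the config path and checkpoint filename are placeholders, not files shipped in this commit:

```python
import torch
from mmpretrain import get_model

# Hypothetical paths: point these at an actual deploy config and a
# reparameterized checkpoint in your checkout.
model = get_model(
    'configs/riformer/deploy/riformer-s36_8xb128_in1k.py',  # assumed location
    pretrained='riformer-s36_deploy.pth',                   # assumed checkpoint
)
model.eval()

with torch.no_grad():
    out = model(torch.rand(1, 3, 224, 224))  # ImageNet-style input
print(type(out))
```

With `deploy=True`, the backbone folds the training-time affine token mixer into the adjacent normalization layer, so the checkpoint must be reparameterized to match before loading.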
Collections:
- Name: RIFormer
Metadata:
Training Data: ImageNet-1k
Training Resources: 8x A100 GPUs
Architecture:
- Affine
- 1x1 Convolution
- LayerScale
Paper:
URL: https://arxiv.org/abs/2304.05659
Title: "RIFormer: Keep Your Vision Backbone Effective But Removing Token Mixer"
README: configs/riformer/README.md
Code:
Version: v1.0.0rc7
URL: null
Models:
- Name: riformer-s12_in1k
Metadata:
FLOPs: 1822000000
Parameters: 11915000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 76.90
Top 5 Accuracy: 93.06
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-s12_32xb128_in1k_20230406-6741ce71.pth
Config: configs/riformer/riformer-s12_8xb128_in1k.py
- Name: riformer-s24_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 3412000000
Parameters: 21389000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 80.28
Top 5 Accuracy: 94.80
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-s24_32xb128_in1k_20230406-fdab072a.pth
Config: configs/riformer/riformer-s24_8xb128_in1k.py
- Name: riformer-s36_in1k
Metadata:
FLOPs: 5003000000
Parameters: 30863000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.29
Top 5 Accuracy: 95.41
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-s36_32xb128_in1k_20230406-fdfcd3b0.pth
Config: configs/riformer/riformer-s36_8xb128_in1k.py
- Name: riformer-m36_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 8801000000
Parameters: 56173000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.57
Top 5 Accuracy: 95.99
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-m36_32xb128_in1k_20230406-2fcb9d9b.pth
Config: configs/riformer/riformer-m36_8xb128_in1k.py
- Name: riformer-m48_in1k
Metadata:
FLOPs: 11590000000
Parameters: 73473000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.75
Top 5 Accuracy: 96.11
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-m48_32xb128_in1k_20230406-2b9d1abf.pth
Config: configs/riformer/riformer-m48_8xb64_in1k.py
- Name: riformer-s12_in1k-384
Metadata:
FLOPs: 5355000000
Parameters: 11915000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 78.29
Top 5 Accuracy: 93.93
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-s12_32xb128_in1k-384px_20230406-145eda4c.pth
Config: configs/riformer/riformer-s12_8xb128_in1k-384px.py
- Name: riformer-s24_in1k-384
Metadata:
Training Data: ImageNet-1k
FLOPs: 10028000000
Parameters: 21389000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.36
Top 5 Accuracy: 95.40
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-s24_32xb128_in1k-384px_20230406-bafae7ab.pth
Config: configs/riformer/riformer-s24_8xb128_in1k-384px.py
- Name: riformer-s36_in1k-384
Metadata:
FLOPs: 14702000000
Parameters: 30863000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.22
Top 5 Accuracy: 95.95
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-s36_32xb128_in1k-384px_20230406-017ed3c4.pth
Config: configs/riformer/riformer-s36_8xb64_in1k-384px.py
- Name: riformer-m36_in1k-384
Metadata:
Training Data: ImageNet-1k
FLOPs: 25865000000
Parameters: 56173000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.39
Top 5 Accuracy: 96.40
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-m36_32xb128_in1k-384px_20230406-66a6f764.pth
Config: configs/riformer/riformer-m36_8xb64_in1k-384px.py
- Name: riformer-m48_in1k-384
Metadata:
FLOPs: 34060000000
Parameters: 73473000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.70
Top 5 Accuracy: 96.60
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-m48_32xb128_in1k-384px_20230406-2e874826.pth
Config: configs/riformer/riformer-m48_8xb64_in1k-384px.py
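The FLOPs and Parameters fields above can be spot-checked programmatically; a hedged sketch, assuming the metafile `Name` entries are registered as mmpretrain model names (the project convention):

```python
from mmpretrain import get_model

# 'riformer-s12_in1k' is the Name of the first model entry in the metafile.
model = get_model('riformer-s12_in1k')
n_params = sum(p.numel() for p in model.parameters())
print(f'{n_params:,}')  # expected to be close to 11,915,000
```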
# riformer-m36_8xb128_in1k.py: RIFormer-M36, ImageNet-1k, 224px
_base_ = [
'../_base_/datasets/imagenet_bs128_poolformer_medium_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='m36',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
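To make the note concrete: the auto-scaling rule is linear in batch size, so an 8-GPU run with 128 samples per GPU (the 8xb128 in these config names) uses a quarter of the base learning rate. A minimal check of the arithmetic:

```python
# Linear LR scaling as applied when auto-scale-lr is enabled.
base_lr = 4e-3                # optimizer lr in this config
base_batch_size = 4096        # 32 GPUs x 128 samples per GPU
actual_batch_size = 8 * 128   # an 8xb128 run
print(base_lr * actual_batch_size / base_batch_size)  # 0.001
```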
# riformer-m36_8xb64_in1k-384px.py: RIFormer-M36, ImageNet-1k, 384px
_base_ = [
'../_base_/datasets/imagenet_bs128_riformer_medium_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='m36',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-m48_8xb64_in1k-384px.py: RIFormer-M48, ImageNet-1k, 384px
_base_ = [
'../_base_/datasets/imagenet_bs128_riformer_medium_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='m48',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-m48_8xb64_in1k.py: RIFormer-M48, ImageNet-1k, 224px
_base_ = [
'../_base_/datasets/imagenet_bs128_poolformer_medium_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='m48',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-s12_8xb128_in1k-384px.py: RIFormer-S12, ImageNet-1k, 384px
_base_ = [
'../_base_/datasets/imagenet_bs128_riformer_small_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='s12',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=512,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-s12_8xb128_in1k.py: RIFormer-S12, ImageNet-1k, 224px
_base_ = [
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='s12',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=512,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-s24_8xb128_in1k-384px.py: RIFormer-S24, ImageNet-1k, 384px
_base_ = [
'../_base_/datasets/imagenet_bs128_riformer_small_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='s24',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=512,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-s24_8xb128_in1k.py: RIFormer-S24, ImageNet-1k, 224px
_base_ = [
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='s24',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=512,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-s36_8xb128_in1k.py: RIFormer-S36, ImageNet-1k, 224px
_base_ = [
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='s36',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=512,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-s36_8xb64_in1k-384px.py: RIFormer-S36, ImageNet-1k, 384px
_base_ = [
'../_base_/datasets/imagenet_bs128_riformer_small_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='s36',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=512,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# SAM
> [Segment Anything](https://arxiv.org/abs/2304.02643)
<!-- [ALGORITHM] -->
## Abstract
We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy-respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive – often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at https://segment-anything.com to foster research into foundation models for computer vision.
<div align=center>
<img src="https://user-images.githubusercontent.com/36138628/231106092-261ff035-dd3b-4a8b-b2e7-e91f195090a1.png" width="100%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('vit-base-p16_sam-pre_3rdparty_sa1b-1024px', pretrained=True)
inputs = torch.rand(1, 3, 1024, 1024)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
<!-- [TABS-END] -->
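For reference, with the 1024×1024 input above, the patch size of 16 gives a 64×64 spatial grid and the backbone projects to `out_channels=256`, so `feats` should be a tuple whose first tensor has shape `(1, 256, 64, 64)` (assuming the default headless config).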
## Models and results
### Pretrained models
| Model | Params (M) | Flops (G) | Config | Download |
| :--------------------------------------------- | :--------: | :-------: | :-------------------------------------: | :----------------------------------------------------------------------------------------------: |
| `vit-base-p16_sam-pre_3rdparty_sa1b-1024px`\* | 89.67 | 486.00 | [config](vit-base-p16_sam_headless.py) | [model](https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth) |
| `vit-large-p16_sam-pre_3rdparty_sa1b-1024px`\* | 308.00 | 1494.00 | [config](vit-large-p16_sam_headless.py) | [model](https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-large-p16_sam-pre_3rdparty_sa1b-1024px_20230411-595feafd.pth) |
| `vit-huge-p16_sam-pre_3rdparty_sa1b-1024px`\* | 637.00 | 2982.00 | [config](vit-huge-p16_sam_headless.py) | [model](https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-huge-p16_sam-pre_3rdparty_sa1b-1024px_20230411-3f13c653.pth) |
*Models with * are converted from the [official repo](https://github.com/facebookresearch/segment-anything/). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{kirillov2023segany,
title={Segment Anything},
author={Kirillov, Alexander and Mintun, Eric and Ravi, Nikhila and Mao, Hanzi and Rolland, Chloe and Gustafson, Laura and Xiao, Tete and Whitehead, Spencer and Berg, Alexander C. and Lo, Wan-Yen and Doll{\'a}r, Piotr and Girshick, Ross},
journal={arXiv:2304.02643},
year={2023}
}
```
Collections:
- Name: SAM
Metadata:
Architecture:
- Convolution
- Dense Connections
- Dropout
- GELU
- Layer Normalization
- Multi-Head Attention
- Scaled Dot-Product Attention
Paper:
Title: 'Segment Anything'
URL: https://arxiv.org/abs/2304.02643
README: configs/sam/README.md
Code:
URL: null
Version: null
Models:
- Name: vit-base-p16_sam-pre_3rdparty_sa1b-1024px
Metadata:
FLOPs: 486000000000
Parameters: 89671000
Training Data:
- SA-1B
In Collection: SAM
Results: null
Weights: https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth
Config: configs/sam/vit-base-p16_sam_headless.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth
Code: https://github.com/facebookresearch/segment-anything/
- Name: vit-large-p16_sam-pre_3rdparty_sa1b-1024px
Metadata:
FLOPs: 1494000000000
Parameters: 308000000
Training Data:
- SA-1B
In Collection: SAM
Results: null
Weights: https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-large-p16_sam-pre_3rdparty_sa1b-1024px_20230411-595feafd.pth
Config: configs/sam/vit-large-p16_sam_headless.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth
Code: https://github.com/facebookresearch/segment-anything/
- Name: vit-huge-p16_sam-pre_3rdparty_sa1b-1024px
Metadata:
FLOPs: 2982000000000
Parameters: 637000000
Training Data:
- SA-1B
In Collection: SAM
Results: null
Weights: https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-huge-p16_sam-pre_3rdparty_sa1b-1024px_20230411-3f13c653.pth
Config: configs/sam/vit-huge-p16_sam_headless.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
Code: https://github.com/facebookresearch/segment-anything/
# vit-base-p16_sam_headless.py: ViT-SAM base backbone, headless feature extractor
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTSAM',
arch='base',
img_size=1024,
patch_size=16,
out_channels=256,
use_abs_pos=True,
use_rel_pos=True,
window_size=14,
),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
# convert image from BGR to RGB
to_rgb=True,
)
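The preprocessor's behavior is easy to verify by hand; a minimal sketch in plain PyTorch, with the mean/std values copied from the config above:

```python
import torch

# ImageNet RGB normalization parameters from data_preprocessor.
mean = torch.tensor([123.675, 116.28, 103.53]).view(3, 1, 1)
std = torch.tensor([58.395, 57.12, 57.375]).view(3, 1, 1)

bgr = torch.randint(0, 256, (3, 1024, 1024)).float()  # e.g. an OpenCV-loaded image
rgb = bgr.flip(0)                # to_rgb=True: reverse BGR channel order to RGB
normalized = (rgb - mean) / std  # per-channel standardization
print(normalized.shape)          # torch.Size([3, 1024, 1024])
```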
# vit-huge-p16_sam_headless.py: ViT-SAM huge backbone, headless feature extractor
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTSAM',
arch='huge',
img_size=1024,
patch_size=16,
out_channels=256,
use_abs_pos=True,
use_rel_pos=True,
window_size=14,
),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
# convert image from BGR to RGB
to_rgb=True,
)
# vit-large-p16_sam_headless.py: ViT-SAM large backbone, headless feature extractor
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTSAM',
arch='large',
img_size=1024,
patch_size=16,
out_channels=256,
use_abs_pos=True,
use_rel_pos=True,
window_size=14,
),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
# convert image from BGR to RGB
to_rgb=True,
)