Commit dff2c686 authored by renzhc

first commit

parent 8f9dd0ed
# Deploy-mode configs (one file per block): setting `deploy=True` switches the
# RIFormer backbone to its reparameterized inference form.

# deploy variant of riformer-s24_8xb128_in1k-384px.py
_base_ = '../riformer-s24_8xb128_in1k-384px.py'
model = dict(backbone=dict(deploy=True))

# deploy variant of riformer-s24_8xb128_in1k.py
_base_ = '../riformer-s24_8xb128_in1k.py'
model = dict(backbone=dict(deploy=True))

# deploy variant of riformer-s36_8xb128_in1k.py
_base_ = '../riformer-s36_8xb128_in1k.py'
model = dict(backbone=dict(deploy=True))

# deploy variant of riformer-s36_8xb64_in1k-384px.py
_base_ = '../riformer-s36_8xb64_in1k-384px.py'
model = dict(backbone=dict(deploy=True))
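As a quick illustration, a deploy config is consumed like any other mmpretrain config; the sketch below follows the `get_model` API used elsewhere in this repo, but the config path and checkpoint filename are placeholders, not files shipped in this commit:

```python
import torch
from mmpretrain import get_model

# Hypothetical paths: point these at an actual deploy config and a
# reparameterized checkpoint in your checkout.
model = get_model(
    'configs/riformer/deploy/riformer-s36_8xb128_in1k.py',  # assumed location
    pretrained='riformer-s36_deploy.pth',                   # assumed checkpoint
)
model.eval()

with torch.no_grad():
    out = model(torch.rand(1, 3, 224, 224))  # ImageNet-style input
print(type(out))
```

With `deploy=True`, the backbone folds the training-time affine token mixer into the adjacent normalization layer, so the checkpoint must be reparameterized to match before loading.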
Collections:
- Name: RIFormer
Metadata:
Training Data: ImageNet-1k
Training Resources: 8x A100 GPUs
Architecture:
- Affine
- 1x1 Convolution
- LayerScale
Paper:
URL: https://arxiv.org/abs/2304.05659
Title: "RIFormer: Keep Your Vision Backbone Effective But Removing Token Mixer"
README: configs/riformer/README.md
Code:
Version: v1.0.0rc7
URL: null
Models:
- Name: riformer-s12_in1k
Metadata:
FLOPs: 1822000000
Parameters: 11915000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 76.90
Top 5 Accuracy: 93.06
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-s12_32xb128_in1k_20230406-6741ce71.pth
Config: configs/riformer/riformer-s12_8xb128_in1k.py
- Name: riformer-s24_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 3412000000
Parameters: 21389000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 80.28
Top 5 Accuracy: 94.80
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-s24_32xb128_in1k_20230406-fdab072a.pth
Config: configs/riformer/riformer-s24_8xb128_in1k.py
- Name: riformer-s36_in1k
Metadata:
FLOPs: 5003000000
Parameters: 30863000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.29
Top 5 Accuracy: 95.41
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-s36_32xb128_in1k_20230406-fdfcd3b0.pth
Config: configs/riformer/riformer-s36_8xb128_in1k.py
- Name: riformer-m36_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 8801000000
Parameters: 56173000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.57
Top 5 Accuracy: 95.99
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-m36_32xb128_in1k_20230406-2fcb9d9b.pth
Config: configs/riformer/riformer-m36_8xb128_in1k.py
- Name: riformer-m48_in1k
Metadata:
FLOPs: 11590000000
Parameters: 73473000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.75
Top 5 Accuracy: 96.11
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-m48_32xb128_in1k_20230406-2b9d1abf.pth
Config: configs/riformer/riformer-m48_8xb64_in1k.py
- Name: riformer-s12_in1k-384
Metadata:
FLOPs: 5355000000
Parameters: 11915000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 78.29
Top 5 Accuracy: 93.93
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-s12_32xb128_in1k-384px_20230406-145eda4c.pth
Config: configs/riformer/riformer-s12_8xb128_in1k-384px.py
- Name: riformer-s24_in1k-384
Metadata:
Training Data: ImageNet-1k
FLOPs: 10028000000
Parameters: 21389000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.36
Top 5 Accuracy: 95.40
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-s24_32xb128_in1k-384px_20230406-bafae7ab.pth
Config: configs/riformer/riformer-s24_8xb128_in1k-384px.py
- Name: riformer-s36_in1k-384
Metadata:
FLOPs: 14702000000
Parameters: 30863000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.22
Top 5 Accuracy: 95.95
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-s36_32xb128_in1k-384px_20230406-017ed3c4.pth
Config: configs/riformer/riformer-s36_8xb64_in1k-384px.py
- Name: riformer-m36_in1k-384
Metadata:
Training Data: ImageNet-1k
FLOPs: 25865000000
Parameters: 56173000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.39
Top 5 Accuracy: 96.40
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-m36_32xb128_in1k-384px_20230406-66a6f764.pth
Config: configs/riformer/riformer-m36_8xb64_in1k-384px.py
- Name: riformer-m48_in1k-384
Metadata:
FLOPs: 34060000000
Parameters: 73473000
In Collection: RIFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.70
Top 5 Accuracy: 96.60
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v1/riformer/riformer-m48_32xb128_in1k-384px_20230406-2e874826.pth
Config: configs/riformer/riformer-m48_8xb64_in1k-384px.py
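The FLOPs and Parameters fields above can be spot-checked programmatically; a hedged sketch, assuming the metafile `Name` entries are registered as mmpretrain model names (the project convention):

```python
from mmpretrain import get_model

# 'riformer-s12_in1k' is the Name of the first model entry in the metafile.
model = get_model('riformer-s12_in1k')
n_params = sum(p.numel() for p in model.parameters())
print(f'{n_params:,}')  # expected to be close to 11,915,000
```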
# riformer-m36_8xb128_in1k.py: RIFormer-M36, ImageNet-1k, 224px
_base_ = [
'../_base_/datasets/imagenet_bs128_poolformer_medium_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='m36',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
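To make the note concrete: the auto-scaling rule is linear in batch size, so an 8-GPU run with 128 samples per GPU (the 8xb128 in these config names) uses a quarter of the base learning rate. A minimal check of the arithmetic:

```python
# Linear LR scaling as applied when auto-scale-lr is enabled.
base_lr = 4e-3                # optimizer lr in this config
base_batch_size = 4096        # 32 GPUs x 128 samples per GPU
actual_batch_size = 8 * 128   # an 8xb128 run
print(base_lr * actual_batch_size / base_batch_size)  # 0.001
```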
# riformer-m36_8xb64_in1k-384px.py: RIFormer-M36, ImageNet-1k, 384px
_base_ = [
'../_base_/datasets/imagenet_bs128_riformer_medium_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='m36',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-m48_8xb64_in1k-384px.py: RIFormer-M48, ImageNet-1k, 384px
_base_ = [
'../_base_/datasets/imagenet_bs128_riformer_medium_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='m48',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-m48_8xb64_in1k.py: RIFormer-M48, ImageNet-1k, 224px
_base_ = [
'../_base_/datasets/imagenet_bs128_poolformer_medium_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='m48',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-s12_8xb128_in1k-384px.py: RIFormer-S12, ImageNet-1k, 384px
_base_ = [
'../_base_/datasets/imagenet_bs128_riformer_small_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='s12',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=512,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-s12_8xb128_in1k.py: RIFormer-S12, ImageNet-1k, 224px
_base_ = [
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='s12',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=512,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-s24_8xb128_in1k-384px.py: RIFormer-S24, ImageNet-1k, 384px
_base_ = [
'../_base_/datasets/imagenet_bs128_riformer_small_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='s24',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=512,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-s24_8xb128_in1k.py: RIFormer-S24, ImageNet-1k, 224px
_base_ = [
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='s24',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=512,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-s36_8xb128_in1k.py: RIFormer-S36, ImageNet-1k, 224px
_base_ = [
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='s36',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=512,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# riformer-s36_8xb64_in1k-384px.py: RIFormer-S36, ImageNet-1k, 384px
_base_ = [
'../_base_/datasets/imagenet_bs128_riformer_small_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='RIFormer',
arch='s36',
drop_path_rate=0.1,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
]),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=512,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# SAM
> [Segment Anything](https://arxiv.org/abs/2304.02643)
<!-- [ALGORITHM] -->
## Abstract
We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy-respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive – often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at https://segment-anything.com to foster research into foundation models for computer vision.
<div align=center>
<img src="https://user-images.githubusercontent.com/36138628/231106092-261ff035-dd3b-4a8b-b2e7-e91f195090a1.png" width="100%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('vit-base-p16_sam-pre_3rdparty_sa1b-1024px', pretrained=True)
inputs = torch.rand(1, 3, 1024, 1024)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
<!-- [TABS-END] -->
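For reference, with the 1024×1024 input above, the patch size of 16 gives a 64×64 spatial grid and the backbone projects to `out_channels=256`, so `feats` should be a tuple whose first tensor has shape `(1, 256, 64, 64)` (assuming the default headless config).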
## Models and results
### Pretrained models
| Model | Params (M) | Flops (G) | Config | Download |
| :--------------------------------------------- | :--------: | :-------: | :-------------------------------------: | :----------------------------------------------------------------------------------------------: |
| `vit-base-p16_sam-pre_3rdparty_sa1b-1024px`\* | 89.67 | 486.00 | [config](vit-base-p16_sam_headless.py) | [model](https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth) |
| `vit-large-p16_sam-pre_3rdparty_sa1b-1024px`\* | 308.00 | 1494.00 | [config](vit-large-p16_sam_headless.py) | [model](https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-large-p16_sam-pre_3rdparty_sa1b-1024px_20230411-595feafd.pth) |
| `vit-huge-p16_sam-pre_3rdparty_sa1b-1024px`\* | 637.00 | 2982.00 | [config](vit-huge-p16_sam_headless.py) | [model](https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-huge-p16_sam-pre_3rdparty_sa1b-1024px_20230411-3f13c653.pth) |
*Models with * are converted from the [official repo](https://github.com/facebookresearch/segment-anything/). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{kirillov2023segany,
title={Segment Anything},
author={Kirillov, Alexander and Mintun, Eric and Ravi, Nikhila and Mao, Hanzi and Rolland, Chloe and Gustafson, Laura and Xiao, Tete and Whitehead, Spencer and Berg, Alexander C. and Lo, Wan-Yen and Doll{\'a}r, Piotr and Girshick, Ross},
journal={arXiv:2304.02643},
year={2023}
}
```
Collections:
- Name: SAM
Metadata:
Architecture:
- Convolution
- Dense Connections
- Dropout
- GELU
- Layer Normalization
- Multi-Head Attention
- Scaled Dot-Product Attention
Paper:
Title: 'Segment Anything'
URL: https://arxiv.org/abs/2304.02643
README: configs/sam/README.md
Code:
URL: null
Version: null
Models:
- Name: vit-base-p16_sam-pre_3rdparty_sa1b-1024px
Metadata:
FLOPs: 486000000000
Parameters: 89671000
Training Data:
- SA-1B
In Collection: SAM
Results: null
Weights: https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth
Config: configs/sam/vit-base-p16_sam_headless.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth
Code: https://github.com/facebookresearch/segment-anything/
- Name: vit-large-p16_sam-pre_3rdparty_sa1b-1024px
Metadata:
FLOPs: 1494000000000
Parameters: 308000000
Training Data:
- SA-1B
In Collection: SAM
Results: null
Weights: https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-large-p16_sam-pre_3rdparty_sa1b-1024px_20230411-595feafd.pth
Config: configs/sam/vit-large-p16_sam_headless.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth
Code: https://github.com/facebookresearch/segment-anything/
- Name: vit-huge-p16_sam-pre_3rdparty_sa1b-1024px
Metadata:
FLOPs: 2982000000000
Parameters: 637000000
Training Data:
- SA-1B
In Collection: SAM
Results: null
Weights: https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-huge-p16_sam-pre_3rdparty_sa1b-1024px_20230411-3f13c653.pth
Config: configs/sam/vit-huge-p16_sam_headless.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
Code: https://github.com/facebookresearch/segment-anything/
# vit-base-p16_sam_headless.py: ViT-SAM base backbone, headless feature extractor
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTSAM',
arch='base',
img_size=1024,
patch_size=16,
out_channels=256,
use_abs_pos=True,
use_rel_pos=True,
window_size=14,
),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
# convert image from BGR to RGB
to_rgb=True,
)
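The preprocessor's behavior is easy to verify by hand; a minimal sketch in plain PyTorch, with the mean/std values copied from the config above:

```python
import torch

# ImageNet RGB normalization parameters from data_preprocessor.
mean = torch.tensor([123.675, 116.28, 103.53]).view(3, 1, 1)
std = torch.tensor([58.395, 57.12, 57.375]).view(3, 1, 1)

bgr = torch.randint(0, 256, (3, 1024, 1024)).float()  # e.g. an OpenCV-loaded image
rgb = bgr.flip(0)                # to_rgb=True: reverse BGR channel order to RGB
normalized = (rgb - mean) / std  # per-channel standardization
print(normalized.shape)          # torch.Size([3, 1024, 1024])
```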
# vit-huge-p16_sam_headless.py: ViT-SAM huge backbone, headless feature extractor
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTSAM',
arch='huge',
img_size=1024,
patch_size=16,
out_channels=256,
use_abs_pos=True,
use_rel_pos=True,
window_size=14,
),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
# convert image from BGR to RGB
to_rgb=True,
)
# vit-large-p16_sam_headless.py: ViT-SAM large backbone, headless feature extractor
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTSAM',
arch='large',
img_size=1024,
patch_size=16,
out_channels=256,
use_abs_pos=True,
use_rel_pos=True,
window_size=14,
),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
# convert image from BGR to RGB
to_rgb=True,
)