_base_ = [
'../_base_/datasets/coco_caption.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='OFA',
task='caption',
vocab_size=59457,
embedding_dim=768,
encoder_cfg=dict(
embed_images=dict(type='OFAResNet', depth=101),
num_layers=6,
),
decoder_cfg=dict(num_layers=6),
generation_cfg=dict(use_cache=True),
tokenizer=dict(type='OFATokenizer', name_or_path='OFA-Sys/OFA-base'),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='Resize', scale=(480, 480)),
dict(type='PackInputs', meta_keys=('image_id', )),
]
train_dataloader = None
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# schedule settings
train_cfg = None
val_cfg = dict()
test_cfg = dict()
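This config is evaluation-only (`train_dataloader` and `train_cfg` are `None`), so it is meant to be driven by the test entry point. A minimal sketch, assuming placeholder paths for the config file and a matching OFA caption checkpoint:

```shell
# Placeholders: substitute the real config path and a matching checkpoint.
python tools/test.py configs/ofa/OFA_CAPTION_CONFIG.py OFA_CAPTION_CHECKPOINT.pth
```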
_base_ = [
'../_base_/datasets/refcoco.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='OFA',
task='refcoco',
vocab_size=59457,
embedding_dim=768,
encoder_cfg=dict(
embed_images=dict(type='OFAResNet', depth=101),
num_layers=6,
),
decoder_cfg=dict(num_layers=6),
generation_cfg=dict(use_cache=True),
tokenizer=dict(type='OFATokenizer', name_or_path='OFA-Sys/OFA-base'),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='Resize', scale=(512, 512)),
dict(
type='PackInputs',
algorithm_keys=['text', 'gt_bboxes'],
meta_keys=['image_id', 'scale_factor'],
),
]
train_dataloader = None
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# schedule settings
train_cfg = None
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/datasets/coco_vqa.py',
'../_base_/default_runtime.py',
]
ANS2LABEL = 'https://ofa-beijing.oss-cn-beijing.aliyuncs.com/datasets/vqa_data/trainval_ans2label.pkl' # noqa: E501
# model settings
model = dict(
type='OFA',
task='vqa',
vocab_size=59457,
embedding_dim=768,
ans2label=ANS2LABEL,
encoder_cfg=dict(
embed_images=dict(type='OFAResNet', depth=101),
num_layers=6,
num_heads=12,
),
decoder_cfg=dict(
num_layers=6,
num_heads=12,
),
generation_cfg=dict(
num_beams=5,
max_new_tokens=200,
length_penalty=0., # VQA doesn't require long answers.
use_cache=True,
),
tokenizer=dict(type='OFATokenizer', name_or_path='OFA-Sys/OFA-base'),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
scale=(480, 480),
interpolation='bicubic',
backend='pillow'),
dict(type='OFAAddObjects'),
dict(
type='PackInputs',
algorithm_keys=[
'question', 'gt_answer', 'gt_answer_weight', 'decoder_prompt'
],
meta_keys=['question_id', 'image_id'],
),
]
train_dataloader = None # Eval only
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# schedule settings
train_cfg = None
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/datasets/coco_vqa.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='OFA',
task='vqa',
vocab_size=59457,
embedding_dim=768,
encoder_cfg=dict(
embed_images=dict(type='OFAResNet', depth=101),
num_layers=6,
num_heads=12,
),
decoder_cfg=dict(
num_layers=6,
num_heads=12,
),
generation_cfg=dict(
num_beams=20,
max_new_tokens=200,
length_penalty=0., # VQA doesn't require long answers.
use_cache=True,
),
tokenizer=dict(type='OFATokenizer', name_or_path='OFA-Sys/OFA-base'),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=True,
)
train_dataloader = None # Eval only
# schedule settings
train_cfg = None
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/datasets/coco_vqa.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='OFA',
task='vqa',
vocab_size=59457,
embedding_dim=1024,
encoder_cfg=dict(
embed_images=dict(type='OFAResNet', depth=152),
num_layers=12,
num_heads=16,
),
decoder_cfg=dict(
num_layers=12,
num_heads=16,
),
generation_cfg=dict(
num_beams=20,
max_new_tokens=200,
length_penalty=0., # VQA doesn't require long answers.
use_cache=True,
),
tokenizer=dict(type='OFATokenizer', name_or_path='OFA-Sys/OFA-large'),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=True,
)
train_dataloader = None # Eval only
# schedule settings
train_cfg = None
val_cfg = dict()
test_cfg = dict()
# Otter
> [Otter: A Multi-Modal Model with In-Context Instruction Tuning](https://arxiv.org/abs/2305.03726)
<!-- [ALGORITHM] -->
## Abstract
Large language models (LLMs) have demonstrated significant universal capabilities as few/zero-shot learners in various tasks due to their pre-training on vast amounts of text data, as exemplified by GPT-3, which evolved into InstructGPT and ChatGPT and can effectively follow natural language instructions to accomplish real-world tasks. In this paper, we propose to introduce instruction tuning into multi-modal models, motivated by the Flamingo model's upstream interleaved format pretraining dataset. We adopt a similar approach to construct our MultI-Modal In-Context Instruction Tuning (MIMIC-IT) dataset. We then introduce Otter, a multi-modal model based on OpenFlamingo (the open-sourced version of DeepMind's Flamingo), trained on MIMIC-IT and showcasing improved instruction-following ability and in-context learning. We also optimize OpenFlamingo's implementation for researchers, democratizing the required training resources from 1$\times$ A100 GPU to 4$\times$ RTX-3090 GPUs, and integrate both OpenFlamingo and Otter into Hugging Face Transformers so that more researchers can incorporate the models into their customized training and inference pipelines.
<div align=center>
<img src="https://camo.githubusercontent.com/70613ab882a7827808148a2c577029d544371e707b0832a0b01151c54ce553c3/68747470733a2f2f692e706f7374696d672e63632f5477315a304243572f6f7474657276302d322d64656d6f2e706e67" width="80%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Use the model**
```python
import torch
from mmpretrain import get_model, inference_model
model = get_model('otter-9b_3rdparty_caption', pretrained=True, device='cuda', generation_cfg=dict(max_new_tokens=50))
out = inference_model(model, 'demo/cat-dog.png')
print(out)
# {'pred_caption': 'The image features two adorable small puppies sitting next to each other on the grass. One puppy is brown and white, while the other is tan and white. They appear to be relaxing outdoors, enjoying each other'}
```
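The same released weights back the VQA config listed below. A hedged sketch of question answering with them — the model name comes from the table in this README, while the question string and the expected `pred_answer` key are assumptions based on mmpretrain's VQA inferencer:

```python
from mmpretrain import get_model, inference_model

# VQA usage sketch; the question text is only illustrative.
model = get_model('otter-9b_3rdparty_vqa', pretrained=True, device='cuda')
out = inference_model(model, 'demo/cat-dog.png', 'How many animals are in the image?')
print(out)  # expected to contain a 'pred_answer' entry
```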
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/otter/otter-9b_caption.py https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth
```
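For multi-GPU evaluation, the standard distributed test launcher can be used as well; a sketch assuming 8 GPUs:

```shell
bash tools/dist_test.sh configs/otter/otter-9b_vqa.py https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth 8
```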
<!-- [TABS-END] -->
## Models and results
### Image Caption on COCO
| Model | Params (M) | BLEU-4 | CIDER | Config | Download |
| :---------------------------- | :--------: | :------: | :------: | :---------------------------: | :------------------------------------------------------------------------------------------------------: |
| `otter-9b_3rdparty_caption`\* | 8220.45 | Upcoming | Upcoming | [config](otter-9b_caption.py) | [model](https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth) |
*Models with * are converted from the [official repo](https://github.com/Luodian/Otter/tree/main). The config files of these models are only for inference. We haven't reproduced the training results.*
### Visual Question Answering on VQAv2
| Model | Params (M) | Accuracy | Config | Download |
| :------------------------ | :--------: | :------: | :-----------------------: | :------------------------------------------------------------------------------------------------------: |
| `otter-9b_3rdparty_vqa`\* | 8220.45 | Upcoming | [config](otter-9b_vqa.py) | [model](https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth) |
*Models with * are converted from the [official repo](https://github.com/Luodian/Otter/tree/main). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{li2023otter,
title={Otter: A Multi-Modal Model with In-Context Instruction Tuning},
author={Li, Bo and Zhang, Yuanhan and Chen, Liangyu and Wang, Jinghao and Yang, Jingkang and Liu, Ziwei},
journal={arXiv preprint arXiv:2305.03726},
year={2023}
}
@article{li2023mimicit,
title={MIMIC-IT: Multi-Modal In-Context Instruction Tuning},
author={Bo Li and Yuanhan Zhang and Liangyu Chen and Jinghao Wang and Fanyi Pu and Jingkang Yang and Chunyuan Li and Ziwei Liu},
year={2023},
eprint={2306.05425},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
Collections:
- Name: Otter
Metadata:
Architecture:
- Transformer
- Gated Cross-Attention Dense
Paper:
Title: 'Otter: A Multi-Modal Model with In-Context Instruction Tuning'
URL: https://arxiv.org/abs/2305.03726
README: configs/otter/README.md
Models:
- Name: otter-9b_3rdparty_caption
Metadata:
FLOPs: null
Parameters: 8220452880
In Collection: Otter
Results:
- Task: Image Caption
Dataset: COCO
Metrics:
BLEU-4: null
CIDER: null
Weights: https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth
Config: configs/otter/otter-9b_caption.py
Converted From:
Weights: https://huggingface.co/luodian/otter-9b-hf
Code: https://github.com/Luodian/Otter/tree/main
- Name: otter-9b_3rdparty_vqa
Metadata:
FLOPs: null
Parameters: 8220452880
In Collection: Otter
Results:
- Task: Visual Question Answering
Dataset: VQAv2
Metrics:
Accuracy: null
Weights: https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth
Config: configs/otter/otter-9b_vqa.py
Converted From:
Weights: https://huggingface.co/luodian/otter-9b-hf
Code: https://github.com/Luodian/Otter/tree/main
_base_ = [
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='Otter',
tokenizer=dict(type='LlamaTokenizer', name_or_path='huggyllama/llama-7b'),
vision_encoder=dict(
type='VisionTransformer',
arch='l',
patch_size=14,
pre_norm=True,
norm_cfg=dict(type='LN', eps=1e-5),
layer_cfgs=dict(act_cfg=dict(type='mmpretrain.QuickGELU')),
final_norm=False,
out_type='raw',
pretrained=(
'https://download.openmmlab.com/mmclassification/v0/clip/'
'vit-large-p14_clip-openai-pre_3rdparty_20230517-95e2af0b.pth'),
),
lang_encoder=dict(
base=dict(
type='AutoModelForCausalLM',
name_or_path='huggyllama/llama-7b',
local_files_only=True),
adapter=dict(
type='FlamingoLMAdapter',
vis_hidden_size=1024,
cross_attn_every_n_layers=4,
use_media_placement_augmentation=False,
only_attend_previous=True,
),
),
task='caption',
final_prompt_tmpl='<image>User:Please describe the image. GPT:<answer>',
generation_cfg=dict(
num_beams=3, max_new_tokens=24, no_repeat_ngram_size=3),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[122.770938, 116.7460125, 104.09373615],
std=[68.5005327, 66.6321579, 70.32316305],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=224,
interpolation='bicubic',
backend='pillow'),
dict(type='CenterCrop', crop_size=(224, 224)),
dict(
type='PackInputs',
algorithm_keys=['gt_caption'],
meta_keys=['image_id'],
),
]
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='COCOCaption',
data_root='data/coco',
ann_file='annotations/coco_karpathy_val.json',
pipeline=test_pipeline,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
val_evaluator = dict(
type='COCOCaption',
ann_file='data/coco/annotations/coco_karpathy_val_gt.json')
# For the standard test split, configure the test dataset manually.
test_dataloader = val_dataloader
test_evaluator = val_evaluator
# schedule settings
val_cfg = dict()
test_cfg = dict()
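Generation settings such as `max_new_tokens` can be overridden at test time without editing this file, using MMEngine's standard `--cfg-options` flag; the value below is only illustrative:

```shell
python tools/test.py configs/otter/otter-9b_caption.py \
    https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth \
    --cfg-options model.generation_cfg.max_new_tokens=50
```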
_base_ = [
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='Otter',
tokenizer=dict(type='LlamaTokenizer', name_or_path='huggyllama/llama-7b'),
vision_encoder=dict(
type='VisionTransformer',
arch='l',
patch_size=14,
pre_norm=True,
norm_cfg=dict(type='LN', eps=1e-5),
layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
final_norm=False,
out_type='raw',
pretrained=(
'https://download.openmmlab.com/mmclassification/v0/clip/'
'vit-large-p14_clip-openai-pre_3rdparty_20230517-95e2af0b.pth'),
),
lang_encoder=dict(
base=dict(
type='AutoModelForCausalLM',
name_or_path='huggyllama/llama-7b',
local_files_only=True),
adapter=dict(
type='FlamingoLMAdapter',
vis_hidden_size=1024,
cross_attn_every_n_layers=4,
use_media_placement_augmentation=False,
only_attend_previous=True,
),
),
task='vqa',
final_prompt_tmpl='<image>User:{question} GPT:<answer>',
generation_cfg=dict(
num_beams=3, max_new_tokens=24, no_repeat_ngram_size=3),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[122.770938, 116.7460125, 104.09373615],
std=[68.5005327, 66.6321579, 70.32316305],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=224,
interpolation='bicubic',
backend='pillow'),
dict(type='CenterCrop', crop_size=(224, 224)),
dict(
type='PackInputs',
algorithm_keys=['question', 'gt_answer', 'gt_answer_weight', 'shots'],
meta_keys=['image_id'],
),
]
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOVQA',
data_root='data/coco',
data_prefix='val2014',
question_file='annotations/v2_OpenEnded_mscoco_val2014_questions.json',
ann_file='annotations/v2_mscoco_val2014_annotations.json',
pipeline=test_pipeline,
num_shots=0,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
val_evaluator = dict(type='VQAAcc')
test_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOVQA',
data_root='data/coco',
data_prefix='test2015',
question_file=
'annotations/v2_OpenEnded_mscoco_test-dev2015_questions.json',
pipeline=test_pipeline,
num_shots=0,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
test_evaluator = dict(type='ReportVQA', file_path='vqa_test-dev.json')
# schedule settings
val_cfg = dict()
test_cfg = dict()
# PoolFormer
> [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418)
<!-- [ALGORITHM] -->
## Abstract
Transformers have shown great potential in computer vision tasks. A common belief is their attention-based token mixer module contributes most to their competence. However, recent works show the attention-based module in transformers can be replaced by spatial MLPs and the resulting models still perform quite well. Based on this observation, we hypothesize that the general architecture of the transformers, instead of the specific token mixer module, is more essential to the model's performance. To verify this, we deliberately replace the attention module in transformers with an embarrassingly simple spatial pooling operator to conduct only basic token mixing. Surprisingly, we observe that the derived model, termed PoolFormer, achieves competitive performance on multiple computer vision tasks. For example, on ImageNet-1K, PoolFormer achieves 82.1% top-1 accuracy, surpassing well-tuned vision transformer/MLP-like baselines DeiT-B/ResMLP-B24 by 0.3%/1.1% accuracy with 35%/52% fewer parameters and 49%/61% fewer MACs. The effectiveness of PoolFormer verifies our hypothesis and urges us to initiate the concept of "MetaFormer", a general architecture abstracted from transformers without specifying the token mixer. Based on the extensive experiments, we argue that MetaFormer is the key player in achieving superior results for recent transformer and MLP-like models on vision tasks. This work calls for more future research dedicated to improving MetaFormer instead of focusing on the token mixer modules. Additionally, our proposed PoolFormer could serve as a starting baseline for future MetaFormer architecture design.
<div align=center>
<img src="https://user-images.githubusercontent.com/15921929/144710761-1635f59a-abde-4946-984c-a2c3f22a19d2.png" width="100%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('poolformer-s12_3rdparty_32xb128_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('poolformer-s12_3rdparty_32xb128_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
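`extract_feat` returns a tuple of feature tensors rather than classification scores; a small, purely illustrative sketch to inspect their shapes with the same model as above:

```python
import torch
from mmpretrain import get_model

model = get_model('poolformer-s12_3rdparty_32xb128_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)

# Each element corresponds to one output stage of the backbone/neck pipeline.
feats = model.extract_feat(inputs)
for i, feat in enumerate(feats):
    print(i, feat.shape)
```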
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/poolformer/poolformer-s12_32xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s12_3rdparty_32xb128_in1k_20220414-f8d83051.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :--------------------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :--------------------------------------: | :---------------------------------------------------------------------: |
| `poolformer-s12_3rdparty_32xb128_in1k`\* | From scratch | 11.92 | 1.87 | 77.24 | 93.51 | [config](poolformer-s12_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s12_3rdparty_32xb128_in1k_20220414-f8d83051.pth) |
| `poolformer-s24_3rdparty_32xb128_in1k`\* | From scratch | 21.39 | 3.51 | 80.33 | 95.05 | [config](poolformer-s24_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s24_3rdparty_32xb128_in1k_20220414-d7055904.pth) |
| `poolformer-s36_3rdparty_32xb128_in1k`\* | From scratch | 30.86 | 5.15 | 81.43 | 95.45 | [config](poolformer-s36_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s36_3rdparty_32xb128_in1k_20220414-d78ff3e8.pth) |
| `poolformer-m36_3rdparty_32xb128_in1k`\* | From scratch | 56.17 | 8.96 | 82.14 | 95.71 | [config](poolformer-m36_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m36_3rdparty_32xb128_in1k_20220414-c55e0949.pth) |
| `poolformer-m48_3rdparty_32xb128_in1k`\* | From scratch | 73.47 | 11.80 | 82.51 | 95.95 | [config](poolformer-m48_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m48_3rdparty_32xb128_in1k_20220414-9378f3eb.pth) |
*Models with * are converted from the [official repo](https://github.com/sail-sg/poolformer). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@inproceedings{yu2022metaformer,
title={Metaformer is actually what you need for vision},
author={Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng},
booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
pages={10819--10829},
year={2022}
}
```
Collections:
- Name: PoolFormer
Metadata:
Training Data: ImageNet-1k
Architecture:
- Pooling
- 1x1 Convolution
- LayerScale
Paper:
URL: https://arxiv.org/abs/2111.11418
Title: MetaFormer is Actually What You Need for Vision
README: configs/poolformer/README.md
Code:
Version: v0.22.1
URL: https://github.com/open-mmlab/mmpretrain/blob/v0.22.1/mmcls/models/backbones/poolformer.py
Models:
- Name: poolformer-s12_3rdparty_32xb128_in1k
Metadata:
FLOPs: 1871399424
Parameters: 11915176
In Collection: PoolFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 77.24
Top 5 Accuracy: 93.51
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s12_3rdparty_32xb128_in1k_20220414-f8d83051.pth
Config: configs/poolformer/poolformer-s12_32xb128_in1k.py
Converted From:
Weights: https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_s12.pth.tar
Code: https://github.com/sail-sg/poolformer
- Name: poolformer-s24_3rdparty_32xb128_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 3510411008
Parameters: 21388968
In Collection: PoolFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 80.33
Top 5 Accuracy: 95.05
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s24_3rdparty_32xb128_in1k_20220414-d7055904.pth
Config: configs/poolformer/poolformer-s24_32xb128_in1k.py
Converted From:
Weights: https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_s24.pth.tar
Code: https://github.com/sail-sg/poolformer
- Name: poolformer-s36_3rdparty_32xb128_in1k
Metadata:
FLOPs: 5149422592
Parameters: 30862760
In Collection: PoolFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.43
Top 5 Accuracy: 95.45
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s36_3rdparty_32xb128_in1k_20220414-d78ff3e8.pth
Config: configs/poolformer/poolformer-s36_32xb128_in1k.py
Converted From:
Weights: https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_s36.pth.tar
Code: https://github.com/sail-sg/poolformer
- Name: poolformer-m36_3rdparty_32xb128_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 8960175744
Parameters: 56172520
In Collection: PoolFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.14
Top 5 Accuracy: 95.71
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m36_3rdparty_32xb128_in1k_20220414-c55e0949.pth
Config: configs/poolformer/poolformer-m36_32xb128_in1k.py
Converted From:
Weights: https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_m36.pth.tar
Code: https://github.com/sail-sg/poolformer
- Name: poolformer-m48_3rdparty_32xb128_in1k
Metadata:
FLOPs: 11801805696
Parameters: 73473448
In Collection: PoolFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.51
Top 5 Accuracy: 95.95
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m48_3rdparty_32xb128_in1k_20220414-9378f3eb.pth
Config: configs/poolformer/poolformer-m48_32xb128_in1k.py
Converted From:
Weights: https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_m48.pth.tar
Code: https://github.com/sail-sg/poolformer
_base_ = [
'../_base_/models/poolformer/poolformer_m36.py',
'../_base_/datasets/imagenet_bs128_poolformer_medium_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
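The `auto_scale_lr` note amounts to the linear LR scaling rule that MMEngine applies when automatic scaling is enabled; a minimal sketch of the arithmetic (the helper name is just for illustration):

```python
def scaled_lr(base_lr: float, actual_batch_size: int, base_batch_size: int = 4096) -> float:
    """Linear scaling: the LR grows or shrinks with the total batch size.

    base_lr=4e-3 corresponds to base_batch_size = 32 GPUs x 128 samples.
    """
    return base_lr * actual_batch_size / base_batch_size

# e.g. 8 GPUs x 128 samples = 1024 images per iteration:
print(scaled_lr(4e-3, 1024))  # 0.001
```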
_base_ = [
'../_base_/models/poolformer/poolformer_m48.py',
'../_base_/datasets/imagenet_bs128_poolformer_medium_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/models/poolformer/poolformer_s12.py',
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/models/poolformer/poolformer_s24.py',
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/models/poolformer/poolformer_s36.py',
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# RegNet
> [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678)
<!-- [ALGORITHM] -->
## Abstract
In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/142572813-5dad3317-9d58-4177-971f-d346e01fb3c4.png" width=60%/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('regnetx-400mf_8xb128_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('regnetx-400mf_8xb128_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/regnet/regnetx-400mf_8xb128_in1k.py
```
Test:
```shell
python tools/test.py configs/regnet/regnetx-400mf_8xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211213-89bfc226.pth
```
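Distributed training and testing follow the usual launcher scripts; a sketch assuming 8 GPUs:

```shell
bash tools/dist_train.sh configs/regnet/regnetx-400mf_8xb128_in1k.py 8
bash tools/dist_test.sh configs/regnet/regnetx-400mf_8xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211213-89bfc226.pth 8
```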
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :-------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :------------------------------------: | :------------------------------------------------------------------------------------: |
| `regnetx-400mf_8xb128_in1k` | From scratch | 5.16 | 0.41 | 72.56 | 90.78 | [config](regnetx-400mf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211213-89bfc226.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211208_143316.json) |
| `regnetx-800mf_8xb128_in1k` | From scratch | 7.26 | 0.81 | 74.76 | 92.32 | [config](regnetx-800mf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-800mf_8xb128_in1k_20211213-222b0f11.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-800mf_8xb128_in1k_20211207_143037.log.json) |
| `regnetx-1.6gf_8xb128_in1k` | From scratch | 9.19 | 1.63 | 76.84 | 93.31 | [config](regnetx-1.6gf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-1.6gf_8xb128_in1k_20211213-d1b89758.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-1.6gf_8xb128_in1k_20211208_143018.log.json) |
| `regnetx-3.2gf_8xb64_in1k`  | From scratch | 15.30 | 3.21 | 78.09 | 94.08 | [config](regnetx-3.2gf_8xb64_in1k.py)  | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-3.2gf_8xb64_in1k_20211213-1fdd82ae.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-3.2gf_8xb64_in1k_20211208_142720.log.json) |
| `regnetx-4.0gf_8xb64_in1k` | From scratch | 22.12 | 4.00 | 78.60 | 94.17 | [config](regnetx-4.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-4.0gf_8xb64_in1k_20211213-efed675c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-4.0gf_8xb64_in1k_20211207_150431.log.json) |
| `regnetx-6.4gf_8xb64_in1k` | From scratch | 26.21 | 6.51 | 79.38 | 94.65 | [config](regnetx-6.4gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-6.4gf_8xb64_in1k_20211215-5c6089da.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-6.4gf_8xb64_in1k_20211213_172748.log.json) |
| `regnetx-8.0gf_8xb64_in1k` | From scratch | 39.57 | 8.03 | 79.12 | 94.51 | [config](regnetx-8.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-8.0gf_8xb64_in1k_20211213-9a9fcc76.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-8.0gf_8xb64_in1k_20211208_103250.log.json) |
| `regnetx-12gf_8xb64_in1k` | From scratch | 46.11 | 12.15 | 79.67 | 95.03 | [config](regnetx-12gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-12gf_8xb64_in1k_20211213-5df8c2f8.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-12gf_8xb64_in1k_20211208_143713.log.json) |
## Citation
```bibtex
@article{radosavovic2020designing,
title={Designing Network Design Spaces},
author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár},
year={2020},
eprint={2003.13678},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
Collections:
- Name: RegNet
Metadata:
Training Data: ImageNet-1k
Architecture:
- Neural Architecture Search
- Design Space Design
- Precise BN
- SGD with nesterov
Paper:
URL: https://arxiv.org/abs/2003.13678
Title: Designing Network Design Spaces
README: configs/regnet/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/v0.18.0/mmcls/models/backbones/regnet.py
Version: v0.18.0
Models:
- Name: regnetx-400mf_8xb128_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-400mf_8xb128_in1k.py
Metadata:
FLOPs: 410000000 # 0.41G
Parameters: 5160000 # 5.16M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 72.56
Top 5 Accuracy: 90.78
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211213-89bfc226.pth
- Name: regnetx-800mf_8xb128_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-800mf_8xb128_in1k.py
Metadata:
FLOPs: 810000000 # 0.81G
Parameters: 7260000 # 7.26M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 74.76
Top 5 Accuracy: 92.32
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-800mf_8xb128_in1k_20211213-222b0f11.pth
- Name: regnetx-1.6gf_8xb128_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-1.6gf_8xb128_in1k.py
Metadata:
FLOPs: 1630000000 # 1.63G
Parameters: 9190000 # 9.19M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 76.84
Top 5 Accuracy: 93.31
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-1.6gf_8xb128_in1k_20211213-d1b89758.pth
- Name: regnetx-3.2gf_8xb64_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-3.2gf_8xb64_in1k.py
Metadata:
FLOPs: 3210000000 # 3.21G
Parameters: 15300000 # 15.3M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 78.09
Top 5 Accuracy: 94.08
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-3.2gf_8xb64_in1k_20211213-1fdd82ae.pth
- Name: regnetx-4.0gf_8xb64_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-4.0gf_8xb64_in1k.py
Metadata:
FLOPs: 4000000000 # 4G
Parameters: 22120000 # 22.12M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 78.60
Top 5 Accuracy: 94.17
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-4.0gf_8xb64_in1k_20211213-efed675c.pth
- Name: regnetx-6.4gf_8xb64_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-6.4gf_8xb64_in1k.py
Metadata:
FLOPs: 6510000000 # 6.51G
Parameters: 26210000 # 26.21M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 79.38
Top 5 Accuracy: 94.65
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-6.4gf_8xb64_in1k_20211215-5c6089da.pth
- Name: regnetx-8.0gf_8xb64_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-8.0gf_8xb64_in1k.py
Metadata:
FLOPs: 8030000000 # 8.03G
Parameters: 39570000 # 39.57M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 79.12
Top 5 Accuracy: 94.51
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-8.0gf_8xb64_in1k_20211213-9a9fcc76.pth
- Name: regnetx-12gf_8xb64_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-12gf_8xb64_in1k.py
Metadata:
FLOPs: 12150000000 # 12.15G
Parameters: 46110000 # 46.11M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 79.67
Top 5 Accuracy: 95.03
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-12gf_8xb64_in1k_20211213-5df8c2f8.pth
_base_ = ['./regnetx-400mf_8xb128_in1k.py']
# model settings
model = dict(
backbone=dict(type='RegNet', arch='regnetx_1.6gf'),
head=dict(in_channels=912, ))
_base_ = ['./regnetx-400mf_8xb128_in1k.py']
# model settings
model = dict(
backbone=dict(type='RegNet', arch='regnetx_12gf'),
head=dict(in_channels=2240, ))
# dataset settings
train_dataloader = dict(batch_size=64)
# schedule settings
# for batch_size 512, use lr = 0.4
optim_wrapper = dict(optimizer=dict(lr=0.4))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (8 GPUs) x (64 samples per GPU)
auto_scale_lr = dict(base_batch_size=512)
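The backbone/head pairs in these configs can also be built directly from the registry for quick shape checks; a hedged sketch (the channel count comes from the `in_channels=2240` setting above, the rest is illustrative):

```python
import torch
from mmpretrain.registry import MODELS

# Build only the RegNet backbone with the same dict used in the config above.
backbone = MODELS.build(dict(type='RegNet', arch='regnetx_12gf'))
backbone.eval()
with torch.no_grad():
    feats = backbone(torch.rand(1, 3, 224, 224))
print([f.shape for f in feats])  # the last stage should expose 2240 channels,
                                 # matching head.in_channels in the config
```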