_base_ = [
'../_base_/datasets/coco_caption.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='OFA',
task='caption',
vocab_size=59457,
embedding_dim=768,
encoder_cfg=dict(
embed_images=dict(type='OFAResNet', depth=101),
num_layers=6,
),
decoder_cfg=dict(num_layers=6),
generation_cfg=dict(use_cache=True),
tokenizer=dict(type='OFATokenizer', name_or_path='OFA-Sys/OFA-base'),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='Resize', scale=(480, 480)),
dict(type='PackInputs', meta_keys=('image_id', )),
]
train_dataloader = None
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# schedule settings
train_cfg = None
val_cfg = dict()
test_cfg = dict()
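This config is evaluation-only (`train_dataloader` and `train_cfg` are `None`), so it is meant to be driven by the test entry point. A minimal sketch, assuming placeholder paths for the config file and a matching OFA caption checkpoint:

```shell
# Placeholders: substitute the real config path and a matching checkpoint.
python tools/test.py configs/ofa/OFA_CAPTION_CONFIG.py OFA_CAPTION_CHECKPOINT.pth
```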
_base_ = [
'../_base_/datasets/refcoco.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='OFA',
task='refcoco',
vocab_size=59457,
embedding_dim=768,
encoder_cfg=dict(
embed_images=dict(type='OFAResNet', depth=101),
num_layers=6,
),
decoder_cfg=dict(num_layers=6),
generation_cfg=dict(use_cache=True),
tokenizer=dict(type='OFATokenizer', name_or_path='OFA-Sys/OFA-base'),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='Resize', scale=(512, 512)),
dict(
type='PackInputs',
algorithm_keys=['text', 'gt_bboxes'],
meta_keys=['image_id', 'scale_factor'],
),
]
train_dataloader = None
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# schedule settings
train_cfg = None
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/datasets/coco_vqa.py',
'../_base_/default_runtime.py',
]
ANS2LABEL = 'https://ofa-beijing.oss-cn-beijing.aliyuncs.com/datasets/vqa_data/trainval_ans2label.pkl' # noqa: E501
# model settings
model = dict(
type='OFA',
task='vqa',
vocab_size=59457,
embedding_dim=768,
ans2label=ANS2LABEL,
encoder_cfg=dict(
embed_images=dict(type='OFAResNet', depth=101),
num_layers=6,
num_heads=12,
),
decoder_cfg=dict(
num_layers=6,
num_heads=12,
),
generation_cfg=dict(
num_beams=5,
max_new_tokens=200,
length_penalty=0., # VQA doesn't require long answers.
use_cache=True,
),
tokenizer=dict(type='OFATokenizer', name_or_path='OFA-Sys/OFA-base'),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
scale=(480, 480),
interpolation='bicubic',
backend='pillow'),
dict(type='OFAAddObjects'),
dict(
type='PackInputs',
algorithm_keys=[
'question', 'gt_answer', 'gt_answer_weight', 'decoder_prompt'
],
meta_keys=['question_id', 'image_id'],
),
]
train_dataloader = None # Eval only
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# schedule settings
train_cfg = None
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/datasets/coco_vqa.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='OFA',
task='vqa',
vocab_size=59457,
embedding_dim=768,
encoder_cfg=dict(
embed_images=dict(type='OFAResNet', depth=101),
num_layers=6,
num_heads=12,
),
decoder_cfg=dict(
num_layers=6,
num_heads=12,
),
generation_cfg=dict(
num_beams=20,
max_new_tokens=200,
length_penalty=0., # VQA doesn't require long answers.
use_cache=True,
),
tokenizer=dict(type='OFATokenizer', name_or_path='OFA-Sys/OFA-base'),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=True,
)
train_dataloader = None # Eval only
# schedule settings
train_cfg = None
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/datasets/coco_vqa.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='OFA',
task='vqa',
vocab_size=59457,
embedding_dim=1024,
encoder_cfg=dict(
embed_images=dict(type='OFAResNet', depth=152),
num_layers=12,
num_heads=16,
),
decoder_cfg=dict(
num_layers=12,
num_heads=16,
),
generation_cfg=dict(
num_beams=20,
max_new_tokens=200,
length_penalty=0., # VQA doesn't require long answers.
use_cache=True,
),
tokenizer=dict(type='OFATokenizer', name_or_path='OFA-Sys/OFA-large'),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[127.5, 127.5, 127.5],
std=[127.5, 127.5, 127.5],
to_rgb=True,
)
train_dataloader = None # Eval only
# schedule settings
train_cfg = None
val_cfg = dict()
test_cfg = dict()
# Otter
> [Otter: A Multi-Modal Model with In-Context Instruction Tuning](https://arxiv.org/abs/2305.03726)
<!-- [ALGORITHM] -->
## Abstract
Large language models (LLMs) have demonstrated significant universal capabilities as few/zero-shot learners in various tasks due to their pre-training on vast amounts of text data, as exemplified by GPT-3, which evolved into InstructGPT and ChatGPT and can effectively follow natural language instructions to accomplish real-world tasks. In this paper, we propose to introduce instruction tuning into multi-modal models, motivated by the Flamingo model's upstream interleaved format pretraining dataset. We adopt a similar approach to construct our MultI-Modal In-Context Instruction Tuning (MIMIC-IT) dataset. We then introduce Otter, a multi-modal model based on OpenFlamingo (the open-sourced version of DeepMind's Flamingo), trained on MIMIC-IT and showcasing improved instruction-following ability and in-context learning. We also optimize OpenFlamingo's implementation for researchers, democratizing the required training resources from 1$\times$ A100 GPU to 4$\times$ RTX-3090 GPUs, and integrate both OpenFlamingo and Otter into Hugging Face Transformers so that more researchers can incorporate the models into their customized training and inference pipelines.
<div align=center>
<img src="https://camo.githubusercontent.com/70613ab882a7827808148a2c577029d544371e707b0832a0b01151c54ce553c3/68747470733a2f2f692e706f7374696d672e63632f5477315a304243572f6f7474657276302d322d64656d6f2e706e67" width="80%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Use the model**
```python
import torch
from mmpretrain import get_model, inference_model
model = get_model('otter-9b_3rdparty_caption', pretrained=True, device='cuda', generation_cfg=dict(max_new_tokens=50))
out = inference_model(model, 'demo/cat-dog.png')
print(out)
# {'pred_caption': 'The image features two adorable small puppies sitting next to each other on the grass. One puppy is brown and white, while the other is tan and white. They appear to be relaxing outdoors, enjoying each other'}
```
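The same released weights back the VQA config listed below. A hedged sketch of question answering with them — the model name comes from the table in this README, while the question string and the expected `pred_answer` key are assumptions based on mmpretrain's VQA inferencer:

```python
from mmpretrain import get_model, inference_model

# VQA usage sketch; the question text is only illustrative.
model = get_model('otter-9b_3rdparty_vqa', pretrained=True, device='cuda')
out = inference_model(model, 'demo/cat-dog.png', 'How many animals are in the image?')
print(out)  # expected to contain a 'pred_answer' entry
```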
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/otter/otter-9b_caption.py https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth
```
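For multi-GPU evaluation, the standard distributed test launcher can be used as well; a sketch assuming 8 GPUs:

```shell
bash tools/dist_test.sh configs/otter/otter-9b_vqa.py https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth 8
```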
<!-- [TABS-END] -->
## Models and results
### Image Caption on COCO
| Model | Params (M) | BLEU-4 | CIDER | Config | Download |
| :---------------------------- | :--------: | :------: | :------: | :---------------------------: | :------------------------------------------------------------------------------------------------------: |
| `otter-9b_3rdparty_caption`\* | 8220.45 | Upcoming | Upcoming | [config](otter-9b_caption.py) | [model](https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth) |
*Models with * are converted from the [official repo](https://github.com/Luodian/Otter/tree/main). The config files of these models are only for inference. We haven't reproduced the training results.*
### Visual Question Answering on VQAv2
| Model | Params (M) | Accuracy | Config | Download |
| :------------------------ | :--------: | :------: | :-----------------------: | :------------------------------------------------------------------------------------------------------: |
| `otter-9b_3rdparty_vqa`\* | 8220.45 | Upcoming | [config](otter-9b_vqa.py) | [model](https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth) |
*Models with * are converted from the [official repo](https://github.com/Luodian/Otter/tree/main). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{li2023otter,
title={Otter: A Multi-Modal Model with In-Context Instruction Tuning},
author={Li, Bo and Zhang, Yuanhan and Chen, Liangyu and Wang, Jinghao and Yang, Jingkang and Liu, Ziwei},
journal={arXiv preprint arXiv:2305.03726},
year={2023}
}
@article{li2023mimicit,
title={MIMIC-IT: Multi-Modal In-Context Instruction Tuning},
author={Bo Li and Yuanhan Zhang and Liangyu Chen and Jinghao Wang and Fanyi Pu and Jingkang Yang and Chunyuan Li and Ziwei Liu},
year={2023},
eprint={2306.05425},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
Collections:
- Name: Otter
Metadata:
Architecture:
- Transformer
- Gated Cross-Attention Dense
Paper:
Title: 'Otter: A Multi-Modal Model with In-Context Instruction Tuning'
URL: https://arxiv.org/abs/2305.03726
README: configs/otter/README.md
Models:
- Name: otter-9b_3rdparty_caption
Metadata:
FLOPs: null
Parameters: 8220452880
In Collection: Otter
Results:
- Task: Image Caption
Dataset: COCO
Metrics:
BLEU-4: null
CIDER: null
Weights: https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth
Config: configs/otter/otter-9b_caption.py
Converted From:
Weights: https://huggingface.co/luodian/otter-9b-hf
Code: https://github.com/Luodian/Otter/tree/main
- Name: otter-9b_3rdparty_vqa
Metadata:
FLOPs: null
Parameters: 8220452880
In Collection: Otter
Results:
- Task: Visual Question Answering
Dataset: VQAv2
Metrics:
Accuracy: null
Weights: https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth
Config: configs/otter/otter-9b_vqa.py
Converted From:
Weights: https://huggingface.co/luodian/otter-9b-hf
Code: https://github.com/Luodian/Otter/tree/main
_base_ = [
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='Otter',
tokenizer=dict(type='LlamaTokenizer', name_or_path='huggyllama/llama-7b'),
vision_encoder=dict(
type='VisionTransformer',
arch='l',
patch_size=14,
pre_norm=True,
norm_cfg=dict(type='LN', eps=1e-5),
layer_cfgs=dict(act_cfg=dict(type='mmpretrain.QuickGELU')),
final_norm=False,
out_type='raw',
pretrained=(
'https://download.openmmlab.com/mmclassification/v0/clip/'
'vit-large-p14_clip-openai-pre_3rdparty_20230517-95e2af0b.pth'),
),
lang_encoder=dict(
base=dict(
type='AutoModelForCausalLM',
name_or_path='huggyllama/llama-7b',
local_files_only=True),
adapter=dict(
type='FlamingoLMAdapter',
vis_hidden_size=1024,
cross_attn_every_n_layers=4,
use_media_placement_augmentation=False,
only_attend_previous=True,
),
),
task='caption',
final_prompt_tmpl='<image>User:Please describe the image. GPT:<answer>',
generation_cfg=dict(
num_beams=3, max_new_tokens=24, no_repeat_ngram_size=3),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[122.770938, 116.7460125, 104.09373615],
std=[68.5005327, 66.6321579, 70.32316305],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=224,
interpolation='bicubic',
backend='pillow'),
dict(type='CenterCrop', crop_size=(224, 224)),
dict(
type='PackInputs',
algorithm_keys=['gt_caption'],
meta_keys=['image_id'],
),
]
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='COCOCaption',
data_root='data/coco',
ann_file='annotations/coco_karpathy_val.json',
pipeline=test_pipeline,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
val_evaluator = dict(
type='COCOCaption',
ann_file='data/coco/annotations/coco_karpathy_val_gt.json')
# For the standard test split, configure the test dataset manually.
test_dataloader = val_dataloader
test_evaluator = val_evaluator
# schedule settings
val_cfg = dict()
test_cfg = dict()
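Generation settings such as `max_new_tokens` can be overridden at test time without editing this file, using MMEngine's standard `--cfg-options` flag; the value below is only illustrative:

```shell
python tools/test.py configs/otter/otter-9b_caption.py \
    https://download.openmmlab.com/mmclassification/v1/otter/otter-9b-adapter_20230613-51c5be8d.pth \
    --cfg-options model.generation_cfg.max_new_tokens=50
```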
_base_ = [
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='Otter',
tokenizer=dict(type='LlamaTokenizer', name_or_path='huggyllama/llama-7b'),
vision_encoder=dict(
type='VisionTransformer',
arch='l',
patch_size=14,
pre_norm=True,
norm_cfg=dict(type='LN', eps=1e-5),
layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
final_norm=False,
out_type='raw',
pretrained=(
'https://download.openmmlab.com/mmclassification/v0/clip/'
'vit-large-p14_clip-openai-pre_3rdparty_20230517-95e2af0b.pth'),
),
lang_encoder=dict(
base=dict(
type='AutoModelForCausalLM',
name_or_path='huggyllama/llama-7b',
local_files_only=True),
adapter=dict(
type='FlamingoLMAdapter',
vis_hidden_size=1024,
cross_attn_every_n_layers=4,
use_media_placement_augmentation=False,
only_attend_previous=True,
),
),
task='vqa',
final_prompt_tmpl='<image>User:{question} GPT:<answer>',
generation_cfg=dict(
num_beams=3, max_new_tokens=24, no_repeat_ngram_size=3),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[122.770938, 116.7460125, 104.09373615],
std=[68.5005327, 66.6321579, 70.32316305],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=224,
interpolation='bicubic',
backend='pillow'),
dict(type='CenterCrop', crop_size=(224, 224)),
dict(
type='PackInputs',
algorithm_keys=['question', 'gt_answer', 'gt_answer_weight', 'shots'],
meta_keys=['image_id'],
),
]
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOVQA',
data_root='data/coco',
data_prefix='val2014',
question_file='annotations/v2_OpenEnded_mscoco_val2014_questions.json',
ann_file='annotations/v2_mscoco_val2014_annotations.json',
pipeline=test_pipeline,
num_shots=0,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
val_evaluator = dict(type='VQAAcc')
test_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOVQA',
data_root='data/coco',
data_prefix='test2015',
question_file=
'annotations/v2_OpenEnded_mscoco_test-dev2015_questions.json',
pipeline=test_pipeline,
num_shots=0,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
test_evaluator = dict(type='ReportVQA', file_path='vqa_test-dev.json')
# schedule settings
val_cfg = dict()
test_cfg = dict()
# PoolFormer
> [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418)
<!-- [ALGORITHM] -->
## Abstract
Transformers have shown great potential in computer vision tasks. A common belief is their attention-based token mixer module contributes most to their competence. However, recent works show the attention-based module in transformers can be replaced by spatial MLPs and the resulting models still perform quite well. Based on this observation, we hypothesize that the general architecture of the transformers, instead of the specific token mixer module, is more essential to the model's performance. To verify this, we deliberately replace the attention module in transformers with an embarrassingly simple spatial pooling operator to conduct only basic token mixing. Surprisingly, we observe that the derived model, termed PoolFormer, achieves competitive performance on multiple computer vision tasks. For example, on ImageNet-1K, PoolFormer achieves 82.1% top-1 accuracy, surpassing well-tuned vision transformer/MLP-like baselines DeiT-B/ResMLP-B24 by 0.3%/1.1% accuracy with 35%/52% fewer parameters and 49%/61% fewer MACs. The effectiveness of PoolFormer verifies our hypothesis and urges us to initiate the concept of "MetaFormer", a general architecture abstracted from transformers without specifying the token mixer. Based on the extensive experiments, we argue that MetaFormer is the key player in achieving superior results for recent transformer and MLP-like models on vision tasks. This work calls for more future research dedicated to improving MetaFormer instead of focusing on the token mixer modules. Additionally, our proposed PoolFormer could serve as a starting baseline for future MetaFormer architecture design.
<div align=center>
<img src="https://user-images.githubusercontent.com/15921929/144710761-1635f59a-abde-4946-984c-a2c3f22a19d2.png" width="100%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('poolformer-s12_3rdparty_32xb128_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('poolformer-s12_3rdparty_32xb128_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
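`extract_feat` returns a tuple of feature tensors rather than classification scores; a small, purely illustrative sketch to inspect their shapes with the same model as above:

```python
import torch
from mmpretrain import get_model

model = get_model('poolformer-s12_3rdparty_32xb128_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)

# Each element corresponds to one output stage of the backbone/neck pipeline.
feats = model.extract_feat(inputs)
for i, feat in enumerate(feats):
    print(i, feat.shape)
```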
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/poolformer/poolformer-s12_32xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s12_3rdparty_32xb128_in1k_20220414-f8d83051.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :--------------------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :--------------------------------------: | :---------------------------------------------------------------------: |
| `poolformer-s12_3rdparty_32xb128_in1k`\* | From scratch | 11.92 | 1.87 | 77.24 | 93.51 | [config](poolformer-s12_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s12_3rdparty_32xb128_in1k_20220414-f8d83051.pth) |
| `poolformer-s24_3rdparty_32xb128_in1k`\* | From scratch | 21.39 | 3.51 | 80.33 | 95.05 | [config](poolformer-s24_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s24_3rdparty_32xb128_in1k_20220414-d7055904.pth) |
| `poolformer-s36_3rdparty_32xb128_in1k`\* | From scratch | 30.86 | 5.15 | 81.43 | 95.45 | [config](poolformer-s36_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s36_3rdparty_32xb128_in1k_20220414-d78ff3e8.pth) |
| `poolformer-m36_3rdparty_32xb128_in1k`\* | From scratch | 56.17 | 8.96 | 82.14 | 95.71 | [config](poolformer-m36_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m36_3rdparty_32xb128_in1k_20220414-c55e0949.pth) |
| `poolformer-m48_3rdparty_32xb128_in1k`\* | From scratch | 73.47 | 11.80 | 82.51 | 95.95 | [config](poolformer-m48_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m48_3rdparty_32xb128_in1k_20220414-9378f3eb.pth) |
*Models with * are converted from the [official repo](https://github.com/sail-sg/poolformer). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@inproceedings{yu2022metaformer,
title={Metaformer is actually what you need for vision},
author={Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng},
booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
pages={10819--10829},
year={2022}
}
```
Collections:
- Name: PoolFormer
Metadata:
Training Data: ImageNet-1k
Architecture:
- Pooling
- 1x1 Convolution
- LayerScale
Paper:
URL: https://arxiv.org/abs/2111.11418
Title: MetaFormer is Actually What You Need for Vision
README: configs/poolformer/README.md
Code:
Version: v0.22.1
URL: https://github.com/open-mmlab/mmpretrain/blob/v0.22.1/mmcls/models/backbones/poolformer.py
Models:
- Name: poolformer-s12_3rdparty_32xb128_in1k
Metadata:
FLOPs: 1871399424
Parameters: 11915176
In Collection: PoolFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 77.24
Top 5 Accuracy: 93.51
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s12_3rdparty_32xb128_in1k_20220414-f8d83051.pth
Config: configs/poolformer/poolformer-s12_32xb128_in1k.py
Converted From:
Weights: https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_s12.pth.tar
Code: https://github.com/sail-sg/poolformer
- Name: poolformer-s24_3rdparty_32xb128_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 3510411008
Parameters: 21388968
In Collection: PoolFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 80.33
Top 5 Accuracy: 95.05
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s24_3rdparty_32xb128_in1k_20220414-d7055904.pth
Config: configs/poolformer/poolformer-s24_32xb128_in1k.py
Converted From:
Weights: https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_s24.pth.tar
Code: https://github.com/sail-sg/poolformer
- Name: poolformer-s36_3rdparty_32xb128_in1k
Metadata:
FLOPs: 5149422592
Parameters: 30862760
In Collection: PoolFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.43
Top 5 Accuracy: 95.45
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s36_3rdparty_32xb128_in1k_20220414-d78ff3e8.pth
Config: configs/poolformer/poolformer-s36_32xb128_in1k.py
Converted From:
Weights: https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_s36.pth.tar
Code: https://github.com/sail-sg/poolformer
- Name: poolformer-m36_3rdparty_32xb128_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 8960175744
Parameters: 56172520
In Collection: PoolFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.14
Top 5 Accuracy: 95.71
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m36_3rdparty_32xb128_in1k_20220414-c55e0949.pth
Config: configs/poolformer/poolformer-m36_32xb128_in1k.py
Converted From:
Weights: https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_m36.pth.tar
Code: https://github.com/sail-sg/poolformer
- Name: poolformer-m48_3rdparty_32xb128_in1k
Metadata:
FLOPs: 11801805696
Parameters: 73473448
In Collection: PoolFormer
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.51
Top 5 Accuracy: 95.95
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m48_3rdparty_32xb128_in1k_20220414-9378f3eb.pth
Config: configs/poolformer/poolformer-m48_32xb128_in1k.py
Converted From:
Weights: https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_m48.pth.tar
Code: https://github.com/sail-sg/poolformer
_base_ = [
'../_base_/models/poolformer/poolformer_m36.py',
'../_base_/datasets/imagenet_bs128_poolformer_medium_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
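The `auto_scale_lr` note amounts to the linear LR scaling rule that MMEngine applies when automatic scaling is enabled; a minimal sketch of the arithmetic (the helper name is just for illustration):

```python
def scaled_lr(base_lr: float, actual_batch_size: int, base_batch_size: int = 4096) -> float:
    """Linear scaling: the LR grows or shrinks with the total batch size.

    base_lr=4e-3 corresponds to base_batch_size = 32 GPUs x 128 samples.
    """
    return base_lr * actual_batch_size / base_batch_size

# e.g. 8 GPUs x 128 samples = 1024 images per iteration:
print(scaled_lr(4e-3, 1024))  # 0.001
```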
_base_ = [
'../_base_/models/poolformer/poolformer_m48.py',
'../_base_/datasets/imagenet_bs128_poolformer_medium_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/models/poolformer/poolformer_s12.py',
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/models/poolformer/poolformer_s24.py',
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/models/poolformer/poolformer_s36.py',
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=4e-3),
clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)
# RegNet
> [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678)
<!-- [ALGORITHM] -->
## Abstract
In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/142572813-5dad3317-9d58-4177-971f-d346e01fb3c4.png" width=60%/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('regnetx-400mf_8xb128_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('regnetx-400mf_8xb128_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/regnet/regnetx-400mf_8xb128_in1k.py
```
Test:
```shell
python tools/test.py configs/regnet/regnetx-400mf_8xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211213-89bfc226.pth
```
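Distributed training and testing follow the usual launcher scripts; a sketch assuming 8 GPUs:

```shell
bash tools/dist_train.sh configs/regnet/regnetx-400mf_8xb128_in1k.py 8
bash tools/dist_test.sh configs/regnet/regnetx-400mf_8xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211213-89bfc226.pth 8
```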
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :-------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :------------------------------------: | :------------------------------------------------------------------------------------: |
| `regnetx-400mf_8xb128_in1k` | From scratch | 5.16 | 0.41 | 72.56 | 90.78 | [config](regnetx-400mf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211213-89bfc226.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211208_143316.json) |
| `regnetx-800mf_8xb128_in1k` | From scratch | 7.26 | 0.81 | 74.76 | 92.32 | [config](regnetx-800mf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-800mf_8xb128_in1k_20211213-222b0f11.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-800mf_8xb128_in1k_20211207_143037.log.json) |
| `regnetx-1.6gf_8xb128_in1k` | From scratch | 9.19 | 1.63 | 76.84 | 93.31 | [config](regnetx-1.6gf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-1.6gf_8xb128_in1k_20211213-d1b89758.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-1.6gf_8xb128_in1k_20211208_143018.log.json) |
| `regnetx-3.2gf_8xb64_in1k`  | From scratch | 15.30 | 3.21 | 78.09 | 94.08 | [config](regnetx-3.2gf_8xb64_in1k.py)  | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-3.2gf_8xb64_in1k_20211213-1fdd82ae.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-3.2gf_8xb64_in1k_20211208_142720.log.json) |
| `regnetx-4.0gf_8xb64_in1k` | From scratch | 22.12 | 4.00 | 78.60 | 94.17 | [config](regnetx-4.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-4.0gf_8xb64_in1k_20211213-efed675c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-4.0gf_8xb64_in1k_20211207_150431.log.json) |
| `regnetx-6.4gf_8xb64_in1k` | From scratch | 26.21 | 6.51 | 79.38 | 94.65 | [config](regnetx-6.4gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-6.4gf_8xb64_in1k_20211215-5c6089da.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-6.4gf_8xb64_in1k_20211213_172748.log.json) |
| `regnetx-8.0gf_8xb64_in1k` | From scratch | 39.57 | 8.03 | 79.12 | 94.51 | [config](regnetx-8.0gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-8.0gf_8xb64_in1k_20211213-9a9fcc76.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-8.0gf_8xb64_in1k_20211208_103250.log.json) |
| `regnetx-12gf_8xb64_in1k` | From scratch | 46.11 | 12.15 | 79.67 | 95.03 | [config](regnetx-12gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-12gf_8xb64_in1k_20211213-5df8c2f8.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-12gf_8xb64_in1k_20211208_143713.log.json) |
## Citation
```bibtex
@article{radosavovic2020designing,
title={Designing Network Design Spaces},
author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár},
year={2020},
eprint={2003.13678},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
Collections:
- Name: RegNet
Metadata:
Training Data: ImageNet-1k
Architecture:
- Neural Architecture Search
- Design Space Design
- Precise BN
- SGD with nesterov
Paper:
URL: https://arxiv.org/abs/2003.13678
Title: Designing Network Design Spaces
README: configs/regnet/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/v0.18.0/mmcls/models/backbones/regnet.py
Version: v0.18.0
Models:
- Name: regnetx-400mf_8xb128_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-400mf_8xb128_in1k.py
Metadata:
FLOPs: 410000000 # 0.41G
Parameters: 5160000 # 5.16M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 72.56
Top 5 Accuracy: 90.78
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-400mf_8xb128_in1k_20211213-89bfc226.pth
- Name: regnetx-800mf_8xb128_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-800mf_8xb128_in1k.py
Metadata:
FLOPs: 810000000 # 0.81G
Parameters: 7260000 # 7.26M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 74.76
Top 5 Accuracy: 92.32
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-800mf_8xb128_in1k_20211213-222b0f11.pth
- Name: regnetx-1.6gf_8xb128_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-1.6gf_8xb128_in1k.py
Metadata:
FLOPs: 1630000000 # 1.63G
Parameters: 9190000 # 9.19M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 76.84
Top 5 Accuracy: 93.31
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-1.6gf_8xb128_in1k_20211213-d1b89758.pth
- Name: regnetx-3.2gf_8xb64_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-3.2gf_8xb64_in1k.py
Metadata:
FLOPs: 3210000000 # 3.21G
Parameters: 15300000 # 15.3M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 78.09
Top 5 Accuracy: 94.08
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-3.2gf_8xb64_in1k_20211213-1fdd82ae.pth
- Name: regnetx-4.0gf_8xb64_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-4.0gf_8xb64_in1k.py
Metadata:
FLOPs: 4000000000 # 4G
Parameters: 22120000 # 22.12M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 78.60
Top 5 Accuracy: 94.17
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-4.0gf_8xb64_in1k_20211213-efed675c.pth
- Name: regnetx-6.4gf_8xb64_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-6.4gf_8xb64_in1k.py
Metadata:
FLOPs: 6510000000 # 6.51G
Parameters: 26210000 # 26.21M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 79.38
Top 5 Accuracy: 94.65
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-6.4gf_8xb64_in1k_20211215-5c6089da.pth
- Name: regnetx-8.0gf_8xb64_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-8.0gf_8xb64_in1k.py
Metadata:
FLOPs: 8030000000 # 8.03G
Parameters: 39570000 # 39.57M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 79.12
Top 5 Accuracy: 94.51
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-8.0gf_8xb64_in1k_20211213-9a9fcc76.pth
- Name: regnetx-12gf_8xb64_in1k
In Collection: RegNet
Config: configs/regnet/regnetx-12gf_8xb64_in1k.py
Metadata:
FLOPs: 12150000000 # 12.15G
Parameters: 46110000 # 46.11M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 79.67
Top 5 Accuracy: 95.03
Weights: https://download.openmmlab.com/mmclassification/v0/regnet/regnetx-12gf_8xb64_in1k_20211213-5df8c2f8.pth
_base_ = ['./regnetx-400mf_8xb128_in1k.py']
# model settings
model = dict(
backbone=dict(type='RegNet', arch='regnetx_1.6gf'),
head=dict(in_channels=912, ))
_base_ = ['./regnetx-400mf_8xb128_in1k.py']
# model settings
model = dict(
backbone=dict(type='RegNet', arch='regnetx_12gf'),
head=dict(in_channels=2240, ))
# dataset settings
train_dataloader = dict(batch_size=64)
# schedule settings
# for batch_size 512, use lr = 0.4
optim_wrapper = dict(optimizer=dict(lr=0.4))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (8 GPUs) x (64 samples per GPU)
auto_scale_lr = dict(base_batch_size=512)
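The backbone/head pairs in these configs can also be built directly from the registry for quick shape checks; a hedged sketch (the channel count comes from the `in_channels=2240` setting above, the rest is illustrative):

```python
import torch
from mmpretrain.registry import MODELS

# Build only the RegNet backbone with the same dict used in the config above.
backbone = MODELS.build(dict(type='RegNet', arch='regnetx_12gf'))
backbone.eval()
with torch.no_grad():
    feats = backbone(torch.rand(1, 3, 224, 224))
print([f.shape for f in feats])  # the last stage should expose 2240 channels,
                                 # matching head.in_channels in the config
```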