_base_ = [
    '../_base_/models/vit-base-p16.py',
    '../_base_/datasets/imagenet_bs64_pil_resize.py',
    '../_base_/schedules/imagenet_bs4096_AdamW.py',
    '../_base_/default_runtime.py'
]

# model setting
model = dict(backbone=dict(pre_norm=True))

# data settings
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        scale=224,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(type='PackInputs'),
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeEdge',
        scale=224,
        edge='short',
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='PackInputs'),
]

train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))

# schedule setting
optim_wrapper = dict(clip_grad=dict(max_norm=1.0))

_base_ = [
    '../_base_/models/vit-base-p32.py',
    '../_base_/datasets/imagenet_bs64_pil_resize.py',
    '../_base_/schedules/imagenet_bs4096_AdamW.py',
    '../_base_/default_runtime.py'
]

# model setting
model = dict(backbone=dict(pre_norm=True))

# data settings
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        scale=384,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(type='PackInputs'),
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeEdge',
        scale=384,
        edge='short',
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=384),
    dict(type='PackInputs'),
]

train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))

# schedule setting
optim_wrapper = dict(clip_grad=dict(max_norm=1.0))

_base_ = [
    '../_base_/models/vit-base-p32.py',
    '../_base_/datasets/imagenet_bs64_pil_resize.py',
    '../_base_/schedules/imagenet_bs4096_AdamW.py',
    '../_base_/default_runtime.py'
]

# model setting
model = dict(backbone=dict(pre_norm=True))

# data settings
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        scale=448,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(type='PackInputs'),
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeEdge',
        scale=448,
        edge='short',
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=448),
    dict(type='PackInputs'),
]

train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))

# schedule setting
optim_wrapper = dict(clip_grad=dict(max_norm=1.0))

_base_ = [
    '../_base_/models/vit-base-p32.py',
    '../_base_/datasets/imagenet_bs64_pil_resize.py',
    '../_base_/schedules/imagenet_bs4096_AdamW.py',
    '../_base_/default_runtime.py'
]

# model setting
model = dict(backbone=dict(pre_norm=True))

# data settings
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        scale=224,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(type='PackInputs'),
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeEdge',
        scale=224,
        edge='short',
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='PackInputs'),
]

train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))

# schedule setting
optim_wrapper = dict(clip_grad=dict(max_norm=1.0))

_base_ = ['../_base_/default_runtime.py']

# model settings
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='VisionTransformer',
        arch='l',
        img_size=224,
        patch_size=16,
        drop_rate=0.1,
        pre_norm=True,
    ),
)

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='ResizeEdge', scale=256, edge='short', backend='pillow'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='PackInputs'),
]

test_dataloader = dict(
    batch_size=64,
    num_workers=5,
    dataset=dict(
        type='ImageNet',
        data_root='data/imagenet',
        ann_file='meta/val.txt',
        data_prefix='val',
        pipeline=test_pipeline),
    sampler=dict(type='DefaultSampler', shuffle=False),
)

test_evaluator = None
# Conformer
> [Conformer: Local Features Coupling Global Representations for Visual Recognition](https://arxiv.org/abs/2105.03889)
<!-- [ALGORITHM] -->
## Abstract
Within Convolutional Neural Network (CNN), the convolution operations are good at extracting local features but experience difficulty to capture global representations. Within visual transformer, the cascaded self-attention modules can capture long-distance feature dependencies but unfortunately deteriorate local feature details. In this paper, we propose a hybrid network structure, termed Conformer, to take advantage of convolutional operations and self-attention mechanisms for enhanced representation learning. Conformer roots in the Feature Coupling Unit (FCU), which fuses local features and global representations under different resolutions in an interactive fashion. Conformer adopts a concurrent structure so that local features and global representations are retained to the maximum extent. Experiments show that Conformer, under the comparable parameter complexity, outperforms the visual transformer (DeiT-B) by 2.3% on ImageNet. On MSCOCO, it outperforms ResNet-101 by 3.7% and 3.6% mAPs for object detection and instance segmentation, respectively, demonstrating the great potential to be a general backbone network.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/144957687-926390ed-6119-4e4c-beaa-9bc0017fe953.png" width="90%"/>
</div>
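The heart of the design described above is the Feature Coupling Unit, which repeatedly exchanges information between the CNN feature map and the transformer tokens. The snippet below is only a rough illustration of that coupling idea in PyTorch, not the Conformer implementation used by MMPretrain: the layer choices (1x1 convolutions for channel alignment, pooling/interpolation to bridge the two resolutions) and all names are assumptions made for the sketch.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ToyFeatureCouplingUnit(nn.Module):
    """Toy two-way coupling between a CNN feature map (N, C, H, W) and
    transformer patch tokens (N, L, D), where L = h * w of the token grid."""

    def __init__(self, cnn_channels, embed_dims, token_grid):
        super().__init__()
        self.token_grid = token_grid  # (h, w) of the patch-token grid
        self.conv2embed = nn.Conv2d(cnn_channels, embed_dims, kernel_size=1)
        self.embed2conv = nn.Conv2d(embed_dims, cnn_channels, kernel_size=1)
        self.token_norm = nn.LayerNorm(embed_dims)
        self.fmap_norm = nn.BatchNorm2d(cnn_channels)

    def forward(self, fmap, tokens):
        h, w = self.token_grid
        # local -> global: align channels, pool to the token resolution,
        # flatten to a token sequence and add to the transformer branch.
        local = F.adaptive_avg_pool2d(self.conv2embed(fmap), (h, w))
        tokens = tokens + self.token_norm(local.flatten(2).transpose(1, 2))
        # global -> local: reshape tokens back to a grid, align channels,
        # upsample to the feature-map resolution and add to the CNN branch.
        grid = tokens.transpose(1, 2).reshape(fmap.size(0), -1, h, w)
        grid = F.interpolate(self.embed2conv(grid), size=fmap.shape[2:],
                             mode='bilinear', align_corners=False)
        fmap = fmap + self.fmap_norm(grid)
        return fmap, tokens


# Example shapes: a 56x56 feature map coupled with a 14x14 token grid.
fcu = ToyFeatureCouplingUnit(cnn_channels=256, embed_dims=384, token_grid=(14, 14))
fmap, tokens = fcu(torch.rand(2, 256, 56, 56), torch.rand(2, 14 * 14, 384))
```

As the abstract describes, this kind of coupling happens throughout the concurrent network, so both branches keep their own representation while continuously borrowing from the other.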
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('conformer-tiny-p16_3rdparty_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('conformer-tiny-p16_3rdparty_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/conformer/conformer-small-p32_8xb128_in1k.py
```
Test:
```shell
python tools/test.py configs/conformer/conformer-tiny-p16_8xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/conformer/conformer-tiny-p16_3rdparty_8xb128_in1k_20211206-f6860372.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------------------------------------ | :----------: | :--------: | :-------: | :-------: | :-------: | :------------------------------------------: | :--------------------------------------------------------------------: |
| `conformer-tiny-p16_3rdparty_in1k`\* | From scratch | 23.52 | 4.90 | 81.31 | 95.60 | [config](conformer-tiny-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-tiny-p16_3rdparty_8xb128_in1k_20211206-f6860372.pth) |
| `conformer-small-p16_3rdparty_in1k`\* | From scratch | 37.67 | 10.31 | 83.32 | 96.46 | [config](conformer-small-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p16_3rdparty_8xb128_in1k_20211206-3065dcf5.pth) |
| `conformer-small-p32_8xb128_in1k` | From scratch | 38.85 | 7.09 | 81.96 | 96.02 | [config](conformer-small-p32_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p32_8xb128_in1k_20211206-947a0816.pth) |
| `conformer-base-p16_3rdparty_in1k`\* | From scratch | 83.29 | 22.89 | 83.82 | 96.59 | [config](conformer-base-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-base-p16_3rdparty_8xb128_in1k_20211206-bfdf8637.pth) |
*Models with * are converted from the [official repo](https://github.com/pengzhiliang/Conformer/blob/main/models.py#L89). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{peng2021conformer,
title={Conformer: Local Features Coupling Global Representations for Visual Recognition},
author={Zhiliang Peng and Wei Huang and Shanzhi Gu and Lingxi Xie and Yaowei Wang and Jianbin Jiao and Qixiang Ye},
journal={arXiv preprint arXiv:2105.03889},
year={2021},
}
```
_base_ = [
    '../_base_/models/conformer/base-p16.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_conformer.py',
    '../_base_/default_runtime.py'
]

train_dataloader = dict(batch_size=128)

_base_ = [
    '../_base_/models/conformer/small-p16.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_conformer.py',
    '../_base_/default_runtime.py'
]

train_dataloader = dict(batch_size=128)

_base_ = [
    '../_base_/models/conformer/small-p32.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_conformer.py',
    '../_base_/default_runtime.py'
]

train_dataloader = dict(batch_size=128)

_base_ = [
    '../_base_/models/conformer/tiny-p16.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_conformer.py',
    '../_base_/default_runtime.py'
]

train_dataloader = dict(batch_size=128)
Collections:
  - Name: Conformer
    Metadata:
      Training Data: ImageNet-1k
      Architecture:
        - Layer Normalization
        - Scaled Dot-Product Attention
        - Dropout
    Paper:
      URL: https://arxiv.org/abs/2105.03889
      Title: "Conformer: Local Features Coupling Global Representations for Visual Recognition"
    README: configs/conformer/README.md
    Code:
      URL: https://github.com/open-mmlab/mmpretrain/blob/v0.19.0/mmcls/models/backbones/conformer.py
      Version: v0.19.0

Models:
  - Name: conformer-tiny-p16_3rdparty_in1k
    In Collection: Conformer
    Config: configs/conformer/conformer-tiny-p16_8xb128_in1k.py
    Metadata:
      FLOPs: 4899611328
      Parameters: 23524704
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 81.31
          Top 5 Accuracy: 95.60
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/conformer/conformer-tiny-p16_3rdparty_8xb128_in1k_20211206-f6860372.pth
    Converted From:
      Weights: https://drive.google.com/file/d/19SxGhKcWOR5oQSxNUWUM2MGYiaWMrF1z/view?usp=sharing
      Code: https://github.com/pengzhiliang/Conformer/blob/main/models.py#L65
  - Name: conformer-small-p16_3rdparty_in1k
    In Collection: Conformer
    Config: configs/conformer/conformer-small-p16_8xb128_in1k.py
    Metadata:
      FLOPs: 10311309312
      Parameters: 37673424
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 83.32
          Top 5 Accuracy: 96.46
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p16_3rdparty_8xb128_in1k_20211206-3065dcf5.pth
    Converted From:
      Weights: https://drive.google.com/file/d/1mpOlbLaVxOfEwV4-ha78j_1Ebqzj2B83/view?usp=sharing
      Code: https://github.com/pengzhiliang/Conformer/blob/main/models.py#L73
  - Name: conformer-small-p32_8xb128_in1k
    In Collection: Conformer
    Config: configs/conformer/conformer-small-p32_8xb128_in1k.py
    Metadata:
      FLOPs: 7087281792
      Parameters: 38853072
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 81.96
          Top 5 Accuracy: 96.02
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p32_8xb128_in1k_20211206-947a0816.pth
  - Name: conformer-base-p16_3rdparty_in1k
    In Collection: Conformer
    Config: configs/conformer/conformer-base-p16_8xb128_in1k.py
    Metadata:
      FLOPs: 22892078080
      Parameters: 83289136
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 83.82
          Top 5 Accuracy: 96.59
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/conformer/conformer-base-p16_3rdparty_8xb128_in1k_20211206-bfdf8637.pth
    Converted From:
      Weights: https://drive.google.com/file/d/1oeQ9LSOGKEUaYGu7WTlUGl3KDsQIi0MA/view?usp=sharing
      Code: https://github.com/pengzhiliang/Conformer/blob/main/models.py#L89
# ConvMixer
> [Patches Are All You Need?](https://arxiv.org/abs/2201.09792)
<!-- [ALGORITHM] -->
## Abstract
Although convolutional networks have been the dominant architecture for vision tasks for many years, recent experiments have shown that Transformer-based models, most notably the Vision Transformer (ViT), may exceed their performance in some settings. However, due to the quadratic runtime of the self-attention layers in Transformers, ViTs require the use of patch embeddings, which group together small regions of the image into single input features, in order to be applied to larger image sizes. This raises a question: Is the performance of ViTs due to the inherently-more-powerful Transformer architecture, or is it at least partly due to using patches as the input representation? In this paper, we present some evidence for the latter: specifically, we propose the ConvMixer, an extremely simple model that is similar in spirit to the ViT and the even-more-basic MLP-Mixer in that it operates directly on patches as input, separates the mixing of spatial and channel dimensions, and maintains equal size and resolution throughout the network. In contrast, however, the ConvMixer uses only standard convolutions to achieve the mixing steps. Despite its simplicity, we show that the ConvMixer outperforms the ViT, MLP-Mixer, and some of their variants for similar parameter counts and data set sizes, in addition to outperforming classical vision models such as the ResNet.
<div align=center>
<img src="https://user-images.githubusercontent.com/42952108/156284977-abf2245e-d9ba-4e0d-8e10-c0664a20f4c8.png" width="100%"/>
</div>
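Because the abstract leans so heavily on the model's simplicity, a compact rendition of the architecture it describes is worth seeing. The sketch below follows the recipe from the paper: a convolutional patch embedding, then repeated depthwise "spatial mixing" with a residual and pointwise "channel mixing", each followed by GELU and BatchNorm. It is an illustration only (not the backbone implementation behind the configs in this folder), the default arguments merely mimic the ConvMixer-768/32 setting, and `padding='same'` assumes a reasonably recent PyTorch.

```python
import torch
import torch.nn as nn


class Residual(nn.Module):
    """Add a skip connection around an inner module."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x):
        return self.fn(x) + x


def conv_mixer(dim=768, depth=32, kernel_size=7, patch_size=7, num_classes=1000):
    """Minimal ConvMixer: patch embedding, `depth` mixing blocks, then a
    global-average-pooled linear classifier."""
    mixer_block = lambda: nn.Sequential(
        # spatial mixing: large-kernel depthwise conv with a residual
        Residual(nn.Sequential(
            nn.Conv2d(dim, dim, kernel_size, groups=dim, padding='same'),
            nn.GELU(),
            nn.BatchNorm2d(dim))),
        # channel mixing: pointwise (1x1) conv
        nn.Conv2d(dim, dim, kernel_size=1),
        nn.GELU(),
        nn.BatchNorm2d(dim))
    return nn.Sequential(
        nn.Conv2d(3, dim, kernel_size=patch_size, stride=patch_size),  # patch embedding
        nn.GELU(),
        nn.BatchNorm2d(dim),
        *[mixer_block() for _ in range(depth)],
        nn.AdaptiveAvgPool2d(1),
        nn.Flatten(),
        nn.Linear(dim, num_classes))


logits = conv_mixer()(torch.rand(1, 3, 224, 224))  # -> shape (1, 1000)
```

Keeping the resolution constant throughout and mixing spatial and channel dimensions separately is exactly the ViT/MLP-Mixer design the abstract refers to, only realized with standard convolutions.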
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('convmixer-768-32_3rdparty_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('convmixer-768-32_3rdparty_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/convmixer/convmixer-768-32_10xb64_in1k.py https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-768-32_3rdparty_10xb64_in1k_20220323-bca1f7b8.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :---------------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :----------------------------------------: | :------------------------------------------------------------------------: |
| `convmixer-768-32_3rdparty_in1k`\* | From scratch | 21.11 | 19.62 | 80.16 | 95.08 | [config](convmixer-768-32_10xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-768-32_3rdparty_10xb64_in1k_20220323-bca1f7b8.pth) |
| `convmixer-1024-20_3rdparty_in1k`\* | From scratch | 24.38 | 5.55 | 76.94 | 93.36 | [config](convmixer-1024-20_10xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-1024-20_3rdparty_10xb64_in1k_20220323-48f8aeba.pth) |
| `convmixer-1536-20_3rdparty_in1k`\* | From scratch | 51.63 | 48.71 | 81.37 | 95.61 | [config](convmixer-1536-20_10xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-1536_20_3rdparty_10xb64_in1k_20220323-ea5786f3.pth) |
*Models with * are converted from the [official repo](https://github.com/locuslab/convmixer). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@misc{trockman2022patches,
title={Patches Are All You Need?},
author={Asher Trockman and J. Zico Kolter},
year={2022},
eprint={2201.09792},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
_base_ = [
    '../_base_/models/convmixer/convmixer-1024-20.py',
    '../_base_/datasets/imagenet_bs64_convmixer_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py',
]

# schedule setting
optim_wrapper = dict(
    optimizer=dict(lr=0.01),
    clip_grad=dict(max_norm=5.0),
)

param_scheduler = [
    # warm up learning rate scheduler
    dict(
        type='LinearLR',
        start_factor=1e-3,
        by_epoch=True,
        begin=0,
        end=20,
        # update by iter
        convert_to_iter_based=True),
    # main learning rate scheduler
    dict(
        type='CosineAnnealingLR',
        T_max=130,
        eta_min=1e-5,
        by_epoch=True,
        begin=20,
        end=150)
]
train_cfg = dict(by_epoch=True, max_epochs=150)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (10 GPUs) x (64 samples per GPU)
auto_scale_lr = dict(base_batch_size=640)
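# The two schedulers above combine into a single curve: the LR starts at
# lr * 1e-3 = 1e-5, ramps linearly to lr = 0.01 over the first 20 epochs, then
# follows a cosine decay from 0.01 down to eta_min = 1e-5 over epochs 20-150.
#
# `auto_scale_lr` is typically only applied when training is launched with the
# `--auto-scale-lr` option. Assuming the usual linear scaling rule, running on
# 8 GPUs x 64 samples = 512 images per step would rescale the LR to
# 0.01 * 512 / 640 = 0.008 instead of 0.01.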

_base_ = [
    '../_base_/models/convmixer/convmixer-1536-20.py',
    '../_base_/datasets/imagenet_bs64_convmixer_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py',
]

# schedule setting
optim_wrapper = dict(
    optimizer=dict(lr=0.01),
    clip_grad=dict(max_norm=5.0),
)

param_scheduler = [
    # warm up learning rate scheduler
    dict(
        type='LinearLR',
        start_factor=1e-3,
        by_epoch=True,
        begin=0,
        end=20,
        # update by iter
        convert_to_iter_based=True),
    # main learning rate scheduler
    dict(
        type='CosineAnnealingLR',
        T_max=130,
        eta_min=1e-5,
        by_epoch=True,
        begin=20,
        end=150)
]
train_cfg = dict(by_epoch=True, max_epochs=150)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (10 GPUs) x (64 samples per GPU)
auto_scale_lr = dict(base_batch_size=640)

_base_ = [
    '../_base_/models/convmixer/convmixer-768-32.py',
    '../_base_/datasets/imagenet_bs64_convmixer_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py',
]

# schedule setting
optim_wrapper = dict(
    optimizer=dict(lr=0.01),
    clip_grad=dict(max_norm=5.0),
)
train_cfg = dict(by_epoch=True, max_epochs=300)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (10 GPUs) x (64 samples per GPU)
auto_scale_lr = dict(base_batch_size=640)
Collections:
  - Name: ConvMixer
    Metadata:
      Training Data: ImageNet-1k
      Architecture:
        - 1x1 Convolution
        - LayerScale
    Paper:
      URL: https://arxiv.org/abs/2201.09792
      Title: Patches Are All You Need?
    README: configs/convmixer/README.md

Models:
  - Name: convmixer-768-32_3rdparty_in1k
    Metadata:
      FLOPs: 19623051264
      Parameters: 21110248
    In Collection: ConvMixer
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 80.16
          Top 5 Accuracy: 95.08
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-768-32_3rdparty_10xb64_in1k_20220323-bca1f7b8.pth
    Config: configs/convmixer/convmixer-768-32_10xb64_in1k.py
    Converted From:
      Weights: https://github.com/tmp-iclr/convmixer/releases/download/v1.0/convmixer_768_32_ks7_p7_relu.pth.tar
      Code: https://github.com/locuslab/convmixer
  - Name: convmixer-1024-20_3rdparty_in1k
    Metadata:
      FLOPs: 5550112768
      Parameters: 24383464
    In Collection: ConvMixer
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 76.94
          Top 5 Accuracy: 93.36
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-1024-20_3rdparty_10xb64_in1k_20220323-48f8aeba.pth
    Config: configs/convmixer/convmixer-1024-20_10xb64_in1k.py
    Converted From:
      Weights: https://github.com/tmp-iclr/convmixer/releases/download/v1.0/convmixer_1024_20_ks9_p14.pth.tar
      Code: https://github.com/locuslab/convmixer
  - Name: convmixer-1536-20_3rdparty_in1k
    Metadata:
      FLOPs: 48713170944
      Parameters: 51625960
    In Collection: ConvMixer
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 81.37
          Top 5 Accuracy: 95.61
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-1536_20_3rdparty_10xb64_in1k_20220323-ea5786f3.pth
    Config: configs/convmixer/convmixer-1536-20_10xb64_in1k.py
    Converted From:
      Weights: https://github.com/tmp-iclr/convmixer/releases/download/v1.0/convmixer_1536_20_ks9_p7.pth.tar
      Code: https://github.com/locuslab/convmixer
# ConvNeXt
> [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545v1)
<!-- [ALGORITHM] -->
## Introduction
**ConvNeXt**, initially described in [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545v1), is a pure convolutional model (ConvNet) inspired by the design of Vision Transformers. It has a pyramid structure and achieves competitive performance on various vision tasks, with simplicity and efficiency.
<div align=center>
<img src="https://user-images.githubusercontent.com/8370623/148624004-e9581042-ea4d-4e10-b3bd-42c92b02053b.png" width="100%"/>
</div>
## Abstract
<details>
<summary>Show the paper's abstract</summary>
<br>
The "Roaring 20s" of visual recognition began with the introduction of Vision Transformers (ViTs), which quickly superseded ConvNets as the state-of-the-art image classification model. A vanilla ViT, on the other hand, faces difficulties when applied to general computer vision tasks such as object detection and semantic segmentation. It is the hierarchical Transformers (e.g., Swin Transformers) that reintroduced several ConvNet priors, making Transformers practically viable as a generic vision backbone and demonstrating remarkable performance on a wide variety of vision tasks. However, the effectiveness of such hybrid approaches is still largely credited to the intrinsic superiority of Transformers, rather than the inherent inductive biases of convolutions. In this work, we reexamine the design spaces and test the limits of what a pure ConvNet can achieve. We gradually "modernize" a standard ResNet toward the design of a vision Transformer, and discover several key components that contribute to the performance difference along the way. The outcome of this exploration is a family of pure ConvNet models dubbed ConvNeXt. Constructed entirely from standard ConvNet modules, ConvNeXts compete favorably with Transformers in terms of accuracy and scalability, achieving 87.8% ImageNet top-1 accuracy and outperforming Swin Transformers on COCO detection and ADE20K segmentation, while maintaining the simplicity and efficiency of standard ConvNets.
</br>
</details>
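The "modernization" the abstract describes ends in a block that looks quite different from a classic ResNet bottleneck. The sketch below shows that block roughly as the paper presents it: a 7x7 depthwise conv, LayerNorm, an inverted-bottleneck MLP applied in channels-last layout, LayerScale, and a residual. It is an illustrative reimplementation rather than the MMPretrain backbone code, and stochastic depth is omitted for brevity.

```python
import torch
import torch.nn as nn


class ConvNeXtBlock(nn.Module):
    """Sketch of a single ConvNeXt block."""

    def __init__(self, dim, mlp_ratio=4, layer_scale_init=1e-6):
        super().__init__()
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = nn.LayerNorm(dim)
        self.pwconv1 = nn.Linear(dim, mlp_ratio * dim)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(mlp_ratio * dim, dim)
        self.gamma = nn.Parameter(layer_scale_init * torch.ones(dim))  # LayerScale

    def forward(self, x):                       # x: (N, C, H, W)
        shortcut = x
        x = self.dwconv(x)                      # large-kernel depthwise conv
        x = x.permute(0, 2, 3, 1)               # channels-last for LN / Linear
        x = self.pwconv2(self.act(self.pwconv1(self.norm(x))))
        x = self.gamma * x
        x = x.permute(0, 3, 1, 2)               # back to channels-first
        return shortcut + x


y = ConvNeXtBlock(dim=96)(torch.rand(1, 96, 56, 56))  # output keeps the input shape
```

Stacking such blocks in four stages with downsampling layers in between gives the pyramid structure mentioned in the introduction.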
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('convnext-tiny_32xb128_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('convnext-tiny_32xb128_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/convnext/convnext-tiny_32xb128_in1k.py
```
Test:
```shell
python tools/test.py configs/convnext/convnext-tiny_32xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.pth
```
<!-- [TABS-END] -->
## Models and results
### Pretrained models
| Model | Params (M) | Flops (G) | Config | Download |
| :--------------------------------- | :--------: | :-------: | :---------------------------------------: | :--------------------------------------------------------------------------------------------------------: |
| `convnext-base_3rdparty_in21k`\* | 88.59 | 15.36 | [config](convnext-base_32xb128_in21k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in21k_20220124-13b83eec.pth) |
| `convnext-large_3rdparty_in21k`\* | 197.77 | 34.37 | [config](convnext-large_64xb64_in21k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in21k_20220124-41b5a79f.pth) |
| `convnext-xlarge_3rdparty_in21k`\* | 350.20 | 60.93 | [config](convnext-xlarge_64xb64_in21k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_3rdparty_in21k_20220124-f909bad7.pth) |
*Models with * are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt). The config files of these models are only for inference. We haven't reproduced the training results.*
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------------------------------------------------ | :----------: | :--------: | :-------: | :-------: | :-------: | :--------------------------------------------: | :------------------------------------------------------: |
| `convnext-tiny_32xb128_in1k` | From scratch | 28.59 | 4.46 | 82.14 | 96.06 | [config](convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.json) |
| `convnext-tiny_32xb128-noema_in1k` | From scratch | 28.59 | 4.46 | 81.95 | 95.89 | [config](convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128-noema_in1k_20221208-5d4509c7.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.json) |
| `convnext-tiny_in21k-pre_3rdparty_in1k`\* | ImageNet-21k | 28.59 | 4.46 | 82.90 | 96.62 | [config](convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_in21k-pre_3rdparty_in1k_20221219-7501e534.pth) |
| `convnext-tiny_in21k-pre_3rdparty_in1k-384px`\* | ImageNet-21k | 28.59 | 13.14 | 84.11 | 97.14 | [config](convnext-tiny_32xb128_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_in21k-pre_3rdparty_in1k-384px_20221219-c1182362.pth) |
| `convnext-small_32xb128_in1k` | From scratch | 50.22 | 8.69 | 83.16 | 96.56 | [config](convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128_in1k_20221207-4ab7052c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128_in1k_20221207-4ab7052c.json) |
| `convnext-small_32xb128-noema_in1k` | From scratch | 50.22 | 8.69 | 83.21 | 96.48 | [config](convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128-noema_in1k_20221208-4a618995.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128_in1k_20221207-4ab7052c.json) |
| `convnext-small_in21k-pre_3rdparty_in1k`\* | ImageNet-21k | 50.22 | 8.69 | 84.59 | 97.41 | [config](convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_in21k-pre_3rdparty_in1k_20221219-aeca4c93.pth) |
| `convnext-small_in21k-pre_3rdparty_in1k-384px`\* | ImageNet-21k | 50.22 | 25.58 | 85.75 | 97.88 | [config](convnext-small_32xb128_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_in21k-pre_3rdparty_in1k-384px_20221219-96f0bb87.pth) |
| `convnext-base_32xb128_in1k` | From scratch | 88.59 | 15.36 | 83.66 | 96.74 | [config](convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128_in1k_20221207-fbdb5eb9.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128_in1k_20221207-fbdb5eb9.json) |
| `convnext-base_32xb128-noema_in1k` | From scratch | 88.59 | 15.36 | 83.64 | 96.61 | [config](convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128-noema_in1k_20221208-f8182678.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128_in1k_20221207-fbdb5eb9.json) |
| `convnext-base_3rdparty_in1k`\* | From scratch | 88.59 | 15.36 | 83.85 | 96.74 | [config](convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth) |
| `convnext-base_3rdparty-noema_in1k`\* | From scratch | 88.59 | 15.36 | 83.71 | 96.60 | [config](convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128-noema_in1k_20220222-dba4f95f.pth) |
| `convnext-base_3rdparty_in1k-384px`\* | From scratch | 88.59 | 45.21 | 85.10 | 97.34 | [config](convnext-base_32xb128_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in1k-384px_20221219-c8f1dc2b.pth) |
| `convnext-base_in21k-pre_3rdparty_in1k`\* | ImageNet-21k | 88.59 | 15.36 | 85.81 | 97.86 | [config](convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth) |
| `convnext-base_in21k-pre-3rdparty_in1k-384px`\* | ImageNet-21k | 88.59 | 45.21 | 86.82 | 98.25 | [config](convnext-base_32xb128_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_in1k-384px_20221219-4570f792.pth) |
| `convnext-large_3rdparty_in1k`\* | From scratch | 197.77 | 34.37 | 84.30 | 96.89 | [config](convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth) |
| `convnext-large_3rdparty_in1k-384px`\* | From scratch | 197.77 | 101.10 | 85.50 | 97.59 | [config](convnext-large_64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in1k-384px_20221219-6dd29d10.pth) |
| `convnext-large_in21k-pre_3rdparty_in1k`\* | ImageNet-21k | 197.77 | 34.37 | 86.61 | 98.04 | [config](convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth) |
| `convnext-large_in21k-pre-3rdparty_in1k-384px`\* | ImageNet-21k | 197.77 | 101.10 | 87.46 | 98.37 | [config](convnext-large_64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_in1k-384px_20221219-6d38dd66.pth) |
| `convnext-xlarge_in21k-pre_3rdparty_in1k`\* | ImageNet-21k | 350.20 | 60.93 | 86.97 | 98.20 | [config](convnext-xlarge_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) |
| `convnext-xlarge_in21k-pre-3rdparty_in1k-384px`\* | ImageNet-21k | 350.20 | 179.20 | 87.76 | 98.55 | [config](convnext-xlarge_64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_in1k-384px_20221219-b161bc14.pth) |
*Models with * are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@Article{liu2022convnet,
author = {Zhuang Liu and Hanzi Mao and Chao-Yuan Wu and Christoph Feichtenhofer and Trevor Darrell and Saining Xie},
title = {A ConvNet for the 2020s},
journal = {arXiv preprint arXiv:2201.03545},
year = {2022},
}
```
_base_ = [
    '../_base_/models/convnext/convnext-base.py',
    '../_base_/datasets/imagenet_bs64_swin_384.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py',
]

# dataset setting
train_dataloader = dict(batch_size=128)

# schedule setting
optim_wrapper = dict(
    optimizer=dict(lr=4e-3),
    clip_grad=dict(max_norm=5.0),
)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
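# EMAHook keeps an exponential moving average of the model weights and swaps it
# in for validation/testing and checkpointing. With mmengine's default
# ExponentialMovingAverage, each update is roughly
#   ema_param = (1 - momentum) * ema_param + momentum * param,
# so momentum=4e-5 means the averaged weights change very slowly.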
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)

_base_ = [
    '../_base_/models/convnext/convnext-base.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py',
]

# dataset setting
train_dataloader = dict(batch_size=128)

# schedule setting
optim_wrapper = dict(
    optimizer=dict(lr=4e-3),
    clip_grad=None,
)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)

_base_ = [
    '../_base_/models/convnext/convnext-base.py',
    '../_base_/datasets/imagenet21k_bs128.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py',
]

# model setting
model = dict(head=dict(num_classes=21841))

# dataset setting
data_preprocessor = dict(num_classes=21841)
train_dataloader = dict(batch_size=128)

# schedule setting
optim_wrapper = dict(
    optimizer=dict(lr=4e-3),
    clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)