_base_ = [
    '../_base_/models/vit-base-p16.py',
    '../_base_/datasets/imagenet_bs64_pil_resize.py',
    '../_base_/schedules/imagenet_bs4096_AdamW.py',
    '../_base_/default_runtime.py'
]

# model setting
model = dict(backbone=dict(pre_norm=True))

# data settings
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        scale=224,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(type='PackInputs'),
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeEdge',
        scale=224,
        edge='short',
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='PackInputs'),
]

train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))

# schedule setting
optim_wrapper = dict(clip_grad=dict(max_norm=1.0))

_base_ = [
    '../_base_/models/vit-base-p32.py',
    '../_base_/datasets/imagenet_bs64_pil_resize.py',
    '../_base_/schedules/imagenet_bs4096_AdamW.py',
    '../_base_/default_runtime.py'
]

# model setting
model = dict(backbone=dict(pre_norm=True))

# data settings
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        scale=384,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(type='PackInputs'),
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeEdge',
        scale=384,
        edge='short',
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=384),
    dict(type='PackInputs'),
]

train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))

# schedule setting
optim_wrapper = dict(clip_grad=dict(max_norm=1.0))

_base_ = [
    '../_base_/models/vit-base-p32.py',
    '../_base_/datasets/imagenet_bs64_pil_resize.py',
    '../_base_/schedules/imagenet_bs4096_AdamW.py',
    '../_base_/default_runtime.py'
]

# model setting
model = dict(backbone=dict(pre_norm=True))

# data settings
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        scale=448,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(type='PackInputs'),
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeEdge',
        scale=448,
        edge='short',
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=448),
    dict(type='PackInputs'),
]

train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))

# schedule setting
optim_wrapper = dict(clip_grad=dict(max_norm=1.0))

_base_ = [
    '../_base_/models/vit-base-p32.py',
    '../_base_/datasets/imagenet_bs64_pil_resize.py',
    '../_base_/schedules/imagenet_bs4096_AdamW.py',
    '../_base_/default_runtime.py'
]

# model setting
model = dict(backbone=dict(pre_norm=True))

# data settings
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        scale=224,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(type='PackInputs'),
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeEdge',
        scale=224,
        edge='short',
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='PackInputs'),
]

train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))

# schedule setting
optim_wrapper = dict(clip_grad=dict(max_norm=1.0))

_base_ = ['../_base_/default_runtime.py']

# model settings
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='VisionTransformer',
        arch='l',
        img_size=224,
        patch_size=16,
        drop_rate=0.1,
        pre_norm=True,
    ),
)

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='ResizeEdge', scale=256, edge='short', backend='pillow'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='PackInputs'),
]

test_dataloader = dict(
    batch_size=64,
    num_workers=5,
    dataset=dict(
        type='ImageNet',
        data_root='data/imagenet',
        ann_file='meta/val.txt',
        data_prefix='val',
        pipeline=test_pipeline),
    sampler=dict(type='DefaultSampler', shuffle=False),
)

test_evaluator = None
# Conformer
> [Conformer: Local Features Coupling Global Representations for Visual Recognition](https://arxiv.org/abs/2105.03889)
<!-- [ALGORITHM] -->
## Abstract
Within Convolutional Neural Network (CNN), the convolution operations are good at extracting local features but experience difficulty to capture global representations. Within visual transformer, the cascaded self-attention modules can capture long-distance feature dependencies but unfortunately deteriorate local feature details. In this paper, we propose a hybrid network structure, termed Conformer, to take advantage of convolutional operations and self-attention mechanisms for enhanced representation learning. Conformer roots in the Feature Coupling Unit (FCU), which fuses local features and global representations under different resolutions in an interactive fashion. Conformer adopts a concurrent structure so that local features and global representations are retained to the maximum extent. Experiments show that Conformer, under the comparable parameter complexity, outperforms the visual transformer (DeiT-B) by 2.3% on ImageNet. On MSCOCO, it outperforms ResNet-101 by 3.7% and 3.6% mAPs for object detection and instance segmentation, respectively, demonstrating the great potential to be a general backbone network.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/144957687-926390ed-6119-4e4c-beaa-9bc0017fe953.png" width="90%"/>
</div>
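The heart of the design described above is the Feature Coupling Unit, which repeatedly exchanges information between the CNN feature map and the transformer tokens. The snippet below is only a rough illustration of that coupling idea in PyTorch, not the Conformer implementation used by MMPretrain: the layer choices (1x1 convolutions for channel alignment, pooling/interpolation to bridge the two resolutions) and all names are assumptions made for the sketch.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ToyFeatureCouplingUnit(nn.Module):
    """Toy two-way coupling between a CNN feature map (N, C, H, W) and
    transformer patch tokens (N, L, D), where L = h * w of the token grid."""

    def __init__(self, cnn_channels, embed_dims, token_grid):
        super().__init__()
        self.token_grid = token_grid  # (h, w) of the patch-token grid
        self.conv2embed = nn.Conv2d(cnn_channels, embed_dims, kernel_size=1)
        self.embed2conv = nn.Conv2d(embed_dims, cnn_channels, kernel_size=1)
        self.token_norm = nn.LayerNorm(embed_dims)
        self.fmap_norm = nn.BatchNorm2d(cnn_channels)

    def forward(self, fmap, tokens):
        h, w = self.token_grid
        # local -> global: align channels, pool to the token resolution,
        # flatten to a token sequence and add to the transformer branch.
        local = F.adaptive_avg_pool2d(self.conv2embed(fmap), (h, w))
        tokens = tokens + self.token_norm(local.flatten(2).transpose(1, 2))
        # global -> local: reshape tokens back to a grid, align channels,
        # upsample to the feature-map resolution and add to the CNN branch.
        grid = tokens.transpose(1, 2).reshape(fmap.size(0), -1, h, w)
        grid = F.interpolate(self.embed2conv(grid), size=fmap.shape[2:],
                             mode='bilinear', align_corners=False)
        fmap = fmap + self.fmap_norm(grid)
        return fmap, tokens


# Example shapes: a 56x56 feature map coupled with a 14x14 token grid.
fcu = ToyFeatureCouplingUnit(cnn_channels=256, embed_dims=384, token_grid=(14, 14))
fmap, tokens = fcu(torch.rand(2, 256, 56, 56), torch.rand(2, 14 * 14, 384))
```

As the abstract describes, this kind of coupling happens throughout the concurrent network, so both branches keep their own representation while continuously borrowing from the other.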
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('conformer-tiny-p16_3rdparty_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('conformer-tiny-p16_3rdparty_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/conformer/conformer-small-p32_8xb128_in1k.py
```
Test:
```shell
python tools/test.py configs/conformer/conformer-tiny-p16_8xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/conformer/conformer-tiny-p16_3rdparty_8xb128_in1k_20211206-f6860372.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------------------------------------ | :----------: | :--------: | :-------: | :-------: | :-------: | :------------------------------------------: | :--------------------------------------------------------------------: |
| `conformer-tiny-p16_3rdparty_in1k`\* | From scratch | 23.52 | 4.90 | 81.31 | 95.60 | [config](conformer-tiny-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-tiny-p16_3rdparty_8xb128_in1k_20211206-f6860372.pth) |
| `conformer-small-p16_3rdparty_in1k`\* | From scratch | 37.67 | 10.31 | 83.32 | 96.46 | [config](conformer-small-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p16_3rdparty_8xb128_in1k_20211206-3065dcf5.pth) |
| `conformer-small-p32_8xb128_in1k` | From scratch | 38.85 | 7.09 | 81.96 | 96.02 | [config](conformer-small-p32_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p32_8xb128_in1k_20211206-947a0816.pth) |
| `conformer-base-p16_3rdparty_in1k`\* | From scratch | 83.29 | 22.89 | 83.82 | 96.59 | [config](conformer-base-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-base-p16_3rdparty_8xb128_in1k_20211206-bfdf8637.pth) |
*Models with * are converted from the [official repo](https://github.com/pengzhiliang/Conformer/blob/main/models.py#L89). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{peng2021conformer,
title={Conformer: Local Features Coupling Global Representations for Visual Recognition},
author={Zhiliang Peng and Wei Huang and Shanzhi Gu and Lingxi Xie and Yaowei Wang and Jianbin Jiao and Qixiang Ye},
journal={arXiv preprint arXiv:2105.03889},
year={2021},
}
```
_base_ = [
    '../_base_/models/conformer/base-p16.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_conformer.py',
    '../_base_/default_runtime.py'
]

train_dataloader = dict(batch_size=128)

_base_ = [
    '../_base_/models/conformer/small-p16.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_conformer.py',
    '../_base_/default_runtime.py'
]

train_dataloader = dict(batch_size=128)

_base_ = [
    '../_base_/models/conformer/small-p32.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_conformer.py',
    '../_base_/default_runtime.py'
]

train_dataloader = dict(batch_size=128)

_base_ = [
    '../_base_/models/conformer/tiny-p16.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_conformer.py',
    '../_base_/default_runtime.py'
]

train_dataloader = dict(batch_size=128)
Collections:
  - Name: Conformer
    Metadata:
      Training Data: ImageNet-1k
      Architecture:
        - Layer Normalization
        - Scaled Dot-Product Attention
        - Dropout
    Paper:
      URL: https://arxiv.org/abs/2105.03889
      Title: "Conformer: Local Features Coupling Global Representations for Visual Recognition"
    README: configs/conformer/README.md
    Code:
      URL: https://github.com/open-mmlab/mmpretrain/blob/v0.19.0/mmcls/models/backbones/conformer.py
      Version: v0.19.0

Models:
  - Name: conformer-tiny-p16_3rdparty_in1k
    In Collection: Conformer
    Config: configs/conformer/conformer-tiny-p16_8xb128_in1k.py
    Metadata:
      FLOPs: 4899611328
      Parameters: 23524704
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 81.31
          Top 5 Accuracy: 95.60
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/conformer/conformer-tiny-p16_3rdparty_8xb128_in1k_20211206-f6860372.pth
    Converted From:
      Weights: https://drive.google.com/file/d/19SxGhKcWOR5oQSxNUWUM2MGYiaWMrF1z/view?usp=sharing
      Code: https://github.com/pengzhiliang/Conformer/blob/main/models.py#L65
  - Name: conformer-small-p16_3rdparty_in1k
    In Collection: Conformer
    Config: configs/conformer/conformer-small-p16_8xb128_in1k.py
    Metadata:
      FLOPs: 10311309312
      Parameters: 37673424
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 83.32
          Top 5 Accuracy: 96.46
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p16_3rdparty_8xb128_in1k_20211206-3065dcf5.pth
    Converted From:
      Weights: https://drive.google.com/file/d/1mpOlbLaVxOfEwV4-ha78j_1Ebqzj2B83/view?usp=sharing
      Code: https://github.com/pengzhiliang/Conformer/blob/main/models.py#L73
  - Name: conformer-small-p32_8xb128_in1k
    In Collection: Conformer
    Config: configs/conformer/conformer-small-p32_8xb128_in1k.py
    Metadata:
      FLOPs: 7087281792
      Parameters: 38853072
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 81.96
          Top 5 Accuracy: 96.02
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p32_8xb128_in1k_20211206-947a0816.pth
  - Name: conformer-base-p16_3rdparty_in1k
    In Collection: Conformer
    Config: configs/conformer/conformer-base-p16_8xb128_in1k.py
    Metadata:
      FLOPs: 22892078080
      Parameters: 83289136
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 83.82
          Top 5 Accuracy: 96.59
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/conformer/conformer-base-p16_3rdparty_8xb128_in1k_20211206-bfdf8637.pth
    Converted From:
      Weights: https://drive.google.com/file/d/1oeQ9LSOGKEUaYGu7WTlUGl3KDsQIi0MA/view?usp=sharing
      Code: https://github.com/pengzhiliang/Conformer/blob/main/models.py#L89
# ConvMixer
> [Patches Are All You Need?](https://arxiv.org/abs/2201.09792)
<!-- [ALGORITHM] -->
## Abstract
Although convolutional networks have been the dominant architecture for vision tasks for many years, recent experiments have shown that Transformer-based models, most notably the Vision Transformer (ViT), may exceed their performance in some settings. However, due to the quadratic runtime of the self-attention layers in Transformers, ViTs require the use of patch embeddings, which group together small regions of the image into single input features, in order to be applied to larger image sizes. This raises a question: Is the performance of ViTs due to the inherently-more-powerful Transformer architecture, or is it at least partly due to using patches as the input representation? In this paper, we present some evidence for the latter: specifically, we propose the ConvMixer, an extremely simple model that is similar in spirit to the ViT and the even-more-basic MLP-Mixer in that it operates directly on patches as input, separates the mixing of spatial and channel dimensions, and maintains equal size and resolution throughout the network. In contrast, however, the ConvMixer uses only standard convolutions to achieve the mixing steps. Despite its simplicity, we show that the ConvMixer outperforms the ViT, MLP-Mixer, and some of their variants for similar parameter counts and data set sizes, in addition to outperforming classical vision models such as the ResNet.
<div align=center>
<img src="https://user-images.githubusercontent.com/42952108/156284977-abf2245e-d9ba-4e0d-8e10-c0664a20f4c8.png" width="100%"/>
</div>
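Because the abstract leans so heavily on the model's simplicity, a compact rendition of the architecture it describes is worth seeing. The sketch below follows the recipe from the paper: a convolutional patch embedding, then repeated depthwise "spatial mixing" with a residual and pointwise "channel mixing", each followed by GELU and BatchNorm. It is an illustration only (not the backbone implementation behind the configs in this folder), the default arguments merely mimic the ConvMixer-768/32 setting, and `padding='same'` assumes a reasonably recent PyTorch.

```python
import torch
import torch.nn as nn


class Residual(nn.Module):
    """Add a skip connection around an inner module."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x):
        return self.fn(x) + x


def conv_mixer(dim=768, depth=32, kernel_size=7, patch_size=7, num_classes=1000):
    """Minimal ConvMixer: patch embedding, `depth` mixing blocks, then a
    global-average-pooled linear classifier."""
    mixer_block = lambda: nn.Sequential(
        # spatial mixing: large-kernel depthwise conv with a residual
        Residual(nn.Sequential(
            nn.Conv2d(dim, dim, kernel_size, groups=dim, padding='same'),
            nn.GELU(),
            nn.BatchNorm2d(dim))),
        # channel mixing: pointwise (1x1) conv
        nn.Conv2d(dim, dim, kernel_size=1),
        nn.GELU(),
        nn.BatchNorm2d(dim))
    return nn.Sequential(
        nn.Conv2d(3, dim, kernel_size=patch_size, stride=patch_size),  # patch embedding
        nn.GELU(),
        nn.BatchNorm2d(dim),
        *[mixer_block() for _ in range(depth)],
        nn.AdaptiveAvgPool2d(1),
        nn.Flatten(),
        nn.Linear(dim, num_classes))


logits = conv_mixer()(torch.rand(1, 3, 224, 224))  # -> shape (1, 1000)
```

Keeping the resolution constant throughout and mixing spatial and channel dimensions separately is exactly the ViT/MLP-Mixer design the abstract refers to, only realized with standard convolutions.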
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('convmixer-768-32_3rdparty_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('convmixer-768-32_3rdparty_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/convmixer/convmixer-768-32_10xb64_in1k.py https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-768-32_3rdparty_10xb64_in1k_20220323-bca1f7b8.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :---------------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :----------------------------------------: | :------------------------------------------------------------------------: |
| `convmixer-768-32_3rdparty_in1k`\* | From scratch | 21.11 | 19.62 | 80.16 | 95.08 | [config](convmixer-768-32_10xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-768-32_3rdparty_10xb64_in1k_20220323-bca1f7b8.pth) |
| `convmixer-1024-20_3rdparty_in1k`\* | From scratch | 24.38 | 5.55 | 76.94 | 93.36 | [config](convmixer-1024-20_10xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-1024-20_3rdparty_10xb64_in1k_20220323-48f8aeba.pth) |
| `convmixer-1536-20_3rdparty_in1k`\* | From scratch | 51.63 | 48.71 | 81.37 | 95.61 | [config](convmixer-1536-20_10xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-1536_20_3rdparty_10xb64_in1k_20220323-ea5786f3.pth) |
*Models with * are converted from the [official repo](https://github.com/locuslab/convmixer). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@misc{trockman2022patches,
title={Patches Are All You Need?},
author={Asher Trockman and J. Zico Kolter},
year={2022},
eprint={2201.09792},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
_base_ = [
    '../_base_/models/convmixer/convmixer-1024-20.py',
    '../_base_/datasets/imagenet_bs64_convmixer_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py',
]

# schedule setting
optim_wrapper = dict(
    optimizer=dict(lr=0.01),
    clip_grad=dict(max_norm=5.0),
)

param_scheduler = [
    # warm up learning rate scheduler
    dict(
        type='LinearLR',
        start_factor=1e-3,
        by_epoch=True,
        begin=0,
        end=20,
        # update by iter
        convert_to_iter_based=True),
    # main learning rate scheduler
    dict(
        type='CosineAnnealingLR',
        T_max=130,
        eta_min=1e-5,
        by_epoch=True,
        begin=20,
        end=150)
]
train_cfg = dict(by_epoch=True, max_epochs=150)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (10 GPUs) x (64 samples per GPU)
auto_scale_lr = dict(base_batch_size=640)
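# The two schedulers above combine into a single curve: the LR starts at
# lr * 1e-3 = 1e-5, ramps linearly to lr = 0.01 over the first 20 epochs, then
# follows a cosine decay from 0.01 down to eta_min = 1e-5 over epochs 20-150.
#
# `auto_scale_lr` is typically only applied when training is launched with the
# `--auto-scale-lr` option. Assuming the usual linear scaling rule, running on
# 8 GPUs x 64 samples = 512 images per step would rescale the LR to
# 0.01 * 512 / 640 = 0.008 instead of 0.01.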

_base_ = [
    '../_base_/models/convmixer/convmixer-1536-20.py',
    '../_base_/datasets/imagenet_bs64_convmixer_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py',
]

# schedule setting
optim_wrapper = dict(
    optimizer=dict(lr=0.01),
    clip_grad=dict(max_norm=5.0),
)

param_scheduler = [
    # warm up learning rate scheduler
    dict(
        type='LinearLR',
        start_factor=1e-3,
        by_epoch=True,
        begin=0,
        end=20,
        # update by iter
        convert_to_iter_based=True),
    # main learning rate scheduler
    dict(
        type='CosineAnnealingLR',
        T_max=130,
        eta_min=1e-5,
        by_epoch=True,
        begin=20,
        end=150)
]
train_cfg = dict(by_epoch=True, max_epochs=150)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (10 GPUs) x (64 samples per GPU)
auto_scale_lr = dict(base_batch_size=640)

_base_ = [
    '../_base_/models/convmixer/convmixer-768-32.py',
    '../_base_/datasets/imagenet_bs64_convmixer_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py',
]

# schedule setting
optim_wrapper = dict(
    optimizer=dict(lr=0.01),
    clip_grad=dict(max_norm=5.0),
)
train_cfg = dict(by_epoch=True, max_epochs=300)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (10 GPUs) x (64 samples per GPU)
auto_scale_lr = dict(base_batch_size=640)
Collections:
  - Name: ConvMixer
    Metadata:
      Training Data: ImageNet-1k
      Architecture:
        - 1x1 Convolution
        - LayerScale
    Paper:
      URL: https://arxiv.org/abs/2201.09792
      Title: Patches Are All You Need?
    README: configs/convmixer/README.md

Models:
  - Name: convmixer-768-32_3rdparty_in1k
    Metadata:
      FLOPs: 19623051264
      Parameters: 21110248
    In Collection: ConvMixer
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 80.16
          Top 5 Accuracy: 95.08
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-768-32_3rdparty_10xb64_in1k_20220323-bca1f7b8.pth
    Config: configs/convmixer/convmixer-768-32_10xb64_in1k.py
    Converted From:
      Weights: https://github.com/tmp-iclr/convmixer/releases/download/v1.0/convmixer_768_32_ks7_p7_relu.pth.tar
      Code: https://github.com/locuslab/convmixer
  - Name: convmixer-1024-20_3rdparty_in1k
    Metadata:
      FLOPs: 5550112768
      Parameters: 24383464
    In Collection: ConvMixer
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 76.94
          Top 5 Accuracy: 93.36
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-1024-20_3rdparty_10xb64_in1k_20220323-48f8aeba.pth
    Config: configs/convmixer/convmixer-1024-20_10xb64_in1k.py
    Converted From:
      Weights: https://github.com/tmp-iclr/convmixer/releases/download/v1.0/convmixer_1024_20_ks9_p14.pth.tar
      Code: https://github.com/locuslab/convmixer
  - Name: convmixer-1536-20_3rdparty_in1k
    Metadata:
      FLOPs: 48713170944
      Parameters: 51625960
    In Collection: ConvMixer
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 81.37
          Top 5 Accuracy: 95.61
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/convmixer/convmixer-1536_20_3rdparty_10xb64_in1k_20220323-ea5786f3.pth
    Config: configs/convmixer/convmixer-1536-20_10xb64_in1k.py
    Converted From:
      Weights: https://github.com/tmp-iclr/convmixer/releases/download/v1.0/convmixer_1536_20_ks9_p7.pth.tar
      Code: https://github.com/locuslab/convmixer
# ConvNeXt
> [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545v1)
<!-- [ALGORITHM] -->
## Introduction
**ConvNeXt**, initially described in [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545v1), is a pure convolutional model (ConvNet) inspired by the design of Vision Transformers. It has a pyramid structure and achieves competitive performance on various vision tasks, with simplicity and efficiency.
<div align=center>
<img src="https://user-images.githubusercontent.com/8370623/148624004-e9581042-ea4d-4e10-b3bd-42c92b02053b.png" width="100%"/>
</div>
## Abstract
<details>
<summary>Show the paper's abstract</summary>
<br>
The "Roaring 20s" of visual recognition began with the introduction of Vision Transformers (ViTs), which quickly superseded ConvNets as the state-of-the-art image classification model. A vanilla ViT, on the other hand, faces difficulties when applied to general computer vision tasks such as object detection and semantic segmentation. It is the hierarchical Transformers (e.g., Swin Transformers) that reintroduced several ConvNet priors, making Transformers practically viable as a generic vision backbone and demonstrating remarkable performance on a wide variety of vision tasks. However, the effectiveness of such hybrid approaches is still largely credited to the intrinsic superiority of Transformers, rather than the inherent inductive biases of convolutions. In this work, we reexamine the design spaces and test the limits of what a pure ConvNet can achieve. We gradually "modernize" a standard ResNet toward the design of a vision Transformer, and discover several key components that contribute to the performance difference along the way. The outcome of this exploration is a family of pure ConvNet models dubbed ConvNeXt. Constructed entirely from standard ConvNet modules, ConvNeXts compete favorably with Transformers in terms of accuracy and scalability, achieving 87.8% ImageNet top-1 accuracy and outperforming Swin Transformers on COCO detection and ADE20K segmentation, while maintaining the simplicity and efficiency of standard ConvNets.
</br>
</details>
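The "modernization" the abstract describes ends in a block that looks quite different from a classic ResNet bottleneck. The sketch below shows that block roughly as the paper presents it: a 7x7 depthwise conv, LayerNorm, an inverted-bottleneck MLP applied in channels-last layout, LayerScale, and a residual. It is an illustrative reimplementation rather than the MMPretrain backbone code, and stochastic depth is omitted for brevity.

```python
import torch
import torch.nn as nn


class ConvNeXtBlock(nn.Module):
    """Sketch of a single ConvNeXt block."""

    def __init__(self, dim, mlp_ratio=4, layer_scale_init=1e-6):
        super().__init__()
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = nn.LayerNorm(dim)
        self.pwconv1 = nn.Linear(dim, mlp_ratio * dim)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(mlp_ratio * dim, dim)
        self.gamma = nn.Parameter(layer_scale_init * torch.ones(dim))  # LayerScale

    def forward(self, x):                       # x: (N, C, H, W)
        shortcut = x
        x = self.dwconv(x)                      # large-kernel depthwise conv
        x = x.permute(0, 2, 3, 1)               # channels-last for LN / Linear
        x = self.pwconv2(self.act(self.pwconv1(self.norm(x))))
        x = self.gamma * x
        x = x.permute(0, 3, 1, 2)               # back to channels-first
        return shortcut + x


y = ConvNeXtBlock(dim=96)(torch.rand(1, 96, 56, 56))  # output keeps the input shape
```

Stacking such blocks in four stages with downsampling layers in between gives the pyramid structure mentioned in the introduction.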
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('convnext-tiny_32xb128_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('convnext-tiny_32xb128_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/convnext/convnext-tiny_32xb128_in1k.py
```
Test:
```shell
python tools/test.py configs/convnext/convnext-tiny_32xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.pth
```
<!-- [TABS-END] -->
## Models and results
### Pretrained models
| Model | Params (M) | Flops (G) | Config | Download |
| :--------------------------------- | :--------: | :-------: | :---------------------------------------: | :--------------------------------------------------------------------------------------------------------: |
| `convnext-base_3rdparty_in21k`\* | 88.59 | 15.36 | [config](convnext-base_32xb128_in21k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in21k_20220124-13b83eec.pth) |
| `convnext-large_3rdparty_in21k`\* | 197.77 | 34.37 | [config](convnext-large_64xb64_in21k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in21k_20220124-41b5a79f.pth) |
| `convnext-xlarge_3rdparty_in21k`\* | 350.20 | 60.93 | [config](convnext-xlarge_64xb64_in21k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_3rdparty_in21k_20220124-f909bad7.pth) |
*Models with * are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt). The config files of these models are only for inference. We haven't reproduced the training results.*
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------------------------------------------------ | :----------: | :--------: | :-------: | :-------: | :-------: | :--------------------------------------------: | :------------------------------------------------------: |
| `convnext-tiny_32xb128_in1k` | From scratch | 28.59 | 4.46 | 82.14 | 96.06 | [config](convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.json) |
| `convnext-tiny_32xb128-noema_in1k` | From scratch | 28.59 | 4.46 | 81.95 | 95.89 | [config](convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128-noema_in1k_20221208-5d4509c7.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.json) |
| `convnext-tiny_in21k-pre_3rdparty_in1k`\* | ImageNet-21k | 28.59 | 4.46 | 82.90 | 96.62 | [config](convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_in21k-pre_3rdparty_in1k_20221219-7501e534.pth) |
| `convnext-tiny_in21k-pre_3rdparty_in1k-384px`\* | ImageNet-21k | 28.59 | 13.14 | 84.11 | 97.14 | [config](convnext-tiny_32xb128_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_in21k-pre_3rdparty_in1k-384px_20221219-c1182362.pth) |
| `convnext-small_32xb128_in1k` | From scratch | 50.22 | 8.69 | 83.16 | 96.56 | [config](convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128_in1k_20221207-4ab7052c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128_in1k_20221207-4ab7052c.json) |
| `convnext-small_32xb128-noema_in1k` | From scratch | 50.22 | 8.69 | 83.21 | 96.48 | [config](convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128-noema_in1k_20221208-4a618995.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128_in1k_20221207-4ab7052c.json) |
| `convnext-small_in21k-pre_3rdparty_in1k`\* | ImageNet-21k | 50.22 | 8.69 | 84.59 | 97.41 | [config](convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_in21k-pre_3rdparty_in1k_20221219-aeca4c93.pth) |
| `convnext-small_in21k-pre_3rdparty_in1k-384px`\* | ImageNet-21k | 50.22 | 25.58 | 85.75 | 97.88 | [config](convnext-small_32xb128_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_in21k-pre_3rdparty_in1k-384px_20221219-96f0bb87.pth) |
| `convnext-base_32xb128_in1k` | From scratch | 88.59 | 15.36 | 83.66 | 96.74 | [config](convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128_in1k_20221207-fbdb5eb9.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128_in1k_20221207-fbdb5eb9.json) |
| `convnext-base_32xb128-noema_in1k` | From scratch | 88.59 | 15.36 | 83.64 | 96.61 | [config](convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128-noema_in1k_20221208-f8182678.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128_in1k_20221207-fbdb5eb9.json) |
| `convnext-base_3rdparty_in1k`\* | From scratch | 88.59 | 15.36 | 83.85 | 96.74 | [config](convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth) |
| `convnext-base_3rdparty-noema_in1k`\* | From scratch | 88.59 | 15.36 | 83.71 | 96.60 | [config](convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128-noema_in1k_20220222-dba4f95f.pth) |
| `convnext-base_3rdparty_in1k-384px`\* | From scratch | 88.59 | 45.21 | 85.10 | 97.34 | [config](convnext-base_32xb128_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in1k-384px_20221219-c8f1dc2b.pth) |
| `convnext-base_in21k-pre_3rdparty_in1k`\* | ImageNet-21k | 88.59 | 15.36 | 85.81 | 97.86 | [config](convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth) |
| `convnext-base_in21k-pre-3rdparty_in1k-384px`\* | ImageNet-21k | 88.59 | 45.21 | 86.82 | 98.25 | [config](convnext-base_32xb128_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_in1k-384px_20221219-4570f792.pth) |
| `convnext-large_3rdparty_in1k`\* | From scratch | 197.77 | 34.37 | 84.30 | 96.89 | [config](convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth) |
| `convnext-large_3rdparty_in1k-384px`\* | From scratch | 197.77 | 101.10 | 85.50 | 97.59 | [config](convnext-large_64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in1k-384px_20221219-6dd29d10.pth) |
| `convnext-large_in21k-pre_3rdparty_in1k`\* | ImageNet-21k | 197.77 | 34.37 | 86.61 | 98.04 | [config](convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth) |
| `convnext-large_in21k-pre-3rdparty_in1k-384px`\* | ImageNet-21k | 197.77 | 101.10 | 87.46 | 98.37 | [config](convnext-large_64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_in1k-384px_20221219-6d38dd66.pth) |
| `convnext-xlarge_in21k-pre_3rdparty_in1k`\* | ImageNet-21k | 350.20 | 60.93 | 86.97 | 98.20 | [config](convnext-xlarge_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) |
| `convnext-xlarge_in21k-pre-3rdparty_in1k-384px`\* | ImageNet-21k | 350.20 | 179.20 | 87.76 | 98.55 | [config](convnext-xlarge_64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_in1k-384px_20221219-b161bc14.pth) |
*Models with * are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@Article{liu2022convnet,
author = {Zhuang Liu and Hanzi Mao and Chao-Yuan Wu and Christoph Feichtenhofer and Trevor Darrell and Saining Xie},
title = {A ConvNet for the 2020s},
journal = {arXiv preprint arXiv:2201.03545},
year = {2022},
}
```
_base_ = [
    '../_base_/models/convnext/convnext-base.py',
    '../_base_/datasets/imagenet_bs64_swin_384.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py',
]

# dataset setting
train_dataloader = dict(batch_size=128)

# schedule setting
optim_wrapper = dict(
    optimizer=dict(lr=4e-3),
    clip_grad=dict(max_norm=5.0),
)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
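# EMAHook keeps an exponential moving average of the model weights and swaps it
# in for validation/testing and checkpointing. With mmengine's default
# ExponentialMovingAverage, each update is roughly
#   ema_param = (1 - momentum) * ema_param + momentum * param,
# so momentum=4e-5 means the averaged weights change very slowly.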
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)

_base_ = [
    '../_base_/models/convnext/convnext-base.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py',
]

# dataset setting
train_dataloader = dict(batch_size=128)

# schedule setting
optim_wrapper = dict(
    optimizer=dict(lr=4e-3),
    clip_grad=None,
)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)

_base_ = [
    '../_base_/models/convnext/convnext-base.py',
    '../_base_/datasets/imagenet21k_bs128.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py',
]

# model setting
model = dict(head=dict(num_classes=21841))

# dataset setting
data_preprocessor = dict(num_classes=21841)
train_dataloader = dict(batch_size=128)

# schedule setting
optim_wrapper = dict(
    optimizer=dict(lr=4e-3),
    clip_grad=dict(max_norm=5.0),
)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (32 GPUs) x (128 samples per GPU)
auto_scale_lr = dict(base_batch_size=4096)