_base_ = ['../mobileone-s1_8xb32_in1k.py']
model = dict(backbone=dict(deploy=True))
_base_ = ['../mobileone-s2_8xb32_in1k.py']
model = dict(backbone=dict(deploy=True))
_base_ = ['../mobileone-s3_8xb32_in1k.py']
model = dict(backbone=dict(deploy=True))
_base_ = ['../mobileone-s4_8xb32_in1k.py']
model = dict(backbone=dict(deploy=True))
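The four snippets above are the deploy-mode variants of the MobileOne configs: each inherits the corresponding training config and only flips `backbone.deploy` to `True`, i.e. the re-parameterized single-branch form used for inference. A hedged usage sketch, assuming the MobileOne backbone in mmpretrain exposes a `switch_to_deploy()` method to fuse the training-time branches (verify against your installed version):

```python
import torch
from mmpretrain import get_model

# Build the training-time (multi-branch) model, then fuse its branches into
# plain convolutions for fast inference. `switch_to_deploy` is the assumed
# re-parameterization entry point; check your mmpretrain release.
model = get_model('mobileone-s0_8xb32_in1k', pretrained=True)
model.backbone.switch_to_deploy()
model.eval()

with torch.no_grad():
    out = model(torch.rand(1, 3, 224, 224))
print(type(out))
```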
Collections:
- Name: MobileOne
Metadata:
Training Data: ImageNet-1k
Architecture:
- re-parameterization Convolution
- VGG-style Neural Network
- Depthwise Convolution
- Pointwise Convolution
Paper:
URL: https://arxiv.org/abs/2206.04040
Title: 'MobileOne: An Improved One millisecond Mobile Backbone'
README: configs/mobileone/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/v1.0.0rc1/configs/mobileone/metafile.yml
Version: v1.0.0rc1
Models:
- Name: mobileone-s0_8xb32_in1k
In Collection: MobileOne
Config: configs/mobileone/mobileone-s0_8xb32_in1k.py
Metadata:
FLOPs: 274136576 # 0.27G
Parameters: 2078504 # 2.08M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 71.34
Top 5 Accuracy: 89.87
Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s0_8xb32_in1k_20221110-0bc94952.pth
- Name: mobileone-s1_8xb32_in1k
In Collection: MobileOne
Config: configs/mobileone/mobileone-s1_8xb32_in1k.py
Metadata:
FLOPs: 823839744 # 0.82G
Parameters: 4764840 # 4.76M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 75.72
Top 5 Accuracy: 92.54
Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s1_8xb32_in1k_20221110-ceeef467.pth
- Name: mobileone-s2_8xb32_in1k
In Collection: MobileOne
Config: configs/mobileone/mobileone-s2_8xb32_in1k.py
Metadata:
FLOPs: 1296478848 # 1.30G
Parameters: 7808168 # 7.81M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 77.37
Top 5 Accuracy: 93.34
Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s2_8xb32_in1k_20221110-9c7ecb97.pth
- Name: mobileone-s3_8xb32_in1k
In Collection: MobileOne
Config: configs/mobileone/mobileone-s3_8xb32_in1k.py
Metadata:
FLOPs: 1893842944 # 1.89G
Parameters: 10078312 # 10.08M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 78.06
Top 5 Accuracy: 93.83
Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s3_8xb32_in1k_20221110-c95eb3bf.pth
- Name: mobileone-s4_8xb32_in1k
In Collection: MobileOne
Config: configs/mobileone/mobileone-s4_8xb32_in1k.py
Metadata:
FLOPs: 2979222528 # 2.98G
Parameters: 14838352 # 14.84M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 79.69
Top 5 Accuracy: 94.46
Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s4_8xb32_in1k_20221110-28d888cb.pth
_base_ = [
'../_base_/models/mobileone/mobileone_s0.py',
'../_base_/datasets/imagenet_bs32_pil_resize.py',
'../_base_/schedules/imagenet_bs256_coslr_coswd_300e.py',
'../_base_/default_runtime.py'
]
# schedule settings
optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.))
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
custom_hooks = [
dict(
type='EMAHook',
momentum=5e-4,
priority='ABOVE_NORMAL',
update_buffers=True)
]
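The `EMAHook` above maintains an exponential moving average of the model weights with a very small momentum (5e-4). A minimal sketch of the update rule, assuming the MMEngine convention `avg = (1 - momentum) * avg + momentum * param` (check your mmengine version for the exact semantics):

```python
import torch

def ema_update(avg_params, params, momentum=5e-4):
    """One EMA step: avg <- (1 - momentum) * avg + momentum * param."""
    with torch.no_grad():
        for avg, p in zip(avg_params, params):
            avg.mul_(1 - momentum).add_(p, alpha=momentum)

# toy usage: the averaged weights drift slowly toward the current weights
avg = [torch.zeros(3)]
cur = [torch.ones(3)]
for _ in range(1000):
    ema_update(avg, cur)
print(avg[0])
```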
_base_ = [
'../_base_/models/mobileone/mobileone_s1.py',
'../_base_/datasets/imagenet_bs32_pil_resize.py',
'../_base_/schedules/imagenet_bs256_coslr_coswd_300e.py',
'../_base_/default_runtime.py'
]
# schedule settings
optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.))
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
bgr_mean = _base_.data_preprocessor['mean'][::-1]
base_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='RandomResizedCrop', scale=224, backend='pillow'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=7,
magnitude_std=0.5,
hparams=dict(pad_val=[round(x) for x in bgr_mean])),
dict(type='PackInputs')
]
import copy # noqa: E402
# modify start epoch's RandomResizedCrop.scale to 160
train_pipeline_1e = copy.deepcopy(base_train_pipeline)
train_pipeline_1e[1]['scale'] = 160
train_pipeline_1e[3]['magnitude_level'] *= 0.1
_base_.train_dataloader.dataset.pipeline = train_pipeline_1e
# modify 37 epoch's RandomResizedCrop.scale to 192
train_pipeline_37e = copy.deepcopy(base_train_pipeline)
train_pipeline_37e[1]['scale'] = 192
train_pipeline_37e[3]['magnitude_level'] *= 0.2
# modify 112 epoch's RandomResizedCrop.scale to 224
train_pipeline_112e = copy.deepcopy(base_train_pipeline)
train_pipeline_112e[1]['scale'] = 224
train_pipeline_112e[3]['magnitude_level'] *= 0.3
custom_hooks = [
dict(
type='SwitchRecipeHook',
schedule=[
dict(action_epoch=37, pipeline=train_pipeline_37e),
dict(action_epoch=112, pipeline=train_pipeline_112e),
]),
dict(
type='EMAHook',
momentum=5e-4,
priority='ABOVE_NORMAL',
update_buffers=True)
]
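The `SwitchRecipeHook` above implements the progressive training recipe: images start at 160 px with a reduced RandAugment magnitude, switch to 192 px at epoch 37, and to 224 px at epoch 112. A minimal sketch of the switching mechanism (illustrative only, not mmpretrain's actual hook implementation):

```python
# Illustrative stand-in for a recipe-switching hook: swap the training
# pipeline when a scheduled epoch is reached.
class SimpleRecipeSwitcher:
    def __init__(self, schedule):
        # schedule: {action_epoch: pipeline}, e.g. {37: pipeline_192, 112: pipeline_224}
        self.schedule = schedule

    def before_train_epoch(self, dataset, epoch):
        # epoch is assumed 1-based in this sketch
        if epoch in self.schedule:
            dataset.pipeline = self.schedule[epoch]


class DummyDataset:
    pipeline = 'pipeline_160'


dataset = DummyDataset()
switcher = SimpleRecipeSwitcher({37: 'pipeline_192', 112: 'pipeline_224'})
for epoch in range(1, 301):
    switcher.before_train_epoch(dataset, epoch)
print(dataset.pipeline)  # 'pipeline_224' after epoch 112
```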
_base_ = [
'../_base_/models/mobileone/mobileone_s2.py',
'../_base_/datasets/imagenet_bs32_pil_resize.py',
'../_base_/schedules/imagenet_bs256_coslr_coswd_300e.py',
'../_base_/default_runtime.py'
]
# schedule settings
optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.))
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
import copy # noqa: E402
bgr_mean = _base_.data_preprocessor['mean'][::-1]
base_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='RandomResizedCrop', scale=224, backend='pillow'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=7,
magnitude_std=0.5,
hparams=dict(pad_val=[round(x) for x in bgr_mean])),
dict(type='PackInputs')
]
# modify start epoch RandomResizedCrop.scale to 160
# and RA.magnitude_level * 0.3
train_pipeline_1e = copy.deepcopy(base_train_pipeline)
train_pipeline_1e[1]['scale'] = 160
train_pipeline_1e[3]['magnitude_level'] *= 0.3
_base_.train_dataloader.dataset.pipeline = train_pipeline_1e
# modify 37 epoch's RandomResizedCrop.scale to 192
# and RA.magnitude_level * 0.7
train_pipeline_37e = copy.deepcopy(base_train_pipeline)
train_pipeline_37e[1]['scale'] = 192
train_pipeline_37e[3]['magnitude_level'] *= 0.7
# modify 112 epoch's RandomResizedCrop.scale to 224
# and RA.magnitude_level * 1.0
train_pipeline_112e = copy.deepcopy(base_train_pipeline)
train_pipeline_112e[1]['scale'] = 224
train_pipeline_112e[3]['magnitude_level'] *= 1.0
custom_hooks = [
dict(
type='SwitchRecipeHook',
schedule=[
dict(action_epoch=37, pipeline=train_pipeline_37e),
dict(action_epoch=112, pipeline=train_pipeline_112e),
]),
dict(
type='EMAHook',
momentum=5e-4,
priority='ABOVE_NORMAL',
update_buffers=True)
]
_base_ = [
'../_base_/models/mobileone/mobileone_s3.py',
'../_base_/datasets/imagenet_bs32_pil_resize.py',
'../_base_/schedules/imagenet_bs256_coslr_coswd_300e.py',
'../_base_/default_runtime.py'
]
# schedule settings
optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.))
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
import copy # noqa: E402
bgr_mean = _base_.data_preprocessor['mean'][::-1]
base_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='RandomResizedCrop', scale=224, backend='pillow'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=7,
magnitude_std=0.5,
hparams=dict(pad_val=[round(x) for x in bgr_mean])),
dict(type='PackInputs')
]
# modify start epoch RandomResizedCrop.scale to 160
# and RA.magnitude_level * 0.3
train_pipeline_1e = copy.deepcopy(base_train_pipeline)
train_pipeline_1e[1]['scale'] = 160
train_pipeline_1e[3]['magnitude_level'] *= 0.3
_base_.train_dataloader.dataset.pipeline = train_pipeline_1e
# modify 37 epoch's RandomResizedCrop.scale to 192
# and RA.magnitude_level * 0.7
train_pipeline_37e = copy.deepcopy(base_train_pipeline)
train_pipeline_37e[1]['scale'] = 192
train_pipeline_37e[3]['magnitude_level'] *= 0.7
# modify 112 epoch's RandomResizedCrop.scale to 224
# and RA.magnitude_level * 1.0
train_pipeline_112e = copy.deepcopy(base_train_pipeline)
train_pipeline_112e[1]['scale'] = 224
train_pipeline_112e[3]['magnitude_level'] *= 1.0
custom_hooks = [
dict(
type='SwitchRecipeHook',
schedule=[
dict(action_epoch=37, pipeline=train_pipeline_37e),
dict(action_epoch=112, pipeline=train_pipeline_112e),
]),
dict(
type='EMAHook',
momentum=5e-4,
priority='ABOVE_NORMAL',
update_buffers=True)
]
_base_ = [
'../_base_/models/mobileone/mobileone_s4.py',
'../_base_/datasets/imagenet_bs32_pil_resize.py',
'../_base_/schedules/imagenet_bs256_coslr_coswd_300e.py',
'../_base_/default_runtime.py'
]
# schedule settings
optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.))
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
bgr_mean = _base_.data_preprocessor['mean'][::-1]
base_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='RandomResizedCrop', scale=224, backend='pillow'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=7,
magnitude_std=0.5,
hparams=dict(pad_val=[round(x) for x in bgr_mean])),
dict(type='PackInputs')
]
import copy # noqa: E402
# modify start epoch RandomResizedCrop.scale to 160
# and RA.magnitude_level * 0.3
train_pipeline_1e = copy.deepcopy(base_train_pipeline)
train_pipeline_1e[1]['scale'] = 160
train_pipeline_1e[3]['magnitude_level'] *= 0.3
_base_.train_dataloader.dataset.pipeline = train_pipeline_1e
# modify 37 epoch's RandomResizedCrop.scale to 192
# and RA.magnitude_level * 0.7
train_pipeline_37e = copy.deepcopy(base_train_pipeline)
train_pipeline_37e[1]['scale'] = 192
train_pipeline_37e[3]['magnitude_level'] *= 0.7
# modify 112 epoch's RandomResizedCrop.scale to 224
# and RA.magnitude_level * 1.0
train_pipeline_112e = copy.deepcopy(base_train_pipeline)
train_pipeline_112e[1]['scale'] = 224
train_pipeline_112e[3]['magnitude_level'] *= 1.0
custom_hooks = [
dict(
type='SwitchRecipeHook',
schedule=[
dict(action_epoch=37, pipeline=train_pipeline_37e),
dict(action_epoch=112, pipeline=train_pipeline_112e),
]),
dict(
type='EMAHook',
momentum=5e-4,
priority='ABOVE_NORMAL',
update_buffers=True)
]
# MobileViT
> [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178)
<!-- [ALGORITHM] -->
## Introduction
**MobileViT** is a light-weight network that combines the advantages of ViTs and CNNs: it uses the `InvertedResidual` blocks from [MobileNetV2](../mobilenet_v2/README.md) together with a `MobileViTBlock` built on [ViT](../vision_transformer/README.md)-style transformer layers to form a standard 5-stage model.
The MobileViTBlock treats transformers as convolutions to learn a global representation and combines it with regular convolution layers for local representation, yielding a block with a global receptive field. Unlike ViT, it needs no extra class token or position embeddings to model spatial relationships, so MobileViT can benefit from multi-scale inputs during training.
The paper also proposes a multi-scale training strategy that dynamically adjusts the batch size according to the image size, improving both training efficiency and final performance.
MobileViT is also shown to be effective in downstream tasks such as object detection and segmentation.
<div align=center>
<img src="https://user-images.githubusercontent.com/42952108/193229983-822bf025-89a6-4d95-b6be-76b7f1a62f2c.png" width="70%"/>
</div>
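To make the "transformers as convolutions" idea concrete, the sketch below shows the unfold, attend, fold pattern: the feature map is split into small patches, self-attention mixes information across patches that share the same intra-patch position, and the result is folded back to the original resolution. This is an illustrative toy module, not mmpretrain's `MobileViTBlock`:

```python
import torch
import torch.nn as nn

class TinyGlobalMixer(nn.Module):
    """Illustrative unfold -> transformer -> fold pattern (patch size 2x2)."""

    def __init__(self, dim, patch=2, num_heads=4):
        super().__init__()
        self.patch = patch
        self.attn = nn.TransformerEncoderLayer(
            d_model=dim, nhead=num_heads, batch_first=True)

    def forward(self, x):  # x: (B, C, H, W), H and W divisible by patch
        b, c, h, w = x.shape
        p = self.patch
        # group pixels by their position inside each patch, so attention
        # mixes information across patches (global), not within a patch
        x = x.reshape(b, c, h // p, p, w // p, p)
        x = x.permute(0, 3, 5, 2, 4, 1)                    # (B, p, p, H/p, W/p, C)
        x = x.reshape(b * p * p, (h // p) * (w // p), c)
        x = self.attn(x)                                   # attention over the patch grid
        x = x.reshape(b, p, p, h // p, w // p, c)
        x = x.permute(0, 5, 3, 1, 4, 2).reshape(b, c, h, w)
        return x

out = TinyGlobalMixer(dim=64)(torch.rand(2, 64, 32, 32))
print(out.shape)  # torch.Size([2, 64, 32, 32])
```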
## Abstract
<details>
<summary>Show the paper's abstract</summary>
<br>
Light-weight convolutional neural networks (CNNs) are the de-facto for mobile vision tasks. Their spatial inductive biases allow them to learn representations with fewer parameters across different vision tasks. However, these networks are spatially local. To learn global representations, self-attention-based vision transformers (ViTs) have been adopted. Unlike CNNs, ViTs are heavy-weight. In this paper, we ask the following question: is it possible to combine the strengths of CNNs and ViTs to build a light-weight and low latency network for mobile vision tasks? Towards this end, we introduce MobileViT, a light-weight and general-purpose vision transformer for mobile devices. MobileViT presents a different perspective for the global processing of information with transformers, i.e., transformers as convolutions. Our results show that MobileViT significantly outperforms CNN- and ViT-based networks across different tasks and datasets. On the ImageNet-1k dataset, MobileViT achieves top-1 accuracy of 78.4% with about 6 million parameters, which is 3.2% and 6.2% more accurate than MobileNetv3 (CNN-based) and DeIT (ViT-based) for a similar number of parameters. On the MS-COCO object detection task, MobileViT is 5.7% more accurate than MobileNetv3 for a similar number of parameters.
</br>
</details>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('mobilevit-small_3rdparty_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mobilevit-small_3rdparty_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/mobilevit/mobilevit-small_8xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-small_3rdparty_in1k_20221018-cb4f741c.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :---------------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :----------------------------------------: | :------------------------------------------------------------------------: |
| `mobilevit-small_3rdparty_in1k`\* | From scratch | 5.58 | 2.03 | 78.25 | 94.09 | [config](mobilevit-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-small_3rdparty_in1k_20221018-cb4f741c.pth) |
| `mobilevit-xsmall_3rdparty_in1k`\* | From scratch | 2.32 | 1.05 | 74.75 | 92.32 | [config](mobilevit-xsmall_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-xsmall_3rdparty_in1k_20221018-be39a6e7.pth) |
| `mobilevit-xxsmall_3rdparty_in1k`\* | From scratch | 1.27 | 0.42 | 69.02 | 88.91 | [config](mobilevit-xxsmall_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-xxsmall_3rdparty_in1k_20221018-77835605.pth) |
*Models with * are converted from the [official repo](https://github.com/apple/ml-cvnets). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{mehta2021mobilevit,
title={MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer},
author={Mehta, Sachin and Rastegari, Mohammad},
journal={arXiv preprint arXiv:2110.02178},
year={2021}
}
```
Collections:
- Name: MobileViT
Metadata:
Training Data: ImageNet-1k
Architecture:
- MobileViT Block
Paper:
URL: https://arxiv.org/abs/2110.02178
Title: 'MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer'
README: configs/mobilevit/README.md
Models:
- Name: mobilevit-small_3rdparty_in1k
Metadata:
FLOPs: 2030000000
Parameters: 5580000
In Collection: MobileViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 78.25
Top 5 Accuracy: 94.09
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-small_3rdparty_in1k_20221018-cb4f741c.pth
Config: configs/mobilevit/mobilevit-small_8xb128_in1k.py
Converted From:
Weights: https://docs-assets.developer.apple.com/ml-research/models/cvnets/classification/mobilevit_s.pt
Code: https://github.com/apple/ml-cvnets
- Name: mobilevit-xsmall_3rdparty_in1k
Metadata:
FLOPs: 1050000000
Parameters: 2320000
In Collection: MobileViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 74.75
Top 5 Accuracy: 92.32
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-xsmall_3rdparty_in1k_20221018-be39a6e7.pth
Config: configs/mobilevit/mobilevit-xsmall_8xb128_in1k.py
Converted From:
Weights: https://docs-assets.developer.apple.com/ml-research/models/cvnets/classification/mobilevit_xs.pt
Code: https://github.com/apple/ml-cvnets
- Name: mobilevit-xxsmall_3rdparty_in1k
Metadata:
FLOPs: 420000000
Parameters: 1270000
In Collection: MobileViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 69.02
Top 5 Accuracy: 88.91
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-xxsmall_3rdparty_in1k_20221018-77835605.pth
Config: configs/mobilevit/mobilevit-xxsmall_8xb128_in1k.py
Converted From:
Weights: https://docs-assets.developer.apple.com/ml-research/models/cvnets/classification/mobilevit_xxs.pt
Code: https://github.com/apple/ml-cvnets
_base_ = [
'../_base_/models/mobilevit/mobilevit_s.py',
'../_base_/datasets/imagenet_bs32.py',
'../_base_/default_runtime.py',
'../_base_/schedules/imagenet_bs256.py',
]
# no normalization, following the original implementation
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0, 0, 0],
std=[255, 255, 255],
# use bgr directly
to_rgb=False,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='ResizeEdge', scale=288, edge='short'),
dict(type='CenterCrop', crop_size=256),
dict(type='PackInputs'),
]
train_dataloader = dict(batch_size=128)
val_dataloader = dict(
batch_size=128,
dataset=dict(pipeline=test_pipeline),
)
test_dataloader = val_dataloader
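Because the preprocessor above uses `mean=[0, 0, 0]` and `std=[255, 255, 255]`, it only rescales pixel values from [0, 255] to [0, 1] and applies no channel-wise normalization, matching the original implementation. A quick check of the arithmetic:

```python
import torch

# (pixel - mean) / std with mean 0 and std 255 is just division by 255
pixels = torch.tensor([0., 127.5, 255.])
mean, std = 0.0, 255.0
print((pixels - mean) / std)  # tensor([0.0000, 0.5000, 1.0000])
```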
_base_ = [
'../_base_/models/mobilevit/mobilevit_xs.py',
'../_base_/datasets/imagenet_bs32.py',
'../_base_/default_runtime.py',
'../_base_/schedules/imagenet_bs256.py',
]
# no normalization, following the original implementation
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0, 0, 0],
std=[255, 255, 255],
# use bgr directly
to_rgb=False,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='ResizeEdge', scale=288, edge='short'),
dict(type='CenterCrop', crop_size=256),
dict(type='PackInputs'),
]
train_dataloader = dict(batch_size=128)
val_dataloader = dict(
batch_size=128,
dataset=dict(pipeline=test_pipeline),
)
test_dataloader = val_dataloader
_base_ = [
'../_base_/models/mobilevit/mobilevit_xxs.py',
'../_base_/datasets/imagenet_bs32.py',
'../_base_/default_runtime.py',
'../_base_/schedules/imagenet_bs256.py',
]
# no normalization, following the original implementation
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0, 0, 0],
std=[255, 255, 255],
# use bgr directly
to_rgb=False,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='ResizeEdge', scale=288, edge='short'),
dict(type='CenterCrop', crop_size=256),
dict(type='PackInputs'),
]
train_dataloader = dict(batch_size=128)
val_dataloader = dict(
batch_size=128,
dataset=dict(pipeline=test_pipeline),
)
test_dataloader = val_dataloader
# MoCoV2
> [Improved Baselines with Momentum Contrastive Learning](https://arxiv.org/abs/2003.04297)
<!-- [ALGORITHM] -->
## Abstract
Contrastive unsupervised learning has recently shown encouraging progress, e.g., in Momentum Contrast (MoCo) and SimCLR. In this note, we verify the effectiveness of two of SimCLR’s design improvements by implementing them in the MoCo framework. With simple modifications to MoCo—namely, using an MLP projection head and more data augmentation—we establish stronger baselines that outperform SimCLR and do not require large training batches. We hope this will make state-of-the-art unsupervised learning research more accessible.
<div align=center>
<img src="https://user-images.githubusercontent.com/36138628/149720067-b65e5736-d425-45b3-93ed-6f2427fc6217.png" width="500" />
</div>
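The key change highlighted in the abstract, replacing MoCo v1's single fully connected projection layer with a 2-layer MLP head, corresponds to the `MoCoV2Neck` used in the pretraining config below. A minimal sketch of such a projection head, assuming the 2048-d ResNet-50 pooled feature and the 128-d output from the config (illustrative, not mmpretrain's implementation):

```python
import torch
import torch.nn as nn

class MLPProjectionHead(nn.Module):
    """2-layer MLP projection head sketch: 2048 -> 2048 -> 128."""

    def __init__(self, in_channels=2048, hid_channels=2048, out_channels=128):
        super().__init__()
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.mlp = nn.Sequential(
            nn.Linear(in_channels, hid_channels),
            nn.ReLU(inplace=True),
            nn.Linear(hid_channels, out_channels),
        )

    def forward(self, x):               # x: (B, 2048, 7, 7) backbone feature map
        x = self.avgpool(x).flatten(1)  # global average pool -> (B, 2048)
        return self.mlp(x)              # (B, 128) embedding for the contrastive loss

feat = torch.rand(2, 2048, 7, 7)
print(MLPProjectionHead()(feat).shape)  # torch.Size([2, 128])
```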
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('resnet50_mocov2-pre_8xb32-linear-steplr-100e_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mocov2_resnet50_8xb32-coslr-200e_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py
```
Test:
```shell
python tools/test.py configs/mocov2/benchmarks/resnet50_8xb32-linear-steplr-100e_in1k.py https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/resnet50_linear-8xb32-steplr-100e_in1k/resnet50_linear-8xb32-steplr-100e_in1k_20220825-994c4128.pth
```
<!-- [TABS-END] -->
## Models and results
### Pretrained models
| Model | Params (M) | Flops (G) | Config | Download |
| :-------------------------------------- | :--------: | :-------: | :------------------------------------------------: | :------------------------------------------------------------------------------------------: |
| `mocov2_resnet50_8xb32-coslr-200e_in1k` | 55.93 | 4.11 | [config](mocov2_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/mocov2_resnet50_8xb32-coslr-200e_in1k_20220825-b6d23c86.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/mocov2_resnet50_8xb32-coslr-200e_in1k_20220825-b6d23c86.json) |
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Config | Download |
| :---------------------------------------- | :------------------------------------------: | :--------: | :-------: | :-------: | :----------------------------------------: | :-------------------------------------------: |
| `resnet50_mocov2-pre_8xb32-linear-steplr-100e_in1k` | [MOCOV2](https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/mocov2_resnet50_8xb32-coslr-200e_in1k_20220825-b6d23c86.pth) | 25.56 | 4.11 | 67.50 | [config](benchmarks/resnet50_8xb32-linear-steplr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/resnet50_linear-8xb32-steplr-100e_in1k/resnet50_linear-8xb32-steplr-100e_in1k_20220825-994c4128.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/resnet50_linear-8xb32-steplr-100e_in1k/resnet50_linear-8xb32-steplr-100e_in1k_20220825-994c4128.json) |
## Citation
```bibtex
@article{chen2020improved,
title={Improved baselines with momentum contrastive learning},
author={Chen, Xinlei and Fan, Haoqi and Girshick, Ross and He, Kaiming},
journal={arXiv preprint arXiv:2003.04297},
year={2020}
}
```
_base_ = [
'../../_base_/models/resnet50.py',
'../../_base_/datasets/imagenet_bs32_pil_resize.py',
'../../_base_/schedules/imagenet_sgd_steplr_100e.py',
'../../_base_/default_runtime.py',
]
model = dict(
backbone=dict(
frozen_stages=4,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=30., momentum=0.9, weight_decay=0.))
# runtime settings
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
Collections:
- Name: MoCoV2
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- SGD with Momentum
- Weight Decay
Training Resources: 8x V100 GPUs
Architecture:
- ResNet
- MoCo
Paper:
Title: Improved Baselines with Momentum Contrastive Learning
URL: https://arxiv.org/abs/2003.04297
README: configs/mocov2/README.md
Models:
- Name: mocov2_resnet50_8xb32-coslr-200e_in1k
Metadata:
Epochs: 200
Batch Size: 256
FLOPs: 4109364224
Parameters: 55933312
Training Data: ImageNet-1k
In Collection: MoCoV2
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/mocov2_resnet50_8xb32-coslr-200e_in1k_20220825-b6d23c86.pth
Config: configs/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py
Downstream:
- resnet50_mocov2-pre_8xb32-linear-steplr-100e_in1k
- Name: resnet50_mocov2-pre_8xb32-linear-steplr-100e_in1k
Metadata:
Epochs: 100
Batch Size: 256
FLOPs: 4109464576
Parameters: 25557032
Training Data: ImageNet-1k
In Collection: MoCoV2
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 67.5
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/resnet50_linear-8xb32-steplr-100e_in1k/resnet50_linear-8xb32-steplr-100e_in1k_20220825-994c4128.pth
Config: configs/mocov2/benchmarks/resnet50_8xb32-linear-steplr-100e_in1k.py
_base_ = [
'../_base_/datasets/imagenet_bs32_mocov2.py',
'../_base_/schedules/imagenet_sgd_coslr_200e.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='MoCo',
queue_len=65536,
feat_dim=128,
momentum=0.001,
backbone=dict(
type='ResNet',
depth=50,
norm_cfg=dict(type='BN'),
zero_init_residual=False),
neck=dict(
type='MoCoV2Neck',
in_channels=2048,
hid_channels=2048,
out_channels=128,
with_avg_pool=True),
head=dict(
type='ContrastiveHead',
loss=dict(type='CrossEntropyLoss'),
temperature=0.2))
# keep only the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=256)
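The `ContrastiveHead` in the config above computes an InfoNCE-style loss with temperature 0.2: the positive-pair similarity and the similarities against the 65536-entry queue of negatives are scaled by the temperature and passed to cross-entropy with the positive at index 0. A hedged sketch of that computation (not the exact mmpretrain implementation):

```python
import torch
import torch.nn.functional as F

def info_nce_loss(q, k, queue, temperature=0.2):
    """InfoNCE sketch: q, k are L2-normalized (B, D); queue is (D, K) negatives."""
    pos = torch.einsum('nc,nc->n', q, k).unsqueeze(-1)   # (B, 1) positive logits
    neg = torch.einsum('nc,ck->nk', q, queue)            # (B, K) negative logits
    logits = torch.cat([pos, neg], dim=1) / temperature
    labels = torch.zeros(q.size(0), dtype=torch.long)    # positive is at index 0
    return F.cross_entropy(logits, labels)

q = F.normalize(torch.randn(4, 128), dim=1)
k = F.normalize(torch.randn(4, 128), dim=1)
queue = F.normalize(torch.randn(128, 65536), dim=0)
print(info_nce_loss(q, k, queue))
```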
# MoCoV3
> [An Empirical Study of Training Self-Supervised Vision Transformers](https://arxiv.org/abs/2104.02057)
<!-- [ALGORITHM] -->
## Abstract
This paper does not describe a novel method. Instead, it studies a straightforward, incremental, yet must-know baseline given the recent progress in computer vision: self-supervised learning for Vision Transformers (ViT). While the training recipes for standard convolutional networks have been highly mature and robust, the recipes for ViT are yet to be built, especially in the self-supervised scenarios where training becomes more challenging. In this work, we go back to basics and investigate the effects of several fundamental components for training self-supervised ViT. We observe that instability is a major issue that degrades accuracy, and it can be hidden by apparently good results. We reveal that these results are indeed partial failure, and they can be improved when training is made more stable. We benchmark ViT results in MoCo v3 and several other self-supervised frameworks, with ablations in various aspects. We discuss the currently positive evidence as well as challenges and open questions. We hope that this work will provide useful data points and experience for future research.
<div align=center>
<img src="https://user-images.githubusercontent.com/36138628/151305362-e6e8ea35-b3b8-45f6-8819-634e67083218.png" width="500" />
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('resnet50_mocov3-100e-pre_8xb128-linear-coslr-90e_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mocov3_resnet50_8xb512-amp-coslr-100e_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k.py
```
Test:
```shell
python tools/test.py configs/mocov3/benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-8f7d937e.pth
```
<!-- [TABS-END] -->
## Models and results
### Pretrained models
| Model | Params (M) | Flops (G) | Config | Download |
| :------------------------------------------------- | :--------: | :-------: | :-----------------------------------------------------------: | :--------------------------------------------------------------------: |
| `mocov3_resnet50_8xb512-amp-coslr-100e_in1k` | 68.01 | 4.11 | [config](mocov3_resnet50_8xb512-amp-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/mocov3_resnet50_8xb512-amp-coslr-100e_in1k_20220927-f1144efa.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/mocov3_resnet50_8xb512-amp-coslr-100e_in1k_20220927-f1144efa.json) |
| `mocov3_resnet50_8xb512-amp-coslr-300e_in1k` | 68.01 | 4.11 | [config](mocov3_resnet50_8xb512-amp-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/mocov3_resnet50_8xb512-amp-coslr-300e_in1k_20220927-1e4f3304.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/mocov3_resnet50_8xb512-amp-coslr-300e_in1k_20220927-1e4f3304.json) |
| `mocov3_resnet50_8xb512-amp-coslr-800e_in1k` | 68.01 | 4.11 | [config](mocov3_resnet50_8xb512-amp-coslr-800e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/mocov3_resnet50_8xb512-amp-coslr-800e_in1k_20220927-e043f51a.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/mocov3_resnet50_8xb512-amp-coslr-800e_in1k_20220927-e043f51a.json) |
| `mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k` | 84.27 | 4.61 | [config](mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k-224_20220826-08bc52f7.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k-224_20220826-08bc52f7.json) |
| `mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k` | 215.68 | 17.58 | [config](mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k-224_20220826-25213343.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k-224_20220826-25213343.json) |
| `mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k` | 652.78 | 61.60 | [config](mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k-224_20220829-9b88a442.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k-224_20220829-9b88a442.json) |
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Config | Download |
| :---------------------------------------- | :------------------------------------------: | :--------: | :-------: | :-------: | :----------------------------------------: | :-------------------------------------------: |
| `resnet50_mocov3-100e-pre_8xb128-linear-coslr-90e_in1k` | [MOCOV3 100-Epochs](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/mocov3_resnet50_8xb512-amp-coslr-100e_in1k_20220927-f1144efa.pth) | 25.56 | 4.11 | 69.60 | [config](benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-8f7d937e.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-8f7d937e.json) |
| `resnet50_mocov3-300e-pre_8xb128-linear-coslr-90e_in1k` | [MOCOV3 300-Epochs](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/mocov3_resnet50_8xb512-amp-coslr-300e_in1k_20220927-1e4f3304.pth) | 25.56 | 4.11 | 72.80 | [config](benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-d21ddac2.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-d21ddac2.json) |
| `resnet50_mocov3-800e-pre_8xb128-linear-coslr-90e_in1k` | [MOCOV3 800-Epochs](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/mocov3_resnet50_8xb512-amp-coslr-800e_in1k_20220927-e043f51a.pth) | 25.56 | 4.11 | 74.40 | [config](benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-0e97a483.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-0e97a483.json) |
| `vit-small-p16_mocov3-pre_8xb128-linear-coslr-90e_in1k` | [MOCOV3](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k-224_20220826-08bc52f7.pth) | 22.05 | 4.61 | 73.60 | [config](benchmarks/vit-small-p16_8xb128-linear-coslr-90e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/vit-small-p16_linear-8xb128-coslr-90e_in1k/vit-small-p16_linear-8xb128-coslr-90e_in1k_20220826-376674ef.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/vit-small-p16_linear-8xb128-coslr-90e_in1k/vit-small-p16_linear-8xb128-coslr-90e_in1k_20220826-376674ef.json) |
| `vit-base-p16_mocov3-pre_8xb64-coslr-150e_in1k` | [MOCOV3](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k-224_20220826-25213343.pth) | 86.57 | 17.58 | 83.00 | [config](benchmarks/vit-base-p16_8xb64-coslr-150e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/vit-base-p16_ft-8xb64-coslr-150e_in1k/vit-base-p16_ft-8xb64-coslr-150e_in1k_20220826-f1e6c442.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/vit-base-p16_ft-8xb64-coslr-150e_in1k/vit-base-p16_ft-8xb64-coslr-150e_in1k_20220826-f1e6c442.json) |
| `vit-base-p16_mocov3-pre_8xb128-linear-coslr-90e_in1k` | [MOCOV3](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k-224_20220826-25213343.pth) | 86.57 | 17.58 | 76.90 | [config](benchmarks/vit-base-p16_8xb128-linear-coslr-90e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/vit-base-p16_linear-8xb128-coslr-90e_in1k/vit-base-p16_linear-8xb128-coslr-90e_in1k_20220826-83be7758.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/vit-base-p16_linear-8xb128-coslr-90e_in1k/vit-base-p16_linear-8xb128-coslr-90e_in1k_20220826-83be7758.json) |
| `vit-large-p16_mocov3-pre_8xb64-coslr-100e_in1k` | [MOCOV3](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k-224_20220829-9b88a442.pth) | 304.33 | 61.60 | 83.70 | [config](benchmarks/vit-large-p16_8xb64-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/vit-large-p16_ft-8xb64-coslr-100e_in1k/vit-large-p16_ft-8xb64-coslr-100e_in1k_20220829-878a2f7f.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/vit-large-p16_ft-8xb64-coslr-100e_in1k/vit-large-p16_ft-8xb64-coslr-100e_in1k_20220829-878a2f7f.json) |
## Citation
```bibtex
@InProceedings{Chen_2021_ICCV,
title = {An Empirical Study of Training Self-Supervised Vision Transformers},
author = {Chen, Xinlei and Xie, Saining and He, Kaiming},
booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
year = {2021}
}
```