_base_ = ['../mobileone-s1_8xb32_in1k.py']
model = dict(backbone=dict(deploy=True))
_base_ = ['../mobileone-s2_8xb32_in1k.py']
model = dict(backbone=dict(deploy=True))
_base_ = ['../mobileone-s3_8xb32_in1k.py']
model = dict(backbone=dict(deploy=True))
_base_ = ['../mobileone-s4_8xb32_in1k.py']
model = dict(backbone=dict(deploy=True))
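The four snippets above are the deploy-mode variants of the MobileOne configs: each inherits the corresponding training config and only flips `backbone.deploy` to `True`, i.e. the re-parameterized single-branch form used for inference. A hedged usage sketch, assuming the MobileOne backbone in mmpretrain exposes a `switch_to_deploy()` method to fuse the training-time branches (verify against your installed version):

```python
import torch
from mmpretrain import get_model

# Build the training-time (multi-branch) model, then fuse its branches into
# plain convolutions for fast inference. `switch_to_deploy` is the assumed
# re-parameterization entry point; check your mmpretrain release.
model = get_model('mobileone-s0_8xb32_in1k', pretrained=True)
model.backbone.switch_to_deploy()
model.eval()

with torch.no_grad():
    out = model(torch.rand(1, 3, 224, 224))
print(type(out))
```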
Collections:
- Name: MobileOne
Metadata:
Training Data: ImageNet-1k
Architecture:
- re-parameterization Convolution
- VGG-style Neural Network
- Depthwise Convolution
- Pointwise Convolution
Paper:
URL: https://arxiv.org/abs/2206.04040
Title: 'MobileOne: An Improved One millisecond Mobile Backbone'
README: configs/mobileone/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/v1.0.0rc1/configs/mobileone/metafile.yml
Version: v1.0.0rc1
Models:
- Name: mobileone-s0_8xb32_in1k
In Collection: MobileOne
Config: configs/mobileone/mobileone-s0_8xb32_in1k.py
Metadata:
FLOPs: 274136576 # 0.27G
Parameters: 2078504 # 2.08M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 71.34
Top 5 Accuracy: 89.87
Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s0_8xb32_in1k_20221110-0bc94952.pth
- Name: mobileone-s1_8xb32_in1k
In Collection: MobileOne
Config: configs/mobileone/mobileone-s1_8xb32_in1k.py
Metadata:
FLOPs: 823839744 # 0.82G
Parameters: 4764840 # 4.76M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 75.72
Top 5 Accuracy: 92.54
Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s1_8xb32_in1k_20221110-ceeef467.pth
- Name: mobileone-s2_8xb32_in1k
In Collection: MobileOne
Config: configs/mobileone/mobileone-s2_8xb32_in1k.py
Metadata:
FLOPs: 1296478848 # 1.30G
Parameters: 7808168 # 7.81M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 77.37
Top 5 Accuracy: 93.34
Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s2_8xb32_in1k_20221110-9c7ecb97.pth
- Name: mobileone-s3_8xb32_in1k
In Collection: MobileOne
Config: configs/mobileone/mobileone-s3_8xb32_in1k.py
Metadata:
FLOPs: 1893842944 # 1.89G
Parameters: 10078312 # 10.08M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 78.06
Top 5 Accuracy: 93.83
Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s3_8xb32_in1k_20221110-c95eb3bf.pth
- Name: mobileone-s4_8xb32_in1k
In Collection: MobileOne
Config: configs/mobileone/mobileone-s4_8xb32_in1k.py
Metadata:
FLOPs: 2979222528 # 2.98G
Parameters: 14838352 # 14.84M
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 79.69
Top 5 Accuracy: 94.46
Weights: https://download.openmmlab.com/mmclassification/v0/mobileone/mobileone-s4_8xb32_in1k_20221110-28d888cb.pth
_base_ = [
'../_base_/models/mobileone/mobileone_s0.py',
'../_base_/datasets/imagenet_bs32_pil_resize.py',
'../_base_/schedules/imagenet_bs256_coslr_coswd_300e.py',
'../_base_/default_runtime.py'
]
# schedule settings
optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.))
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
custom_hooks = [
dict(
type='EMAHook',
momentum=5e-4,
priority='ABOVE_NORMAL',
update_buffers=True)
]
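The `EMAHook` above maintains an exponential moving average of the model weights with a very small momentum (5e-4). A minimal sketch of the update rule, assuming the MMEngine convention `avg = (1 - momentum) * avg + momentum * param` (check your mmengine version for the exact semantics):

```python
import torch

def ema_update(avg_params, params, momentum=5e-4):
    """One EMA step: avg <- (1 - momentum) * avg + momentum * param."""
    with torch.no_grad():
        for avg, p in zip(avg_params, params):
            avg.mul_(1 - momentum).add_(p, alpha=momentum)

# toy usage: the averaged weights drift slowly toward the current weights
avg = [torch.zeros(3)]
cur = [torch.ones(3)]
for _ in range(1000):
    ema_update(avg, cur)
print(avg[0])
```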
_base_ = [
'../_base_/models/mobileone/mobileone_s1.py',
'../_base_/datasets/imagenet_bs32_pil_resize.py',
'../_base_/schedules/imagenet_bs256_coslr_coswd_300e.py',
'../_base_/default_runtime.py'
]
# schedule settings
optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.))
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
bgr_mean = _base_.data_preprocessor['mean'][::-1]
base_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='RandomResizedCrop', scale=224, backend='pillow'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=7,
magnitude_std=0.5,
hparams=dict(pad_val=[round(x) for x in bgr_mean])),
dict(type='PackInputs')
]
import copy # noqa: E402
# modify start epoch's RandomResizedCrop.scale to 160
train_pipeline_1e = copy.deepcopy(base_train_pipeline)
train_pipeline_1e[1]['scale'] = 160
train_pipeline_1e[3]['magnitude_level'] *= 0.1
_base_.train_dataloader.dataset.pipeline = train_pipeline_1e
# modify 37 epoch's RandomResizedCrop.scale to 192
train_pipeline_37e = copy.deepcopy(base_train_pipeline)
train_pipeline_37e[1]['scale'] = 192
train_pipeline_37e[3]['magnitude_level'] *= 0.2
# modify 112 epoch's RandomResizedCrop.scale to 224
train_pipeline_112e = copy.deepcopy(base_train_pipeline)
train_pipeline_112e[1]['scale'] = 224
train_pipeline_112e[3]['magnitude_level'] *= 0.3
custom_hooks = [
dict(
type='SwitchRecipeHook',
schedule=[
dict(action_epoch=37, pipeline=train_pipeline_37e),
dict(action_epoch=112, pipeline=train_pipeline_112e),
]),
dict(
type='EMAHook',
momentum=5e-4,
priority='ABOVE_NORMAL',
update_buffers=True)
]
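The `SwitchRecipeHook` above implements the progressive training recipe: images start at 160 px with a reduced RandAugment magnitude, switch to 192 px at epoch 37, and to 224 px at epoch 112. A minimal sketch of the switching mechanism (illustrative only, not mmpretrain's actual hook implementation):

```python
# Illustrative stand-in for a recipe-switching hook: swap the training
# pipeline when a scheduled epoch is reached.
class SimpleRecipeSwitcher:
    def __init__(self, schedule):
        # schedule: {action_epoch: pipeline}, e.g. {37: pipeline_192, 112: pipeline_224}
        self.schedule = schedule

    def before_train_epoch(self, dataset, epoch):
        # epoch is assumed 1-based in this sketch
        if epoch in self.schedule:
            dataset.pipeline = self.schedule[epoch]


class DummyDataset:
    pipeline = 'pipeline_160'


dataset = DummyDataset()
switcher = SimpleRecipeSwitcher({37: 'pipeline_192', 112: 'pipeline_224'})
for epoch in range(1, 301):
    switcher.before_train_epoch(dataset, epoch)
print(dataset.pipeline)  # 'pipeline_224' after epoch 112
```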
_base_ = [
'../_base_/models/mobileone/mobileone_s2.py',
'../_base_/datasets/imagenet_bs32_pil_resize.py',
'../_base_/schedules/imagenet_bs256_coslr_coswd_300e.py',
'../_base_/default_runtime.py'
]
# schedule settings
optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.))
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
import copy # noqa: E402
bgr_mean = _base_.data_preprocessor['mean'][::-1]
base_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='RandomResizedCrop', scale=224, backend='pillow'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=7,
magnitude_std=0.5,
hparams=dict(pad_val=[round(x) for x in bgr_mean])),
dict(type='PackInputs')
]
# modify start epoch RandomResizedCrop.scale to 160
# and RA.magnitude_level * 0.3
train_pipeline_1e = copy.deepcopy(base_train_pipeline)
train_pipeline_1e[1]['scale'] = 160
train_pipeline_1e[3]['magnitude_level'] *= 0.3
_base_.train_dataloader.dataset.pipeline = train_pipeline_1e
# modify 37 epoch's RandomResizedCrop.scale to 192
# and RA.magnitude_level * 0.7
train_pipeline_37e = copy.deepcopy(base_train_pipeline)
train_pipeline_37e[1]['scale'] = 192
train_pipeline_37e[3]['magnitude_level'] *= 0.7
# modify 112 epoch's RandomResizedCrop.scale to 224
# and RA.magnitude_level * 1.0
train_pipeline_112e = copy.deepcopy(base_train_pipeline)
train_pipeline_112e[1]['scale'] = 224
train_pipeline_112e[3]['magnitude_level'] *= 1.0
custom_hooks = [
dict(
type='SwitchRecipeHook',
schedule=[
dict(action_epoch=37, pipeline=train_pipeline_37e),
dict(action_epoch=112, pipeline=train_pipeline_112e),
]),
dict(
type='EMAHook',
momentum=5e-4,
priority='ABOVE_NORMAL',
update_buffers=True)
]
_base_ = [
'../_base_/models/mobileone/mobileone_s3.py',
'../_base_/datasets/imagenet_bs32_pil_resize.py',
'../_base_/schedules/imagenet_bs256_coslr_coswd_300e.py',
'../_base_/default_runtime.py'
]
# schedule settings
optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.))
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
import copy # noqa: E402
bgr_mean = _base_.data_preprocessor['mean'][::-1]
base_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='RandomResizedCrop', scale=224, backend='pillow'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=7,
magnitude_std=0.5,
hparams=dict(pad_val=[round(x) for x in bgr_mean])),
dict(type='PackInputs')
]
# modify start epoch RandomResizedCrop.scale to 160
# and RA.magnitude_level * 0.3
train_pipeline_1e = copy.deepcopy(base_train_pipeline)
train_pipeline_1e[1]['scale'] = 160
train_pipeline_1e[3]['magnitude_level'] *= 0.3
_base_.train_dataloader.dataset.pipeline = train_pipeline_1e
# modify 37 epoch's RandomResizedCrop.scale to 192
# and RA.magnitude_level * 0.7
train_pipeline_37e = copy.deepcopy(base_train_pipeline)
train_pipeline_37e[1]['scale'] = 192
train_pipeline_37e[3]['magnitude_level'] *= 0.7
# modify 112 epoch's RandomResizedCrop.scale to 224
# and RA.magnitude_level * 1.0
train_pipeline_112e = copy.deepcopy(base_train_pipeline)
train_pipeline_112e[1]['scale'] = 224
train_pipeline_112e[3]['magnitude_level'] *= 1.0
custom_hooks = [
dict(
type='SwitchRecipeHook',
schedule=[
dict(action_epoch=37, pipeline=train_pipeline_37e),
dict(action_epoch=112, pipeline=train_pipeline_112e),
]),
dict(
type='EMAHook',
momentum=5e-4,
priority='ABOVE_NORMAL',
update_buffers=True)
]
_base_ = [
'../_base_/models/mobileone/mobileone_s4.py',
'../_base_/datasets/imagenet_bs32_pil_resize.py',
'../_base_/schedules/imagenet_bs256_coslr_coswd_300e.py',
'../_base_/default_runtime.py'
]
# schedule settings
optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.))
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
bgr_mean = _base_.data_preprocessor['mean'][::-1]
base_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='RandomResizedCrop', scale=224, backend='pillow'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(
type='RandAugment',
policies='timm_increasing',
num_policies=2,
total_level=10,
magnitude_level=7,
magnitude_std=0.5,
hparams=dict(pad_val=[round(x) for x in bgr_mean])),
dict(type='PackInputs')
]
import copy # noqa: E402
# modify start epoch RandomResizedCrop.scale to 160
# and RA.magnitude_level * 0.3
train_pipeline_1e = copy.deepcopy(base_train_pipeline)
train_pipeline_1e[1]['scale'] = 160
train_pipeline_1e[3]['magnitude_level'] *= 0.3
_base_.train_dataloader.dataset.pipeline = train_pipeline_1e
# modify 37 epoch's RandomResizedCrop.scale to 192
# and RA.magnitude_level * 0.7
train_pipeline_37e = copy.deepcopy(base_train_pipeline)
train_pipeline_37e[1]['scale'] = 192
train_pipeline_37e[3]['magnitude_level'] *= 0.7
# modify 112 epoch's RandomResizedCrop.scale to 224
# and RA.magnitude_level * 1.0
train_pipeline_112e = copy.deepcopy(base_train_pipeline)
train_pipeline_112e[1]['scale'] = 224
train_pipeline_112e[3]['magnitude_level'] *= 1.0
custom_hooks = [
dict(
type='SwitchRecipeHook',
schedule=[
dict(action_epoch=37, pipeline=train_pipeline_37e),
dict(action_epoch=112, pipeline=train_pipeline_112e),
]),
dict(
type='EMAHook',
momentum=5e-4,
priority='ABOVE_NORMAL',
update_buffers=True)
]
# MobileViT
> [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178)
<!-- [ALGORITHM] -->
## Introduction
**MobileViT** is a light-weight network that combines the advantages of ViTs and CNNs: it uses the `InvertedResidual` blocks from [MobileNetV2](../mobilenet_v2/README.md) together with a `MobileViTBlock` built on [ViT](../vision_transformer/README.md)-style transformer layers to form a standard 5-stage model.
The MobileViTBlock treats transformers as convolutions to learn a global representation and combines it with regular convolution layers for local representation, yielding a block with a global receptive field. Unlike ViT, it needs no extra class token or position embeddings to model spatial relationships, so MobileViT can benefit from multi-scale inputs during training.
The paper also proposes a multi-scale training strategy that dynamically adjusts the batch size according to the image size, improving both training efficiency and final performance.
MobileViT is also shown to be effective in downstream tasks such as object detection and segmentation.
<div align=center>
<img src="https://user-images.githubusercontent.com/42952108/193229983-822bf025-89a6-4d95-b6be-76b7f1a62f2c.png" width="70%"/>
</div>
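To make the "transformers as convolutions" idea concrete, the sketch below shows the unfold, attend, fold pattern: the feature map is split into small patches, self-attention mixes information across patches that share the same intra-patch position, and the result is folded back to the original resolution. This is an illustrative toy module, not mmpretrain's `MobileViTBlock`:

```python
import torch
import torch.nn as nn

class TinyGlobalMixer(nn.Module):
    """Illustrative unfold -> transformer -> fold pattern (patch size 2x2)."""

    def __init__(self, dim, patch=2, num_heads=4):
        super().__init__()
        self.patch = patch
        self.attn = nn.TransformerEncoderLayer(
            d_model=dim, nhead=num_heads, batch_first=True)

    def forward(self, x):  # x: (B, C, H, W), H and W divisible by patch
        b, c, h, w = x.shape
        p = self.patch
        # group pixels by their position inside each patch, so attention
        # mixes information across patches (global), not within a patch
        x = x.reshape(b, c, h // p, p, w // p, p)
        x = x.permute(0, 3, 5, 2, 4, 1)                    # (B, p, p, H/p, W/p, C)
        x = x.reshape(b * p * p, (h // p) * (w // p), c)
        x = self.attn(x)                                   # attention over the patch grid
        x = x.reshape(b, p, p, h // p, w // p, c)
        x = x.permute(0, 5, 3, 1, 4, 2).reshape(b, c, h, w)
        return x

out = TinyGlobalMixer(dim=64)(torch.rand(2, 64, 32, 32))
print(out.shape)  # torch.Size([2, 64, 32, 32])
```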
## Abstract
<details>
<summary>Show the paper's abstract</summary>
<br>
Light-weight convolutional neural networks (CNNs) are the de-facto for mobile vision tasks. Their spatial inductive biases allow them to learn representations with fewer parameters across different vision tasks. However, these networks are spatially local. To learn global representations, self-attention-based vision transformers (ViTs) have been adopted. Unlike CNNs, ViTs are heavy-weight. In this paper, we ask the following question: is it possible to combine the strengths of CNNs and ViTs to build a light-weight and low latency network for mobile vision tasks? Towards this end, we introduce MobileViT, a light-weight and general-purpose vision transformer for mobile devices. MobileViT presents a different perspective for the global processing of information with transformers, i.e., transformers as convolutions. Our results show that MobileViT significantly outperforms CNN- and ViT-based networks across different tasks and datasets. On the ImageNet-1k dataset, MobileViT achieves top-1 accuracy of 78.4% with about 6 million parameters, which is 3.2% and 6.2% more accurate than MobileNetv3 (CNN-based) and DeIT (ViT-based) for a similar number of parameters. On the MS-COCO object detection task, MobileViT is 5.7% more accurate than MobileNetv3 for a similar number of parameters.
</br>
</details>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('mobilevit-small_3rdparty_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mobilevit-small_3rdparty_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/mobilevit/mobilevit-small_8xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-small_3rdparty_in1k_20221018-cb4f741c.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :---------------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :----------------------------------------: | :------------------------------------------------------------------------: |
| `mobilevit-small_3rdparty_in1k`\* | From scratch | 5.58 | 2.03 | 78.25 | 94.09 | [config](mobilevit-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-small_3rdparty_in1k_20221018-cb4f741c.pth) |
| `mobilevit-xsmall_3rdparty_in1k`\* | From scratch | 2.32 | 1.05 | 74.75 | 92.32 | [config](mobilevit-xsmall_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-xsmall_3rdparty_in1k_20221018-be39a6e7.pth) |
| `mobilevit-xxsmall_3rdparty_in1k`\* | From scratch | 1.27 | 0.42 | 69.02 | 88.91 | [config](mobilevit-xxsmall_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-xxsmall_3rdparty_in1k_20221018-77835605.pth) |
*Models with * are converted from the [official repo](https://github.com/apple/ml-cvnets). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{mehta2021mobilevit,
title={MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer},
author={Mehta, Sachin and Rastegari, Mohammad},
journal={arXiv preprint arXiv:2110.02178},
year={2021}
}
```
Collections:
- Name: MobileViT
Metadata:
Training Data: ImageNet-1k
Architecture:
- MobileViT Block
Paper:
URL: https://arxiv.org/abs/2110.02178
Title: 'MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer'
README: configs/mobilevit/README.md
Models:
- Name: mobilevit-small_3rdparty_in1k
Metadata:
FLOPs: 2030000000
Parameters: 5580000
In Collection: MobileViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 78.25
Top 5 Accuracy: 94.09
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-small_3rdparty_in1k_20221018-cb4f741c.pth
Config: configs/mobilevit/mobilevit-small_8xb128_in1k.py
Converted From:
Weights: https://docs-assets.developer.apple.com/ml-research/models/cvnets/classification/mobilevit_s.pt
Code: https://github.com/apple/ml-cvnets
- Name: mobilevit-xsmall_3rdparty_in1k
Metadata:
FLOPs: 1050000000
Parameters: 2320000
In Collection: MobileViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 74.75
Top 5 Accuracy: 92.32
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-xsmall_3rdparty_in1k_20221018-be39a6e7.pth
Config: configs/mobilevit/mobilevit-xsmall_8xb128_in1k.py
Converted From:
Weights: https://docs-assets.developer.apple.com/ml-research/models/cvnets/classification/mobilevit_xs.pt
Code: https://github.com/apple/ml-cvnets
- Name: mobilevit-xxsmall_3rdparty_in1k
Metadata:
FLOPs: 420000000
Parameters: 1270000
In Collection: MobileViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 69.02
Top 5 Accuracy: 88.91
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/mobilevit/mobilevit-xxsmall_3rdparty_in1k_20221018-77835605.pth
Config: configs/mobilevit/mobilevit-xxsmall_8xb128_in1k.py
Converted From:
Weights: https://docs-assets.developer.apple.com/ml-research/models/cvnets/classification/mobilevit_xxs.pt
Code: https://github.com/apple/ml-cvnets
_base_ = [
'../_base_/models/mobilevit/mobilevit_s.py',
'../_base_/datasets/imagenet_bs32.py',
'../_base_/default_runtime.py',
'../_base_/schedules/imagenet_bs256.py',
]
# no normalization, following the original implementation
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0, 0, 0],
std=[255, 255, 255],
# use bgr directly
to_rgb=False,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='ResizeEdge', scale=288, edge='short'),
dict(type='CenterCrop', crop_size=256),
dict(type='PackInputs'),
]
train_dataloader = dict(batch_size=128)
val_dataloader = dict(
batch_size=128,
dataset=dict(pipeline=test_pipeline),
)
test_dataloader = val_dataloader
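Because the preprocessor above uses `mean=[0, 0, 0]` and `std=[255, 255, 255]`, it only rescales pixel values from [0, 255] to [0, 1] and applies no channel-wise normalization, matching the original implementation. A quick check of the arithmetic:

```python
import torch

# (pixel - mean) / std with mean 0 and std 255 is just division by 255
pixels = torch.tensor([0., 127.5, 255.])
mean, std = 0.0, 255.0
print((pixels - mean) / std)  # tensor([0.0000, 0.5000, 1.0000])
```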
_base_ = [
'../_base_/models/mobilevit/mobilevit_xs.py',
'../_base_/datasets/imagenet_bs32.py',
'../_base_/default_runtime.py',
'../_base_/schedules/imagenet_bs256.py',
]
# no normalization, following the original implementation
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0, 0, 0],
std=[255, 255, 255],
# use bgr directly
to_rgb=False,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='ResizeEdge', scale=288, edge='short'),
dict(type='CenterCrop', crop_size=256),
dict(type='PackInputs'),
]
train_dataloader = dict(batch_size=128)
val_dataloader = dict(
batch_size=128,
dataset=dict(pipeline=test_pipeline),
)
test_dataloader = val_dataloader
_base_ = [
'../_base_/models/mobilevit/mobilevit_xxs.py',
'../_base_/datasets/imagenet_bs32.py',
'../_base_/default_runtime.py',
'../_base_/schedules/imagenet_bs256.py',
]
# no normalization, following the original implementation
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0, 0, 0],
std=[255, 255, 255],
# use bgr directly
to_rgb=False,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='ResizeEdge', scale=288, edge='short'),
dict(type='CenterCrop', crop_size=256),
dict(type='PackInputs'),
]
train_dataloader = dict(batch_size=128)
val_dataloader = dict(
batch_size=128,
dataset=dict(pipeline=test_pipeline),
)
test_dataloader = val_dataloader
# MoCoV2
> [Improved Baselines with Momentum Contrastive Learning](https://arxiv.org/abs/2003.04297)
<!-- [ALGORITHM] -->
## Abstract
Contrastive unsupervised learning has recently shown encouraging progress, e.g., in Momentum Contrast (MoCo) and SimCLR. In this note, we verify the effectiveness of two of SimCLR’s design improvements by implementing them in the MoCo framework. With simple modifications to MoCo—namely, using an MLP projection head and more data augmentation—we establish stronger baselines that outperform SimCLR and do not require large training batches. We hope this will make state-of-the-art unsupervised learning research more accessible.
<div align=center>
<img src="https://user-images.githubusercontent.com/36138628/149720067-b65e5736-d425-45b3-93ed-6f2427fc6217.png" width="500" />
</div>
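The key change highlighted in the abstract, replacing MoCo v1's single fully connected projection layer with a 2-layer MLP head, corresponds to the `MoCoV2Neck` used in the pretraining config below. A minimal sketch of such a projection head, assuming the 2048-d ResNet-50 pooled feature and the 128-d output from the config (illustrative, not mmpretrain's implementation):

```python
import torch
import torch.nn as nn

class MLPProjectionHead(nn.Module):
    """2-layer MLP projection head sketch: 2048 -> 2048 -> 128."""

    def __init__(self, in_channels=2048, hid_channels=2048, out_channels=128):
        super().__init__()
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.mlp = nn.Sequential(
            nn.Linear(in_channels, hid_channels),
            nn.ReLU(inplace=True),
            nn.Linear(hid_channels, out_channels),
        )

    def forward(self, x):               # x: (B, 2048, 7, 7) backbone feature map
        x = self.avgpool(x).flatten(1)  # global average pool -> (B, 2048)
        return self.mlp(x)              # (B, 128) embedding for the contrastive loss

feat = torch.rand(2, 2048, 7, 7)
print(MLPProjectionHead()(feat).shape)  # torch.Size([2, 128])
```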
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('resnet50_mocov2-pre_8xb32-linear-steplr-100e_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mocov2_resnet50_8xb32-coslr-200e_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py
```
Test:
```shell
python tools/test.py configs/mocov2/benchmarks/resnet50_8xb32-linear-steplr-100e_in1k.py https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/resnet50_linear-8xb32-steplr-100e_in1k/resnet50_linear-8xb32-steplr-100e_in1k_20220825-994c4128.pth
```
<!-- [TABS-END] -->
## Models and results
### Pretrained models
| Model | Params (M) | Flops (G) | Config | Download |
| :-------------------------------------- | :--------: | :-------: | :------------------------------------------------: | :------------------------------------------------------------------------------------------: |
| `mocov2_resnet50_8xb32-coslr-200e_in1k` | 55.93 | 4.11 | [config](mocov2_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/mocov2_resnet50_8xb32-coslr-200e_in1k_20220825-b6d23c86.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/mocov2_resnet50_8xb32-coslr-200e_in1k_20220825-b6d23c86.json) |
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Config | Download |
| :---------------------------------------- | :------------------------------------------: | :--------: | :-------: | :-------: | :----------------------------------------: | :-------------------------------------------: |
| `resnet50_mocov2-pre_8xb32-linear-steplr-100e_in1k` | [MOCOV2](https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/mocov2_resnet50_8xb32-coslr-200e_in1k_20220825-b6d23c86.pth) | 25.56 | 4.11 | 67.50 | [config](benchmarks/resnet50_8xb32-linear-steplr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/resnet50_linear-8xb32-steplr-100e_in1k/resnet50_linear-8xb32-steplr-100e_in1k_20220825-994c4128.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/resnet50_linear-8xb32-steplr-100e_in1k/resnet50_linear-8xb32-steplr-100e_in1k_20220825-994c4128.json) |
## Citation
```bibtex
@article{chen2020improved,
title={Improved baselines with momentum contrastive learning},
author={Chen, Xinlei and Fan, Haoqi and Girshick, Ross and He, Kaiming},
journal={arXiv preprint arXiv:2003.04297},
year={2020}
}
```
_base_ = [
'../../_base_/models/resnet50.py',
'../../_base_/datasets/imagenet_bs32_pil_resize.py',
'../../_base_/schedules/imagenet_sgd_steplr_100e.py',
'../../_base_/default_runtime.py',
]
model = dict(
backbone=dict(
frozen_stages=4,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=30., momentum=0.9, weight_decay=0.))
# runtime settings
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
Collections:
- Name: MoCoV2
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- SGD with Momentum
- Weight Decay
Training Resources: 8x V100 GPUs
Architecture:
- ResNet
- MoCo
Paper:
Title: Improved Baselines with Momentum Contrastive Learning
URL: https://arxiv.org/abs/2003.04297
README: configs/mocov2/README.md
Models:
- Name: mocov2_resnet50_8xb32-coslr-200e_in1k
Metadata:
Epochs: 200
Batch Size: 256
FLOPs: 4109364224
Parameters: 55933312
Training Data: ImageNet-1k
In Collection: MoCoV2
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/mocov2_resnet50_8xb32-coslr-200e_in1k_20220825-b6d23c86.pth
Config: configs/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py
Downstream:
- resnet50_mocov2-pre_8xb32-linear-steplr-100e_in1k
- Name: resnet50_mocov2-pre_8xb32-linear-steplr-100e_in1k
Metadata:
Epochs: 100
Batch Size: 256
FLOPs: 4109464576
Parameters: 25557032
Training Data: ImageNet-1k
In Collection: MoCoV2
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 67.5
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k/resnet50_linear-8xb32-steplr-100e_in1k/resnet50_linear-8xb32-steplr-100e_in1k_20220825-994c4128.pth
Config: configs/mocov2/benchmarks/resnet50_8xb32-linear-steplr-100e_in1k.py
_base_ = [
'../_base_/datasets/imagenet_bs32_mocov2.py',
'../_base_/schedules/imagenet_sgd_coslr_200e.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='MoCo',
queue_len=65536,
feat_dim=128,
momentum=0.001,
backbone=dict(
type='ResNet',
depth=50,
norm_cfg=dict(type='BN'),
zero_init_residual=False),
neck=dict(
type='MoCoV2Neck',
in_channels=2048,
hid_channels=2048,
out_channels=128,
with_avg_pool=True),
head=dict(
type='ContrastiveHead',
loss=dict(type='CrossEntropyLoss'),
temperature=0.2))
# keep only the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=256)
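The `ContrastiveHead` in the config above computes an InfoNCE-style loss with temperature 0.2: the positive-pair similarity and the similarities against the 65536-entry queue of negatives are scaled by the temperature and passed to cross-entropy with the positive at index 0. A hedged sketch of that computation (not the exact mmpretrain implementation):

```python
import torch
import torch.nn.functional as F

def info_nce_loss(q, k, queue, temperature=0.2):
    """InfoNCE sketch: q, k are L2-normalized (B, D); queue is (D, K) negatives."""
    pos = torch.einsum('nc,nc->n', q, k).unsqueeze(-1)   # (B, 1) positive logits
    neg = torch.einsum('nc,ck->nk', q, queue)            # (B, K) negative logits
    logits = torch.cat([pos, neg], dim=1) / temperature
    labels = torch.zeros(q.size(0), dtype=torch.long)    # positive is at index 0
    return F.cross_entropy(logits, labels)

q = F.normalize(torch.randn(4, 128), dim=1)
k = F.normalize(torch.randn(4, 128), dim=1)
queue = F.normalize(torch.randn(128, 65536), dim=0)
print(info_nce_loss(q, k, queue))
```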
# MoCoV3
> [An Empirical Study of Training Self-Supervised Vision Transformers](https://arxiv.org/abs/2104.02057)
<!-- [ALGORITHM] -->
## Abstract
This paper does not describe a novel method. Instead, it studies a straightforward, incremental, yet must-know baseline given the recent progress in computer vision: self-supervised learning for Vision Transformers (ViT). While the training recipes for standard convolutional networks have been highly mature and robust, the recipes for ViT are yet to be built, especially in the self-supervised scenarios where training becomes more challenging. In this work, we go back to basics and investigate the effects of several fundamental components for training self-supervised ViT. We observe that instability is a major issue that degrades accuracy, and it can be hidden by apparently good results. We reveal that these results are indeed partial failure, and they can be improved when training is made more stable. We benchmark ViT results in MoCo v3 and several other self-supervised frameworks, with ablations in various aspects. We discuss the currently positive evidence as well as challenges and open questions. We hope that this work will provide useful data points and experience for future research.
<div align=center>
<img src="https://user-images.githubusercontent.com/36138628/151305362-e6e8ea35-b3b8-45f6-8819-634e67083218.png" width="500" />
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('resnet50_mocov3-100e-pre_8xb128-linear-coslr-90e_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mocov3_resnet50_8xb512-amp-coslr-100e_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k.py
```
Test:
```shell
python tools/test.py configs/mocov3/benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-8f7d937e.pth
```
<!-- [TABS-END] -->
## Models and results
### Pretrained models
| Model | Params (M) | Flops (G) | Config | Download |
| :------------------------------------------------- | :--------: | :-------: | :-----------------------------------------------------------: | :--------------------------------------------------------------------: |
| `mocov3_resnet50_8xb512-amp-coslr-100e_in1k` | 68.01 | 4.11 | [config](mocov3_resnet50_8xb512-amp-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/mocov3_resnet50_8xb512-amp-coslr-100e_in1k_20220927-f1144efa.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/mocov3_resnet50_8xb512-amp-coslr-100e_in1k_20220927-f1144efa.json) |
| `mocov3_resnet50_8xb512-amp-coslr-300e_in1k` | 68.01 | 4.11 | [config](mocov3_resnet50_8xb512-amp-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/mocov3_resnet50_8xb512-amp-coslr-300e_in1k_20220927-1e4f3304.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/mocov3_resnet50_8xb512-amp-coslr-300e_in1k_20220927-1e4f3304.json) |
| `mocov3_resnet50_8xb512-amp-coslr-800e_in1k` | 68.01 | 4.11 | [config](mocov3_resnet50_8xb512-amp-coslr-800e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/mocov3_resnet50_8xb512-amp-coslr-800e_in1k_20220927-e043f51a.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/mocov3_resnet50_8xb512-amp-coslr-800e_in1k_20220927-e043f51a.json) |
| `mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k` | 84.27 | 4.61 | [config](mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k-224_20220826-08bc52f7.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k-224_20220826-08bc52f7.json) |
| `mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k` | 215.68 | 17.58 | [config](mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k-224_20220826-25213343.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k-224_20220826-25213343.json) |
| `mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k` | 652.78 | 61.60 | [config](mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k-224_20220829-9b88a442.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k-224_20220829-9b88a442.json) |
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Config | Download |
| :---------------------------------------- | :------------------------------------------: | :--------: | :-------: | :-------: | :----------------------------------------: | :-------------------------------------------: |
| `resnet50_mocov3-100e-pre_8xb128-linear-coslr-90e_in1k` | [MOCOV3 100-Epochs](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/mocov3_resnet50_8xb512-amp-coslr-100e_in1k_20220927-f1144efa.pth) | 25.56 | 4.11 | 69.60 | [config](benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-8f7d937e.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-8f7d937e.json) |
| `resnet50_mocov3-300e-pre_8xb128-linear-coslr-90e_in1k` | [MOCOV3 300-Epochs](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/mocov3_resnet50_8xb512-amp-coslr-300e_in1k_20220927-1e4f3304.pth) | 25.56 | 4.11 | 72.80 | [config](benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-d21ddac2.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-d21ddac2.json) |
| `resnet50_mocov3-800e-pre_8xb128-linear-coslr-90e_in1k` | [MOCOV3 800-Epochs](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/mocov3_resnet50_8xb512-amp-coslr-800e_in1k_20220927-e043f51a.pth) | 25.56 | 4.11 | 74.40 | [config](benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-0e97a483.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-0e97a483.json) |
| `vit-small-p16_mocov3-pre_8xb128-linear-coslr-90e_in1k` | [MOCOV3](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k-224_20220826-08bc52f7.pth) | 22.05 | 4.61 | 73.60 | [config](benchmarks/vit-small-p16_8xb128-linear-coslr-90e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/vit-small-p16_linear-8xb128-coslr-90e_in1k/vit-small-p16_linear-8xb128-coslr-90e_in1k_20220826-376674ef.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/vit-small-p16_linear-8xb128-coslr-90e_in1k/vit-small-p16_linear-8xb128-coslr-90e_in1k_20220826-376674ef.json) |
| `vit-base-p16_mocov3-pre_8xb64-coslr-150e_in1k` | [MOCOV3](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k-224_20220826-25213343.pth) | 86.57 | 17.58 | 83.00 | [config](benchmarks/vit-base-p16_8xb64-coslr-150e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/vit-base-p16_ft-8xb64-coslr-150e_in1k/vit-base-p16_ft-8xb64-coslr-150e_in1k_20220826-f1e6c442.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/vit-base-p16_ft-8xb64-coslr-150e_in1k/vit-base-p16_ft-8xb64-coslr-150e_in1k_20220826-f1e6c442.json) |
| `vit-base-p16_mocov3-pre_8xb128-linear-coslr-90e_in1k` | [MOCOV3](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k-224_20220826-25213343.pth) | 86.57 | 17.58 | 76.90 | [config](benchmarks/vit-base-p16_8xb128-linear-coslr-90e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/vit-base-p16_linear-8xb128-coslr-90e_in1k/vit-base-p16_linear-8xb128-coslr-90e_in1k_20220826-83be7758.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/vit-base-p16_linear-8xb128-coslr-90e_in1k/vit-base-p16_linear-8xb128-coslr-90e_in1k_20220826-83be7758.json) |
| `vit-large-p16_mocov3-pre_8xb64-coslr-100e_in1k` | [MOCOV3](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k-224_20220829-9b88a442.pth) | 304.33 | 61.60 | 83.70 | [config](benchmarks/vit-large-p16_8xb64-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/vit-large-p16_ft-8xb64-coslr-100e_in1k/vit-large-p16_ft-8xb64-coslr-100e_in1k_20220829-878a2f7f.pth) \| [log](https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/vit-large-p16_ft-8xb64-coslr-100e_in1k/vit-large-p16_ft-8xb64-coslr-100e_in1k_20220829-878a2f7f.json) |
## Citation
```bibtex
@InProceedings{Chen_2021_ICCV,
title = {An Empirical Study of Training Self-Supervised Vision Transformers},
author = {Chen, Xinlei and Xie, Saining and He, Kaiming},
booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
year = {2021}
}
```