_base_ = [
'../_base_/models/swin_transformer_v2/base_256.py',
'../_base_/datasets/imagenet21k_bs128.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
backbone=dict(img_size=192, window_size=[12, 12, 12, 6]),
head=dict(num_classes=21841),
)
# dataset settings
data_preprocessor = dict(num_classes=21841)
_base_['train_pipeline'][1]['scale'] = 192 # RandomResizedCrop
_base_['test_pipeline'][1]['scale'] = 219 # ResizeEdge
_base_['test_pipeline'][2]['crop_size'] = 192 # CenterCrop
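# Note: the 219 resize edge above presumably keeps the usual 0.875 crop ratio
# (219 ≈ 192 / 0.875) before the 192x192 center crop.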
# Only for evaluation
_base_ = [
'../_base_/models/swin_transformer_v2/large_256.py',
'../_base_/datasets/imagenet_bs64_swin_256.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
model = dict(
type='ImageClassifier',
backbone=dict(
window_size=[16, 16, 16, 8], pretrained_window_sizes=[12, 12, 12, 6]),
)
# Only for evaluation
_base_ = [
'../_base_/models/swin_transformer_v2/large_384.py',
'../_base_/datasets/imagenet_bs64_swin_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
model = dict(
type='ImageClassifier',
backbone=dict(
img_size=384,
window_size=[24, 24, 24, 12],
pretrained_window_sizes=[12, 12, 12, 6]),
)
_base_ = [
'../_base_/models/swin_transformer_v2/small_256.py',
'../_base_/datasets/imagenet_bs64_swin_256.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
model = dict(backbone=dict(window_size=[16, 16, 16, 8]))
_base_ = [
'../_base_/models/swin_transformer_v2/small_256.py',
'../_base_/datasets/imagenet_bs64_swin_256.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
_base_ = [
'../_base_/models/swin_transformer_v2/tiny_256.py',
'../_base_/datasets/imagenet_bs64_swin_256.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
model = dict(backbone=dict(window_size=[16, 16, 16, 8]))
_base_ = [
'../_base_/models/swin_transformer_v2/tiny_256.py',
'../_base_/datasets/imagenet_bs64_swin_256.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# Tokens-to-Token ViT
> [Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet](https://arxiv.org/abs/2101.11986)
<!-- [ALGORITHM] -->
## Abstract
Transformers, which are popular for language modeling, have been explored for solving vision tasks recently, e.g., the Vision Transformer (ViT) for image classification. The ViT model splits each image into a sequence of tokens with fixed length and then applies multiple Transformer layers to model their global relation for classification. However, ViT achieves inferior performance to CNNs when trained from scratch on a midsize dataset like ImageNet. We find it is because: 1) the simple tokenization of input images fails to model the important local structure such as edges and lines among neighboring pixels, leading to low training sample efficiency; 2) the redundant attention backbone design of ViT leads to limited feature richness for fixed computation budgets and limited training samples. To overcome such limitations, we propose a new Tokens-To-Token Vision Transformer (T2T-ViT), which incorporates 1) a layer-wise Tokens-to-Token (T2T) transformation to progressively structurize the image to tokens by recursively aggregating neighboring Tokens into one Token (Tokens-to-Token), such that local structure represented by surrounding tokens can be modeled and tokens length can be reduced; 2) an efficient backbone with a deep-narrow structure for vision transformer motivated by CNN architecture design after empirical study. Notably, T2T-ViT reduces the parameter count and MACs of vanilla ViT by half, while achieving more than 3.0% improvement when trained from scratch on ImageNet. It also outperforms ResNets and achieves comparable performance with MobileNets by directly training on ImageNet. For example, T2T-ViT with comparable size to ResNet50 (21.5M parameters) can achieve 83.3% top1 accuracy in image resolution 384×384 on ImageNet.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/142578381-e9040610-05d9-457c-8bf5-01c2fa94add2.png" width="60%"/>
</div>
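The layer-wise T2T transformation described above can be viewed as a soft split: each output token is the concatenation of a token with its spatial neighbors, so the sequence gets shorter at every step. Below is a minimal PyTorch sketch of a single aggregation step; it assumes a 3x3 kernel with stride 2, omits the small transformer layers the paper interleaves between splits, and the helper name `tokens_to_token` is illustrative rather than an actual MMPretrain API.

```python
# Illustrative sketch of one Tokens-to-Token soft split (not the library code).
import torch
import torch.nn.functional as F


def tokens_to_token(tokens, hw, kernel=3, stride=2, padding=1):
    """Aggregate each token with its neighbors and shorten the sequence.

    tokens: (B, N, C) with N == hw[0] * hw[1].
    Returns the shorter sequence (B, N', C * kernel * kernel) and its new size.
    """
    B, N, C = tokens.shape
    h, w = hw
    feat = tokens.transpose(1, 2).reshape(B, C, h, w)  # back to a 2D map
    # unfold concatenates every kernel x kernel neighborhood along channels,
    # so one output token models the local structure around it.
    patches = F.unfold(feat, kernel, stride=stride, padding=padding)
    new_h = (h + 2 * padding - kernel) // stride + 1
    new_w = (w + 2 * padding - kernel) // stride + 1
    return patches.transpose(1, 2), (new_h, new_w)


x, size = tokens_to_token(torch.rand(1, 56 * 56, 64), (56, 56))
print(x.shape, size)  # sequence length drops from 3136 to 784
```

In the full model, each shortened sequence is processed by a lightweight transformer before the next split, and the final sequence is fed into the deep-narrow backbone.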
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('t2t-vit-t-14_8xb64_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('t2t-vit-t-14_8xb64_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py
```
Test:
```shell
python tools/test.py configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_8xb64_in1k_20211220-f7378dd5.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------------------------ | :----------: | :--------: | :-------: | :-------: | :-------: | :----------------------------------: | :----------------------------------------------------------------------------------------: |
| `t2t-vit-t-14_8xb64_in1k` | From scratch | 21.47 | 4.34 | 81.83 | 95.84 | [config](t2t-vit-t-14_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_8xb64_in1k_20211220-f7378dd5.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_8xb64_in1k_20211220-f7378dd5.json) |
| `t2t-vit-t-19_8xb64_in1k` | From scratch | 39.08 | 7.80 | 82.63 | 96.18 | [config](t2t-vit-t-19_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_8xb64_in1k_20211214-7f5e3aaf.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_8xb64_in1k_20211214-7f5e3aaf.json) |
| `t2t-vit-t-24_8xb64_in1k` | From scratch | 64.00 | 12.69 | 82.71 | 96.09 | [config](t2t-vit-t-24_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_8xb64_in1k_20211214-b2a68ae3.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_8xb64_in1k_20211214-b2a68ae3.json) |
## Citation
```bibtex
@article{yuan2021tokens,
title={Tokens-to-token vit: Training vision transformers from scratch on imagenet},
author={Yuan, Li and Chen, Yunpeng and Wang, Tao and Yu, Weihao and Shi, Yujun and Tay, Francis EH and Feng, Jiashi and Yan, Shuicheng},
journal={arXiv preprint arXiv:2101.11986},
year={2021}
}
```
Collections:
- Name: Tokens-to-Token ViT
Metadata:
Training Data: ImageNet-1k
Architecture:
- Layer Normalization
- Scaled Dot-Product Attention
- Attention Dropout
- Dropout
- Tokens to Token
Paper:
URL: https://arxiv.org/abs/2101.11986
Title: "Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet"
README: configs/t2t_vit/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/v0.17.0/mmcls/models/backbones/t2t_vit.py
Version: v0.17.0
Models:
- Name: t2t-vit-t-14_8xb64_in1k
Metadata:
FLOPs: 4340000000
Parameters: 21470000
In Collection: Tokens-to-Token ViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.83
Top 5 Accuracy: 95.84
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_8xb64_in1k_20211220-f7378dd5.pth
Config: configs/t2t_vit/t2t-vit-t-14_8xb64_in1k.py
- Name: t2t-vit-t-19_8xb64_in1k
Metadata:
FLOPs: 7800000000
Parameters: 39080000
In Collection: Tokens-to-Token ViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.63
Top 5 Accuracy: 96.18
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_8xb64_in1k_20211214-7f5e3aaf.pth
Config: configs/t2t_vit/t2t-vit-t-19_8xb64_in1k.py
- Name: t2t-vit-t-24_8xb64_in1k
Metadata:
FLOPs: 12690000000
Parameters: 64000000
In Collection: Tokens-to-Token ViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.71
Top 5 Accuracy: 96.09
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_8xb64_in1k_20211214-b2a68ae3.pth
Config: configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py
_base_ = [
'../_base_/models/t2t-vit-t-14.py',
'../_base_/datasets/imagenet_bs64_t2t_224.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(type='AdamW', lr=5e-4, weight_decay=0.05),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={'cls_token': dict(decay_mult=0.0)},
),
)
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-6,
by_epoch=True,
begin=0,
end=10,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(
type='CosineAnnealingLR',
T_max=290,
eta_min=1e-5,
by_epoch=True,
begin=10,
end=300),
# cool down learning rate scheduler
dict(type='ConstantLR', factor=0.1, by_epoch=True, begin=300, end=310),
]
train_cfg = dict(by_epoch=True, max_epochs=310, val_interval=1)
val_cfg = dict()
test_cfg = dict()
# runtime settings
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
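# EMAHook keeps an exponential moving average of the model weights during
# training; the small 4e-5 momentum makes the average track the current
# weights slowly.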
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (8 GPUs) x (64 samples per GPU)
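# If auto scaling is enabled (e.g. via the train script's `--auto-scale-lr`
# option, where supported), the LR is expected to be scaled by
# actual_batch_size / base_batch_size, so e.g. 16 GPUs x 64 samples per GPU
# would roughly double the 5e-4 LR above.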
auto_scale_lr = dict(base_batch_size=512)
_base_ = [
'../_base_/models/t2t-vit-t-19.py',
'../_base_/datasets/imagenet_bs64_t2t_224.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(type='AdamW', lr=5e-4, weight_decay=0.065),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={'cls_token': dict(decay_mult=0.0)},
),
)
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-6,
by_epoch=True,
begin=0,
end=10,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(
type='CosineAnnealingLR',
T_max=290,
eta_min=1e-5,
by_epoch=True,
begin=10,
end=300),
# cool down learning rate scheduler
dict(type='ConstantLR', factor=0.1, by_epoch=True, begin=300, end=310),
]
train_cfg = dict(by_epoch=True, max_epochs=310, val_interval=1)
val_cfg = dict()
test_cfg = dict()
# runtime settings
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (8 GPUs) x (64 samples per GPU)
auto_scale_lr = dict(base_batch_size=512)
_base_ = [
'../_base_/models/t2t-vit-t-24.py',
'../_base_/datasets/imagenet_bs64_t2t_224.py',
'../_base_/default_runtime.py',
]
# schedule settings
optim_wrapper = dict(
optimizer=dict(type='AdamW', lr=5e-4, weight_decay=0.065),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={'cls_token': dict(decay_mult=0.0)},
),
)
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-6,
by_epoch=True,
begin=0,
end=10,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(
type='CosineAnnealingLR',
T_max=290,
eta_min=1e-5,
by_epoch=True,
begin=10,
end=300),
# cool down learning rate scheduler
dict(type='ConstantLR', factor=0.1, by_epoch=True, begin=300, end=310),
]
train_cfg = dict(by_epoch=True, max_epochs=310, val_interval=1)
val_cfg = dict()
test_cfg = dict()
# runtime settings
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
# base_batch_size = (8 GPUs) x (64 samples per GPU)
auto_scale_lr = dict(base_batch_size=512)
# TinyViT
> [TinyViT: Fast Pretraining Distillation for Small Vision Transformers](https://arxiv.org/abs/2207.10666)
<!-- [ALGORITHM] -->
## Abstract
Vision transformer (ViT) recently has drawn great attention in computer vision due to its remarkable model capability. However, most prevailing ViT models suffer from huge number of parameters, restricting their applicability on devices with limited resources. To alleviate this issue, we propose TinyViT, a new family of tiny and efficient small vision transformers pretrained on large-scale datasets with our proposed fast distillation framework. The central idea is to transfer knowledge from large pretrained models to small ones, while enabling small models to get the dividends of massive pretraining data. More specifically, we apply distillation during pretraining for knowledge transfer. The logits of large teacher models are sparsified and stored in disk in advance to save the memory cost and computation overheads. The tiny student transformers are automatically scaled down from a large pretrained model with computation and parameter constraints. Comprehensive experiments demonstrate the efficacy of TinyViT. It achieves a top-1 accuracy of 84.8% on ImageNet-1k with only 21M parameters, being comparable to SwinB pretrained on ImageNet-21k while using 4.2 times fewer parameters. Moreover, increasing image resolutions, TinyViT can reach 86.5% accuracy, being slightly better than Swin-L while using only 11% parameters. Last but not the least, we demonstrate a good transfer ability of TinyViT on various downstream tasks.
<div align=center>
<img src="https://github.com/microsoft/Cream/raw/main/TinyViT/.figure/framework.png" width="100%">
</div>
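The fast-distillation framework above pre-computes teacher logits, sparsifies them, and stores them on disk so the large teacher never has to run during student training. The sketch below illustrates that bookkeeping with a simple top-k sparsification; the helper names (`sparsify_logits`, `densify_logits`) and the file path are illustrative assumptions, not the official TinyViT pipeline.

```python
# Illustrative top-k sparsification of teacher logits for offline distillation.
import torch


def sparsify_logits(logits, k=10):
    """Keep only the top-k logits per sample to cut storage cost."""
    values, indices = logits.topk(k, dim=-1)
    return values.half(), indices.to(torch.int32)


def densify_logits(values, indices, num_classes, fill=-1e4):
    """Rebuild approximate dense logits for the distillation loss."""
    dense = torch.full((values.shape[0], num_classes), fill)
    dense.scatter_(1, indices.long(), values.float())
    return dense


teacher_logits = torch.randn(4, 21841)  # e.g. an ImageNet-21k teacher
values, indices = sparsify_logits(teacher_logits)
torch.save({'values': values, 'indices': indices}, 'teacher_logits.pt')

# During student training, the cached sparse logits are loaded and turned back
# into soft targets; all non-top-k classes get a negligible probability.
cached = torch.load('teacher_logits.pt')
soft_targets = densify_logits(cached['values'], cached['indices'], 21841).softmax(dim=-1)
```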
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('tinyvit-5m_3rdparty_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('tinyvit-5m_3rdparty_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/tinyvit/tinyvit-5m_8xb256_in1k.py https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-5m_3rdparty_in1k_20221021-62cb5abf.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :--------------------------------------------- | :------------------: | :--------: | :-------: | :-------: | :-------: | :---------------------------------------------: | :------------------------------------------------: |
| `tinyvit-5m_3rdparty_in1k`\* | From scratch | 5.39 | 1.29 | 79.02 | 94.74 | [config](tinyvit-5m_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-5m_3rdparty_in1k_20221021-62cb5abf.pth) |
| `tinyvit-5m_in21k-distill-pre_3rdparty_in1k`\* | ImageNet-21k DISTILL | 5.39 | 1.29 | 80.71 | 95.57 | [config](tinyvit-5m-distill_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-5m_in21k-distill-pre_3rdparty_in1k_20221021-d4b010a8.pth) |
| `tinyvit-11m_3rdparty_in1k`\* | From scratch | 11.00 | 2.05 | 81.44 | 95.79 | [config](tinyvit-11m_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-11m_3rdparty_in1k_20221021-11ccef16.pth) |
| `tinyvit-11m_in21k-distill-pre_3rdparty_in1k`\* | ImageNet-21k DISTILL | 11.00 | 2.05 | 83.19 | 96.53 | [config](tinyvit-11m-distill_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-11m_in21k-distill-pre_3rdparty_in1k_20221021-5d3bc0dc.pth) |
| `tinyvit-21m_3rdparty_in1k`\* | From scratch | 21.20 | 4.30 | 83.08 | 96.58 | [config](tinyvit-21m_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_3rdparty_in1k_20221021-5346ba34.pth) |
| `tinyvit-21m_in21k-distill-pre_3rdparty_in1k`\* | ImageNet-21k DISTILL | 21.20 | 4.30 | 84.85 | 97.27 | [config](tinyvit-21m-distill_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_in21k-distill-pre_3rdparty_in1k_20221021-3d9b30a2.pth) |
| `tinyvit-21m_in21k-distill-pre_3rdparty_in1k-384px`\* | ImageNet-21k DISTILL | 21.23 | 13.85 | 86.21 | 97.77 | [config](tinyvit-21m-distill_8xb256_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_in21k-distill-pre_3rdparty_in1k-384px_20221021-65be6b3f.pth) |
| `tinyvit-21m_in21k-distill-pre_3rdparty_in1k-512px`\* | ImageNet-21k DISTILL | 21.27 | 27.15 | 86.44 | 97.89 | [config](tinyvit-21m-distill_8xb256_in1k-512px.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_in21k-distill-pre_3rdparty_in1k-512px_20221021-e42a9bea.pth) |
*Models with \* are converted from the [official repo](https://github.com/microsoft/Cream/tree/main/TinyViT). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@InProceedings{tiny_vit,
title={TinyViT: Fast Pretraining Distillation for Small Vision Transformers},
author={Wu, Kan and Zhang, Jinnian and Peng, Houwen and Liu, Mengchen and Xiao, Bin and Fu, Jianlong and Yuan, Lu},
booktitle={European conference on computer vision (ECCV)},
year={2022}
}
```
Collections:
- Name: TinyViT
Metadata:
Training Data: ImageNet-1k
Architecture:
- MBConv
- Window Multi-head Self-Attention
Paper:
Title: 'TinyViT: Fast Pretraining Distillation for Small Vision Transformers'
URL: https://arxiv.org/abs/2207.10666
README: configs/tinyvit/README.md
Code:
Version: v1.0.0rc1
URL: https://github.com/open-mmlab/mmpretrain/blob/v0.23.2/mmcls/models/backbones/tinyvit.py
Models:
- Name: tinyvit-5m_3rdparty_in1k
Metadata:
FLOPs: 1286655360
Parameters: 5392764
Training Data: ImageNet-1k
In Collection: TinyViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 79.02
Top 5 Accuracy: 94.74
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-5m_3rdparty_in1k_20221021-62cb5abf.pth
Config: configs/tinyvit/tinyvit-5m_8xb256_in1k.py
Converted From:
Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_5m_1k.pth
Code: https://github.com/microsoft/Cream/tree/main/TinyViT
- Name: tinyvit-5m_in21k-distill-pre_3rdparty_in1k
Metadata:
FLOPs: 1286655360
Parameters: 5392764
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: TinyViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 80.71
Top 5 Accuracy: 95.57
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-5m_in21k-distill-pre_3rdparty_in1k_20221021-d4b010a8.pth
Config: configs/tinyvit/tinyvit-5m-distill_8xb256_in1k.py
Converted From:
Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_5m_22kto1k_distill.pth
Code: https://github.com/microsoft/Cream/tree/main/TinyViT
- Name: tinyvit-11m_3rdparty_in1k
Metadata:
FLOPs: 2050033664
Parameters: 10996972
Training Data: ImageNet-1k
In Collection: TinyViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.44
Top 5 Accuracy: 95.79
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-11m_3rdparty_in1k_20221021-11ccef16.pth
Config: configs/tinyvit/tinyvit-11m_8xb256_in1k.py
Converted From:
Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_11m_1k.pth
Code: https://github.com/microsoft/Cream/tree/main/TinyViT
- Name: tinyvit-11m_in21k-distill-pre_3rdparty_in1k
Metadata:
FLOPs: 2050033664
Parameters: 10996972
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: TinyViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.19
Top 5 Accuracy: 96.53
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-11m_in21k-distill-pre_3rdparty_in1k_20221021-5d3bc0dc.pth
Config: configs/tinyvit/tinyvit-11m-distill_8xb256_in1k.py
Converted From:
Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_11m_22kto1k_distill.pth
Code: https://github.com/microsoft/Cream/tree/main/TinyViT
- Name: tinyvit-21m_3rdparty_in1k
Metadata:
FLOPs: 4301124096
Parameters: 21198568
Training Data: ImageNet-1k
In Collection: TinyViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.08
Top 5 Accuracy: 96.58
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_3rdparty_in1k_20221021-5346ba34.pth
Config: configs/tinyvit/tinyvit-21m_8xb256_in1k.py
Converted From:
Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_1k.pth
Code: https://github.com/microsoft/Cream/tree/main/TinyViT
- Name: tinyvit-21m_in21k-distill-pre_3rdparty_in1k
Metadata:
FLOPs: 4301124096
Parameters: 21198568
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: TinyViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 84.85
Top 5 Accuracy: 97.27
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_in21k-distill-pre_3rdparty_in1k_20221021-3d9b30a2.pth
Config: configs/tinyvit/tinyvit-21m-distill_8xb256_in1k.py
Converted From:
Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_22kto1k_distill.pth
Code: https://github.com/microsoft/Cream/tree/main/TinyViT
- Name: tinyvit-21m_in21k-distill-pre_3rdparty_in1k-384px
Metadata:
FLOPs: 13848250176
Parameters: 21230488
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: TinyViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 86.21
Top 5 Accuracy: 97.77
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_in21k-distill-pre_3rdparty_in1k-384px_20221021-65be6b3f.pth
Config: configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-384px.py
Converted From:
Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_22kto1k_384_distill.pth
Code: https://github.com/microsoft/Cream/tree/main/TinyViT
- Name: tinyvit-21m_in21k-distill-pre_3rdparty_in1k-512px
Metadata:
FLOPs: 27151420224
Parameters: 21268120
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: TinyViT
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 86.44
Top 5 Accuracy: 97.89
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_in21k-distill-pre_3rdparty_in1k-512px_20221021-e42a9bea.pth
Config: configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-512px.py
Converted From:
Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_22kto1k_512_distill.pth
Code: https://github.com/microsoft/Cream/tree/main/TinyViT
_base_ = [
'./tinyvit-11m_8xb256_in1k.py',
]
_base_ = [
'../_base_/datasets/imagenet_bs32_pil_bicubic.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
'../_base_/models/tinyvit/tinyvit-11m.py',
]
_base_ = [
'../_base_/datasets/imagenet_bs32_pil_bicubic.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
'../_base_/models/tinyvit/tinyvit-21m.py',
]
# model settings
model = dict(
backbone=dict(
img_size=(384, 384),
window_size=[12, 12, 24, 12],
drop_path_rate=0.1,
))
# data settings
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
scale=(384, 384),
backend='pillow',
interpolation='bicubic'),
dict(type='PackInputs'),
]
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
_base_ = [
'../_base_/datasets/imagenet_bs32_pil_bicubic.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
'../_base_/models/tinyvit/tinyvit-21m.py',
]
# model settings
model = dict(
backbone=dict(
img_size=(512, 512),
window_size=[16, 16, 32, 16],
drop_path_rate=0.1,
))
# data settings
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='Resize',
scale=(512, 512),
backend='pillow',
interpolation='bicubic'),
dict(type='PackInputs'),
]
val_dataloader = dict(batch_size=16, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
_base_ = [
'./tinyvit-21m_8xb256_in1k.py',
]
_base_ = [
'../_base_/datasets/imagenet_bs32_pil_bicubic.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
'../_base_/models/tinyvit/tinyvit-21m.py',
]