add part code

495d9ed9 · limm · 59b09903 · 495d9ed9 · 495d9ed9 · 495d9ed9
Commit 495d9ed9 authored Jun 24, 2025 by limm
20 changed files
--- a/configs/vision_transformer/vit-base-p16_8xb64-lora_in1k-384px.py
+++ b/configs/vision_transformer/vit-base-p16_8xb64-lora_in1k-384px.py
+_base_ = [
+    '../_base_/datasets/imagenet_bs64_pil_resize.py',
+    '../_base_/schedules/imagenet_bs4096_AdamW.py',
+    '../_base_/default_runtime.py'
+]
+
+# model setting
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='LoRAModel',
+        module=dict(
+            type='VisionTransformer',
+            arch='b',
+            img_size=384,
+            patch_size=16,
+            drop_rate=0.1,
+            init_cfg=dict(type='Pretrained', checkpoint='',
+                          prefix='backbone')),
+        alpha=16,
+        rank=16,
+        drop_rate=0.1,
+        targets=[dict(type='qkv')]),
+    neck=None,
+    head=dict(
+        type='VisionTransformerClsHead',
+        num_classes=1000,
+        in_channels=768,
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1,
+            mode='classy_vision'),
+        init_cfg=[dict(type='TruncNormal', layer='Linear', std=2e-5)],
+    ))
+
+# dataset setting
+data_preprocessor = dict(
+    mean=[127.5, 127.5, 127.5],
+    std=[127.5, 127.5, 127.5],
+    # convert image from BGR to RGB
+    to_rgb=True,
+)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='RandomResizedCrop', scale=384, backend='pillow'),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(type='PackInputs'),
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='ResizeEdge', scale=384, edge='short', backend='pillow'),
+    dict(type='CenterCrop', crop_size=384),
+    dict(type='PackInputs'),
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1e-4,
+        by_epoch=True,
+        begin=0,
+        end=5,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=45,
+        by_epoch=True,
+        begin=5,
+        end=50,
+        eta_min=1e-6,
+        convert_to_iter_based=True)
+]
+
+train_cfg = dict(by_epoch=True, max_epochs=50)
+default_hooks = dict(
+    # save checkpoint per epoch.
+    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
+
+# schedule setting
+optim_wrapper = dict(clip_grad=dict(max_norm=1.0))
--- a/configs/vision_transformer/vit-base-p32_64xb64_in1k-384px.py
+++ b/configs/vision_transformer/vit-base-p32_64xb64_in1k-384px.py
+# 384px variant built on the vit-base-p32 model base. Model, dataset,
+# schedule and runtime defaults come from the base files below; this file
+# only overrides the input size, normalization and data pipelines.
+_base_ = [
+    '../_base_/models/vit-base-p32.py',
+    '../_base_/datasets/imagenet_bs64_pil_resize.py',
+    '../_base_/schedules/imagenet_bs4096_AdamW.py',
+    '../_base_/default_runtime.py'
+]
+
+# model setting
+# Only the backbone input size is overridden here.
+model = dict(backbone=dict(img_size=384))
+
+# dataset setting
+data_preprocessor = dict(
+    mean=[127.5, 127.5, 127.5],
+    std=[127.5, 127.5, 127.5],
+    # convert image from BGR to RGB
+    to_rgb=True,
+)
+
+# Training pipeline: random 384px crop plus horizontal flip.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='RandomResizedCrop', scale=384, backend='pillow'),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(type='PackInputs'),
+]
+
+# Evaluation pipeline: resize the short edge to 384, then center-crop 384.
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='ResizeEdge', scale=384, edge='short', backend='pillow'),
+    dict(type='CenterCrop', crop_size=384),
+    dict(type='PackInputs'),
+]
+
+# Validation and testing share the same evaluation pipeline.
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+
+# schedule setting
+# Clip gradients to a maximum norm of 1.0.
+optim_wrapper = dict(clip_grad=dict(max_norm=1.0))
--- a/configs/vision_transformer/vit-base-p32_64xb64_in1k.py
+++ b/configs/vision_transformer/vit-base-p32_64xb64_in1k.py
+# Config built on the vit-base-p32 model base with the auto-augment dataset
+# base; only the head width, the batch augment and gradient clipping are
+# overridden here.
+_base_ = [
+    '../_base_/models/vit-base-p32.py',
+    '../_base_/datasets/imagenet_bs64_pil_resize_autoaug.py',
+    '../_base_/schedules/imagenet_bs4096_AdamW.py',
+    '../_base_/default_runtime.py'
+]
+
+# model setting
+model = dict(
+    # Set the classification head's hidden dimension.
+    head=dict(hidden_dim=3072),
+    # Apply Mixup (alpha=0.2) as a training-time augment.
+    train_cfg=dict(augments=dict(type='Mixup', alpha=0.2)),
+)
+
+# schedule setting
+# Clip gradients to a maximum norm of 1.0.
+optim_wrapper = dict(clip_grad=dict(max_norm=1.0))
--- a/configs/vision_transformer/vit-large-p16_64xb64_in1k-384px.py
+++ b/configs/vision_transformer/vit-large-p16_64xb64_in1k-384px.py
+# 384px variant built on the vit-large-p16 model base. Model, dataset,
+# schedule and runtime defaults come from the base files below; this file
+# only overrides the input size, normalization and data pipelines.
+_base_ = [
+    '../_base_/models/vit-large-p16.py',
+    '../_base_/datasets/imagenet_bs64_pil_resize.py',
+    '../_base_/schedules/imagenet_bs4096_AdamW.py',
+    '../_base_/default_runtime.py'
+]
+
+# model setting
+# Only the backbone input size is overridden here.
+model = dict(backbone=dict(img_size=384))
+
+# dataset setting
+data_preprocessor = dict(
+    mean=[127.5, 127.5, 127.5],
+    std=[127.5, 127.5, 127.5],
+    # convert image from BGR to RGB
+    to_rgb=True,
+)
+
+# Training pipeline: random 384px crop plus horizontal flip.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='RandomResizedCrop', scale=384, backend='pillow'),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(type='PackInputs'),
+]
+
+# Evaluation pipeline: resize the short edge to 384, then center-crop 384.
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='ResizeEdge', scale=384, edge='short', backend='pillow'),
+    dict(type='CenterCrop', crop_size=384),
+    dict(type='PackInputs'),
+]
+
+# Validation and testing share the same evaluation pipeline.
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+
+# schedule setting
+# Clip gradients to a maximum norm of 1.0.
+optim_wrapper = dict(clip_grad=dict(max_norm=1.0))
--- a/configs/vision_transformer/vit-large-p16_64xb64_in1k.py
+++ b/configs/vision_transformer/vit-large-p16_64xb64_in1k.py
+# Config built on the vit-large-p16 model base with the auto-augment dataset
+# base; only the head width, the batch augment and gradient clipping are
+# overridden here.
+_base_ = [
+    '../_base_/models/vit-large-p16.py',
+    '../_base_/datasets/imagenet_bs64_pil_resize_autoaug.py',
+    '../_base_/schedules/imagenet_bs4096_AdamW.py',
+    '../_base_/default_runtime.py'
+]
+
+# model setting
+model = dict(
+    # Set the classification head's hidden dimension.
+    head=dict(hidden_dim=3072),
+    # Apply Mixup (alpha=0.2) as a training-time augment.
+    train_cfg=dict(augments=dict(type='Mixup', alpha=0.2)),
+)
+
+# schedule setting
+# Clip gradients to a maximum norm of 1.0.
+optim_wrapper = dict(clip_grad=dict(max_norm=1.0))
--- a/configs/vision_transformer/vit-large-p32_64xb64_in1k-384px.py
+++ b/configs/vision_transformer/vit-large-p32_64xb64_in1k-384px.py
+# 384px variant built on the vit-large-p32 model base. Model, dataset,
+# schedule and runtime defaults come from the base files below; this file
+# only overrides the input size, normalization and data pipelines.
+# NOTE(review): the other 384px ViT configs in this change inherit
+# 'imagenet_bs64_pil_resize.py', while this one inherits the '_autoaug'
+# dataset base. The train/val/test pipelines are all replaced below, so this
+# is presumably harmless - confirm whether the difference is intentional.
+_base_ = [
+    '../_base_/models/vit-large-p32.py',
+    '../_base_/datasets/imagenet_bs64_pil_resize_autoaug.py',
+    '../_base_/schedules/imagenet_bs4096_AdamW.py',
+    '../_base_/default_runtime.py'
+]
+
+# model setting
+# Only the backbone input size is overridden here.
+model = dict(backbone=dict(img_size=384))
+
+# dataset setting
+data_preprocessor = dict(
+    mean=[127.5, 127.5, 127.5],
+    std=[127.5, 127.5, 127.5],
+    # convert image from BGR to RGB
+    to_rgb=True,
+)
+
+# Training pipeline: random 384px crop plus horizontal flip.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='RandomResizedCrop', scale=384, backend='pillow'),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(type='PackInputs'),
+]
+
+# Evaluation pipeline: resize the short edge to 384, then center-crop 384.
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='ResizeEdge', scale=384, edge='short', backend='pillow'),
+    dict(type='CenterCrop', crop_size=384),
+    dict(type='PackInputs'),
+]
+
+# Validation and testing share the same evaluation pipeline.
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+
+# schedule setting
+# Clip gradients to a maximum norm of 1.0.
+optim_wrapper = dict(clip_grad=dict(max_norm=1.0))
--- a/configs/vision_transformer/vit-large-p32_64xb64_in1k.py
+++ b/configs/vision_transformer/vit-large-p32_64xb64_in1k.py
+# Config built on the vit-large-p32 model base with the auto-augment dataset
+# base; only the head width, the batch augment and gradient clipping are
+# overridden here.
+_base_ = [
+    '../_base_/models/vit-large-p32.py',
+    '../_base_/datasets/imagenet_bs64_pil_resize_autoaug.py',
+    '../_base_/schedules/imagenet_bs4096_AdamW.py',
+    '../_base_/default_runtime.py'
+]
+
+# model setting
+model = dict(
+    # Set the classification head's hidden dimension.
+    head=dict(hidden_dim=3072),
+    # Apply Mixup (alpha=0.2) as a training-time augment.
+    train_cfg=dict(augments=dict(type='Mixup', alpha=0.2)),
+)
+
+# schedule setting
+# Clip gradients to a maximum norm of 1.0.
+optim_wrapper = dict(clip_grad=dict(max_norm=1.0))
--- a/configs/wrn/README.md
+++ b/configs/wrn/README.md
+# Wide-ResNet
+
+> [Wide Residual Networks](https://arxiv.org/abs/1605.07146)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Deep residual networks were shown to be able to scale up to thousands of layers and still have improving performance. However, each fraction of a percent of improved accuracy costs nearly doubling the number of layers, and so training very deep residual networks has a problem of diminishing feature reuse, which makes these networks very slow to train. To tackle these problems, in this paper we conduct a detailed experimental study on the architecture of ResNet blocks, based on which we propose a novel architecture where we decrease depth and increase width of residual networks. We call the resulting network structures wide residual networks (WRNs) and show that these are far superior over their commonly used thin and very deep counterparts. For example, we demonstrate that even a simple 16-layer-deep wide residual network outperforms in accuracy and efficiency all previous deep residual networks, including thousand-layer-deep networks, achieving new state-of-the-art results on CIFAR, SVHN, COCO, and significant improvements on ImageNet.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/26739999/156701329-2c7ec7bc-23da-401b-86bf-dea8567ccee8.png" width="90%"/>
+</div>
+
+## How to use it?
+
+<!-- [TABS-BEGIN] -->
+
+**Predict image**
+
+```python
+from mmpretrain import inference_model
+
+predict = inference_model('wide-resnet50_3rdparty_8xb32_in1k', 'demo/bird.JPEG')
+print(predict['pred_class'])
+print(predict['pred_score'])
+```
+
+**Use the model**
+
+```python
+import torch
+from mmpretrain import get_model
+
+model = get_model('wide-resnet50_3rdparty_8xb32_in1k', pretrained=True)
+inputs = torch.rand(1, 3, 224, 224)
+out = model(inputs)
+print(type(out))
+# To extract features.
+feats = model.extract_feat(inputs)
+print(type(feats))
+```
+
+**Test Command**
+
+Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
+
+Test:
+
+```shell
+python tools/test.py configs/wrn/wide-resnet50_8xb32_in1k.py https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet50_3rdparty_8xb32_in1k_20220304-66678344.pth
+```
+
+<!-- [TABS-END] -->
+
+## Models and results
+
+### Image Classification on ImageNet-1k
+
+| Model                                      |   Pretrain   | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) |                   Config                   |                              Download                               |
+| :----------------------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :----------------------------------------: | :-----------------------------------------------------------------: |
+| `wide-resnet50_3rdparty_8xb32_in1k`\*      | From scratch |   68.88    |   11.44   |   78.48   |   94.08   |   [config](wide-resnet50_8xb32_in1k.py)    | [model](https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet50_3rdparty_8xb32_in1k_20220304-66678344.pth) |
+| `wide-resnet101_3rdparty_8xb32_in1k`\*     | From scratch |   126.89   |   22.81   |   78.84   |   94.28   |   [config](wide-resnet101_8xb32_in1k.py)   | [model](https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet101_3rdparty_8xb32_in1k_20220304-8d5f9d61.pth) |
+| `wide-resnet50_3rdparty-timm_8xb32_in1k`\* | From scratch |   68.88    |   11.44   |   81.45   |   95.53   | [config](wide-resnet50_timm_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet50_3rdparty-timm_8xb32_in1k_20220304-83ae4399.pth) |
+
+*Models with * are converted from [torchvision](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py) and [timm](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/resnet.py). The config files of these models are only for inference. We haven't reproduced the training results.*
+
+## Citation
+
+```bibtex
+@INPROCEEDINGS{Zagoruyko2016WRN,
+    author = {Sergey Zagoruyko and Nikos Komodakis},
+    title = {Wide Residual Networks},
+    booktitle = {BMVC},
+    year = {2016}}
+```
--- a/configs/wrn/metafile.yml
+++ b/configs/wrn/metafile.yml
+Collections:
+  - Name: Wide-ResNet
+    Metadata:
+      Training Data: ImageNet-1k
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Epochs: 100
+      Batch Size: 256
+      Architecture:
+        - 1x1 Convolution
+        - Batch Normalization
+        - Convolution
+        - Global Average Pooling
+        - Max Pooling
+        - ReLU
+        - Residual Connection
+        - Softmax
+        - Wide Residual Block
+    Paper:
+      URL: https://arxiv.org/abs/1605.07146
+      Title: "Wide Residual Networks"
+    README: configs/wrn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmpretrain/blob/v0.20.1/mmcls/models/backbones/resnet.py#L383
+      Version: v0.20.1
+
+Models:
+  - Name: wide-resnet50_3rdparty_8xb32_in1k
+    Metadata:
+      FLOPs: 11440000000  # 11.44G
+      Parameters: 68880000  # 68.88M
+    In Collection: Wide-ResNet
+    Results:
+      - Task: Image Classification
+        Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 78.48
+          Top 5 Accuracy: 94.08
+    Weights: https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet50_3rdparty_8xb32_in1k_20220304-66678344.pth
+    Config: configs/wrn/wide-resnet50_8xb32_in1k.py
+    Converted From:
+      Weights: https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth
+      Code: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py
+  - Name: wide-resnet101_3rdparty_8xb32_in1k
+    Metadata:
+      FLOPs: 22810000000  # 22.81G
+      Parameters: 126890000 # 126.89M
+    In Collection: Wide-ResNet
+    Results:
+      - Task: Image Classification
+        Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 78.84
+          Top 5 Accuracy: 94.28
+    Weights: https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet101_3rdparty_8xb32_in1k_20220304-8d5f9d61.pth
+    Config: configs/wrn/wide-resnet101_8xb32_in1k.py
+    Converted From:
+      Weights: https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth
+      Code: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py
+  - Name: wide-resnet50_3rdparty-timm_8xb32_in1k
+    Metadata:
+      FLOPs: 11440000000  # 11.44G
+      Parameters: 68880000  # 68.88M
+    In Collection: Wide-ResNet
+    Results:
+      - Task: Image Classification
+        Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 81.45
+          Top 5 Accuracy: 95.53
+    Weights: https://download.openmmlab.com/mmclassification/v0/wrn/wide-resnet50_3rdparty-timm_8xb32_in1k_20220304-83ae4399.pth
+    Config: configs/wrn/wide-resnet50_timm_8xb32_in1k.py
+    Converted From:
+      Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/wide_resnet50_racm-8234f177.pth
+      Code: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/resnet.py
--- a/configs/wrn/wide-resnet101_8xb32_in1k.py
+++ b/configs/wrn/wide-resnet101_8xb32_in1k.py
+# Wide-ResNet-101 config: reuses the wide-resnet50 model base and changes
+# only the backbone depth.
+_base_ = [
+    '../_base_/models/wide-resnet50.py',
+    '../_base_/datasets/imagenet_bs32_pil_resize.py',
+    '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py'
+]
+
+# Override the inherited backbone depth with 101.
+model = dict(backbone=dict(depth=101))
--- a/configs/wrn/wide-resnet50_8xb32_in1k.py
+++ b/configs/wrn/wide-resnet50_8xb32_in1k.py
+# Wide-ResNet-50 config: composed entirely of the base model, dataset,
+# schedule and runtime files below, with no local overrides.
+_base_ = [
+    '../_base_/models/wide-resnet50.py',
+    '../_base_/datasets/imagenet_bs32_pil_resize.py',
+    '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py'
+]
--- a/configs/wrn/wide-resnet50_timm_8xb32_in1k.py
+++ b/configs/wrn/wide-resnet50_timm_8xb32_in1k.py
+# Wide-ResNet-50 config for the timm-converted weights: composed entirely of
+# the base files below, using the 'pil_bicubic' dataset base rather than
+# 'pil_resize'.
+_base_ = [
+    '../_base_/models/wide-resnet50.py',
+    '../_base_/datasets/imagenet_bs32_pil_bicubic.py',
+    '../_base_/schedules/imagenet_bs256.py', '../_base_/default_runtime.py'
+]
--- a/configs/xcit/README.md
+++ b/configs/xcit/README.md
+# XCiT
+
+> [XCiT: Cross-Covariance Image Transformers](https://arxiv.org/abs/2106.09681)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Following their success in natural language processing, transformers have recently shown much promise for computer vision. The self-attention operation underlying transformers yields global interactions between all tokens, i.e. words or image patches, and enables flexible modelling of image data beyond the local interactions of convolutions. This flexibility, however, comes with a quadratic complexity in time and memory, hindering application to long sequences and high-resolution images. We propose a "transposed" version of self-attention that operates across feature channels rather than tokens, where the interactions are based on the cross-covariance matrix between keys and queries. The resulting cross-covariance attention (XCA) has linear complexity in the number of tokens, and allows efficient processing of high-resolution images. Our cross-covariance image transformer (XCiT) is built upon XCA. It combines the accuracy of conventional transformers with the scalability of convolutional architectures. We validate the effectiveness and generality of XCiT by reporting excellent results on multiple vision benchmarks, including image classification and self-supervised feature learning on ImageNet-1k, object detection and instance segmentation on COCO, and semantic segmentation on ADE20k.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/26739999/218900814-64a44606-150b-4757-aec8-7015c77a9fd1.png" width="60%"/>
+</div>
+
+## How to use it?
+
+<!-- [TABS-BEGIN] -->
+
+**Use the model**
+
+```python
+import torch
+from mmpretrain import get_model
+
+model = get_model('xcit-nano-12-p16_3rdparty_in1k', pretrained=True)
+inputs = torch.rand(1, 3, 224, 224)
+out = model(inputs)
+print(type(out))
+# To extract features.
+feats = model.extract_feat(inputs)
+print(type(feats))
+```
+
+**Test Command**
+
+Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
+
+Test:
+
+```shell
+python tools/test.py configs/xcit/xcit-nano-12-p16_8xb128_in1k.py https://download.openmmlab.com/mmclassification/v0/xcit/xcit-nano-12-p16_3rdparty_in1k_20230213-ed776c38.pth
+```
+
+<!-- [TABS-END] -->
+
+## Models and results
+
+### Pretrained models
+
+| Model                                           | Params (M) | Flops (G) |                      Config                       |                                       Download                                        |
+| :---------------------------------------------- | :--------: | :-------: | :-----------------------------------------------: | :-----------------------------------------------------------------------------------: |
+| `xcit-nano-12-p16_3rdparty_in1k`\*              |    3.05    |   0.56    |     [config](xcit-nano-12-p16_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-nano-12-p16_3rdparty_in1k_20230213-ed776c38.pth) |
+| `xcit-nano-12-p16_3rdparty-dist_in1k`\*         |    3.05    |   0.56    |     [config](xcit-nano-12-p16_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-nano-12-p16_3rdparty-dist_in1k_20230213-fb247f7b.pth) |
+| `xcit-tiny-12-p16_3rdparty_in1k`\*              |    6.72    |   1.24    |     [config](xcit-tiny-12-p16_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-12-p16_3rdparty_in1k_20230213-82c547ca.pth) |
+| `xcit-tiny-12-p16_3rdparty-dist_in1k`\*         |    6.72    |   1.24    |     [config](xcit-tiny-12-p16_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-12-p16_3rdparty-dist_in1k_20230213-d5fde0a3.pth) |
+| `xcit-nano-12-p16_3rdparty-dist_in1k-384px`\*   |    3.05    |   1.64    |  [config](xcit-nano-12-p16_8xb128_in1k-384px.py)  | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-nano-12-p16_3rdparty-dist_in1k-384px_20230213-712db4d4.pth) |
+| `xcit-nano-12-p8_3rdparty_in1k`\*               |    3.05    |   2.16    |     [config](xcit-nano-12-p8_8xb128_in1k.py)      | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-nano-12-p8_3rdparty_in1k_20230213-3370c293.pth) |
+| `xcit-nano-12-p8_3rdparty-dist_in1k`\*          |    3.05    |   2.16    |     [config](xcit-nano-12-p8_8xb128_in1k.py)      | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-nano-12-p8_3rdparty-dist_in1k_20230213-2f87d2b3.pth) |
+| `xcit-tiny-24-p16_3rdparty_in1k`\*              |   12.12    |   2.34    |     [config](xcit-tiny-24-p16_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-24-p16_3rdparty_in1k_20230213-366c1cd0.pth) |
+| `xcit-tiny-24-p16_3rdparty-dist_in1k`\*         |   12.12    |   2.34    |     [config](xcit-tiny-24-p16_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-24-p16_3rdparty-dist_in1k_20230213-b472e80a.pth) |
+| `xcit-tiny-12-p16_3rdparty-dist_in1k-384px`\*   |    6.72    |   3.64    |  [config](xcit-tiny-12-p16_8xb128_in1k-384px.py)  | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-12-p16_3rdparty-dist_in1k-384px_20230213-00a20023.pth) |
+| `xcit-tiny-12-p8_3rdparty_in1k`\*               |    6.71    |   4.81    |     [config](xcit-tiny-12-p8_8xb128_in1k.py)      | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-12-p8_3rdparty_in1k_20230213-8b02f8f5.pth) |
+| `xcit-tiny-12-p8_3rdparty-dist_in1k`\*          |    6.71    |   4.81    |     [config](xcit-tiny-12-p8_8xb128_in1k.py)      | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-12-p8_3rdparty-dist_in1k_20230213-f3f9b44f.pth) |
+| `xcit-small-12-p16_3rdparty_in1k`\*             |   26.25    |   4.81    |    [config](xcit-small-12-p16_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-12-p16_3rdparty_in1k_20230213-d36779d2.pth) |
+| `xcit-small-12-p16_3rdparty-dist_in1k`\*        |   26.25    |   4.81    |    [config](xcit-small-12-p16_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-12-p16_3rdparty-dist_in1k_20230213-c95bbae1.pth) |
+| `xcit-nano-12-p8_3rdparty-dist_in1k-384px`\*    |    3.05    |   6.34    |  [config](xcit-nano-12-p8_8xb128_in1k-384px.py)   | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-nano-12-p8_3rdparty-dist_in1k-384px_20230213-09d925ef.pth) |
+| `xcit-tiny-24-p16_3rdparty-dist_in1k-384px`\*   |   12.12    |   6.87    |  [config](xcit-tiny-24-p16_8xb128_in1k-384px.py)  | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-24-p16_3rdparty-dist_in1k-384px_20230213-20e13917.pth) |
+| `xcit-small-24-p16_3rdparty_in1k`\*             |   47.67    |   9.10    |    [config](xcit-small-24-p16_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-24-p16_3rdparty_in1k_20230213-40febe38.pth) |
+| `xcit-small-24-p16_3rdparty-dist_in1k`\*        |   47.67    |   9.10    |    [config](xcit-small-24-p16_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-24-p16_3rdparty-dist_in1k_20230213-130d7262.pth) |
+| `xcit-tiny-24-p8_3rdparty_in1k`\*               |   12.11    |   9.21    |     [config](xcit-tiny-24-p8_8xb128_in1k.py)      | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-24-p8_3rdparty_in1k_20230213-4b9ba392.pth) |
+| `xcit-tiny-24-p8_3rdparty-dist_in1k`\*          |   12.11    |   9.21    |     [config](xcit-tiny-24-p8_8xb128_in1k.py)      | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-24-p8_3rdparty-dist_in1k_20230213-ad9c44b0.pth) |
+| `xcit-tiny-12-p8_3rdparty-dist_in1k-384px`\*    |    6.71    |   14.13   |  [config](xcit-tiny-12-p8_8xb128_in1k-384px.py)   | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-12-p8_3rdparty-dist_in1k-384px_20230213-a072174a.pth) |
+| `xcit-small-12-p16_3rdparty-dist_in1k-384px`\*  |   26.25    |   14.14   | [config](xcit-small-12-p16_8xb128_in1k-384px.py)  | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-12-p16_3rdparty-dist_in1k-384px_20230213-ba36c982.pth) |
+| `xcit-medium-24-p16_3rdparty_in1k`\*            |   84.40    |   16.13   |    [config](xcit-medium-24-p16_8xb128_in1k.py)    | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-medium-24-p16_3rdparty_in1k_20230213-ad0aa92e.pth) |
+| `xcit-medium-24-p16_3rdparty-dist_in1k`\*       |   84.40    |   16.13   |    [config](xcit-medium-24-p16_8xb128_in1k.py)    | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-medium-24-p16_3rdparty-dist_in1k_20230213-aca5cd0c.pth) |
+| `xcit-small-12-p8_3rdparty_in1k`\*              |   26.21    |   18.69   |     [config](xcit-small-12-p8_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-12-p8_3rdparty_in1k_20230213-9e364ce3.pth) |
+| `xcit-small-12-p8_3rdparty-dist_in1k`\*         |   26.21    |   18.69   |     [config](xcit-small-12-p8_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-12-p8_3rdparty-dist_in1k_20230213-71886580.pth) |
+| `xcit-small-24-p16_3rdparty-dist_in1k-384px`\*  |   47.67    |   26.72   | [config](xcit-small-24-p16_8xb128_in1k-384px.py)  | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-24-p16_3rdparty-dist_in1k-384px_20230213-28fa2d0e.pth) |
+| `xcit-tiny-24-p8_3rdparty-dist_in1k-384px`\*    |   12.11    |   27.05   |  [config](xcit-tiny-24-p8_8xb128_in1k-384px.py)   | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-24-p8_3rdparty-dist_in1k-384px_20230213-30d5e5ec.pth) |
+| `xcit-small-24-p8_3rdparty_in1k`\*              |   47.63    |   35.81   |     [config](xcit-small-24-p8_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-24-p8_3rdparty_in1k_20230213-280ebcc7.pth) |
+| `xcit-small-24-p8_3rdparty-dist_in1k`\*         |   47.63    |   35.81   |     [config](xcit-small-24-p8_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-24-p8_3rdparty-dist_in1k_20230213-f2773c78.pth) |
+| `xcit-large-24-p16_3rdparty_in1k`\*             |   189.10   |   35.86   |    [config](xcit-large-24-p16_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-large-24-p16_3rdparty_in1k_20230214-d29d2529.pth) |
+| `xcit-large-24-p16_3rdparty-dist_in1k`\*        |   189.10   |   35.86   |    [config](xcit-large-24-p16_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-large-24-p16_3rdparty-dist_in1k_20230214-4fea599c.pth) |
+| `xcit-medium-24-p16_3rdparty-dist_in1k-384px`\* |   84.40    |   47.39   | [config](xcit-medium-24-p16_8xb128_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-medium-24-p16_3rdparty-dist_in1k-384px_20230214-6c23a201.pth) |
+| `xcit-small-12-p8_3rdparty-dist_in1k-384px`\*   |   26.21    |   54.92   |  [config](xcit-small-12-p8_8xb128_in1k-384px.py)  | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-12-p8_3rdparty-dist_in1k-384px_20230214-9f2178bc.pth) |
+| `xcit-medium-24-p8_3rdparty_in1k`\*             |   84.32    |   63.52   |    [config](xcit-medium-24-p8_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-medium-24-p8_3rdparty_in1k_20230214-c362850b.pth) |
+| `xcit-medium-24-p8_3rdparty-dist_in1k`\*        |   84.32    |   63.52   |    [config](xcit-medium-24-p8_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-medium-24-p8_3rdparty-dist_in1k_20230214-625c953b.pth) |
+| `xcit-small-24-p8_3rdparty-dist_in1k-384px`\*   |   47.63    |  105.24   |  [config](xcit-small-24-p8_8xb128_in1k-384px.py)  | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-24-p8_3rdparty-dist_in1k-384px_20230214-57298eca.pth) |
+| `xcit-large-24-p16_3rdparty-dist_in1k-384px`\*  |   189.10   |  105.35   | [config](xcit-large-24-p16_8xb128_in1k-384px.py)  | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-large-24-p16_3rdparty-dist_in1k-384px_20230214-bd515a34.pth) |
+| `xcit-large-24-p8_3rdparty_in1k`\*              |   188.93   |  141.23   |     [config](xcit-large-24-p8_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-large-24-p8_3rdparty_in1k_20230214-08f2f664.pth) |
+| `xcit-large-24-p8_3rdparty-dist_in1k`\*         |   188.93   |  141.23   |     [config](xcit-large-24-p8_8xb128_in1k.py)     | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-large-24-p8_3rdparty-dist_in1k_20230214-8c092b34.pth) |
+| `xcit-medium-24-p8_3rdparty-dist_in1k-384px`\*  |   84.32    |  186.67   | [config](xcit-medium-24-p8_8xb128_in1k-384px.py)  | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-medium-24-p8_3rdparty-dist_in1k-384px_20230214-5db925e0.pth) |
+| `xcit-large-24-p8_3rdparty-dist_in1k-384px`\*   |   188.93   |  415.00   |  [config](xcit-large-24-p8_8xb128_in1k-384px.py)  | [model](https://download.openmmlab.com/mmclassification/v0/xcit/xcit-large-24-p8_3rdparty-dist_in1k-384px_20230214-9f718b1a.pth) |
+
+*Models with * are converted from the [official repo](https://github.com/facebookresearch/xcit). The config files of these models are only for inference. We haven't reproduced the training results.*
+
+## Citation
+
+```bibtex
+@article{el2021xcit,
+  title={XCiT: Cross-Covariance Image Transformers},
+  author={El-Nouby, Alaaeldin and Touvron, Hugo and Caron, Mathilde and Bojanowski, Piotr and Douze, Matthijs and Joulin, Armand and Laptev, Ivan and Neverova, Natalia and Synnaeve, Gabriel and Verbeek, Jakob and others},
+  journal={arXiv preprint arXiv:2106.09681},
+  year={2021}
+}
+```
--- a/configs/xcit/metafile.yml
+++ b/configs/xcit/metafile.yml
+Collections:
+  - Name: XCiT
+    Metadata:
+      Architecture:
+        - Class Attention
+        - Local Patch Interaction
+        - Cross-Covariance Attention
+    Paper:
+      Title: 'XCiT: Cross-Covariance Image Transformers'
+      URL: https://arxiv.org/abs/2106.09681
+    README: configs/xcit/README.md
+
+Models:
+  - Name: xcit-nano-12-p16_3rdparty_in1k
+    Metadata:
+      FLOPs: 557074560
+      Parameters: 3053224
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 70.35
+          Top 5 Accuracy: 89.98
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-nano-12-p16_3rdparty_in1k_20230213-ed776c38.pth
+    Config: configs/xcit/xcit-nano-12-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p16_224.pth
+  - Name: xcit-nano-12-p16_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 557074560
+      Parameters: 3053224
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 72.36
+          Top 5 Accuracy: 91.02
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-nano-12-p16_3rdparty-dist_in1k_20230213-fb247f7b.pth
+    Config: configs/xcit/xcit-nano-12-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p16_224_dist.pth
+  - Name: xcit-tiny-12-p16_3rdparty_in1k
+    Metadata:
+      FLOPs: 1239698112
+      Parameters: 6716272
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 77.21
+          Top 5 Accuracy: 93.62
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-12-p16_3rdparty_in1k_20230213-82c547ca.pth
+    Config: configs/xcit/xcit-tiny-12-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p16_224.pth
+  - Name: xcit-tiny-12-p16_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 1239698112
+      Parameters: 6716272
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 78.7
+          Top 5 Accuracy: 94.12
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-12-p16_3rdparty-dist_in1k_20230213-d5fde0a3.pth
+    Config: configs/xcit/xcit-tiny-12-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p16_224_dist.pth
+  - Name: xcit-nano-12-p16_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 1636347520
+      Parameters: 3053224
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 74.93
+          Top 5 Accuracy: 92.42
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-nano-12-p16_3rdparty-dist_in1k-384px_20230213-712db4d4.pth
+    Config: configs/xcit/xcit-nano-12-p16_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p16_384_dist.pth
+  - Name: xcit-nano-12-p8_3rdparty_in1k
+    Metadata:
+      FLOPs: 2156861056
+      Parameters: 3049016
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 73.8
+          Top 5 Accuracy: 92.08
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-nano-12-p8_3rdparty_in1k_20230213-3370c293.pth
+    Config: configs/xcit/xcit-nano-12-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p8_224.pth
+  - Name: xcit-nano-12-p8_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 2156861056
+      Parameters: 3049016
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 76.17
+          Top 5 Accuracy: 93.08
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-nano-12-p8_3rdparty-dist_in1k_20230213-2f87d2b3.pth
+    Config: configs/xcit/xcit-nano-12-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p8_224_dist.pth
+  - Name: xcit-tiny-24-p16_3rdparty_in1k
+    Metadata:
+      FLOPs: 2339305152
+      Parameters: 12116896
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 79.47
+          Top 5 Accuracy: 94.85
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-24-p16_3rdparty_in1k_20230213-366c1cd0.pth
+    Config: configs/xcit/xcit-tiny-24-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p16_224.pth
+  - Name: xcit-tiny-24-p16_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 2339305152
+      Parameters: 12116896
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 80.51
+          Top 5 Accuracy: 95.17
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-24-p16_3rdparty-dist_in1k_20230213-b472e80a.pth
+    Config: configs/xcit/xcit-tiny-24-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p16_224_dist.pth
+  - Name: xcit-tiny-12-p16_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 3641468352
+      Parameters: 6716272
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 80.58
+          Top 5 Accuracy: 95.38
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-12-p16_3rdparty-dist_in1k-384px_20230213-00a20023.pth
+    Config: configs/xcit/xcit-tiny-12-p16_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p16_384_dist.pth
+  - Name: xcit-tiny-12-p8_3rdparty_in1k
+    Metadata:
+      FLOPs: 4807399872
+      Parameters: 6706504
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 79.75
+          Top 5 Accuracy: 94.88
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-12-p8_3rdparty_in1k_20230213-8b02f8f5.pth
+    Config: configs/xcit/xcit-tiny-12-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p8_224.pth
+  - Name: xcit-tiny-12-p8_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 4807399872
+      Parameters: 6706504
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 81.26
+          Top 5 Accuracy: 95.46
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-12-p8_3rdparty-dist_in1k_20230213-f3f9b44f.pth
+    Config: configs/xcit/xcit-tiny-12-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p8_224_dist.pth
+  - Name: xcit-small-12-p16_3rdparty_in1k
+    Metadata:
+      FLOPs: 4814951808
+      Parameters: 26253304
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 81.87
+          Top 5 Accuracy: 95.77
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-12-p16_3rdparty_in1k_20230213-d36779d2.pth
+    Config: configs/xcit/xcit-small-12-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p16_224.pth
+  - Name: xcit-small-12-p16_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 4814951808
+      Parameters: 26253304
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 83.12
+          Top 5 Accuracy: 96.41
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-12-p16_3rdparty-dist_in1k_20230213-c95bbae1.pth
+    Config: configs/xcit/xcit-small-12-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p16_224_dist.pth
+  - Name: xcit-nano-12-p8_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 6337760896
+      Parameters: 3049016
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 77.69
+          Top 5 Accuracy: 94.09
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-nano-12-p8_3rdparty-dist_in1k-384px_20230213-09d925ef.pth
+    Config: configs/xcit/xcit-nano-12-p8_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p8_384_dist.pth
+  - Name: xcit-tiny-24-p16_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 6872966592
+      Parameters: 12116896
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 82.43
+          Top 5 Accuracy: 96.2
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-24-p16_3rdparty-dist_in1k-384px_20230213-20e13917.pth
+    Config: configs/xcit/xcit-tiny-24-p16_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p16_384_dist.pth
+  - Name: xcit-small-24-p16_3rdparty_in1k
+    Metadata:
+      FLOPs: 9095064960
+      Parameters: 47671384
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 82.38
+          Top 5 Accuracy: 95.93
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-24-p16_3rdparty_in1k_20230213-40febe38.pth
+    Config: configs/xcit/xcit-small-24-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p16_224.pth
+  - Name: xcit-small-24-p16_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 9095064960
+      Parameters: 47671384
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 83.7
+          Top 5 Accuracy: 96.61
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-24-p16_3rdparty-dist_in1k_20230213-130d7262.pth
+    Config: configs/xcit/xcit-small-24-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p16_224_dist.pth
+  - Name: xcit-tiny-24-p8_3rdparty_in1k
+    Metadata:
+      FLOPs: 9205828032
+      Parameters: 12107128
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 81.7
+          Top 5 Accuracy: 95.9
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-24-p8_3rdparty_in1k_20230213-4b9ba392.pth
+    Config: configs/xcit/xcit-tiny-24-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p8_224.pth
+  - Name: xcit-tiny-24-p8_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 9205828032
+      Parameters: 12107128
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 82.62
+          Top 5 Accuracy: 96.16
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-24-p8_3rdparty-dist_in1k_20230213-ad9c44b0.pth
+    Config: configs/xcit/xcit-tiny-24-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p8_224_dist.pth
+  - Name: xcit-tiny-12-p8_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 14126142912
+      Parameters: 6706504
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 82.46
+          Top 5 Accuracy: 96.22
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-12-p8_3rdparty-dist_in1k-384px_20230213-a072174a.pth
+    Config: configs/xcit/xcit-tiny-12-p8_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p8_384_dist.pth
+  - Name: xcit-small-12-p16_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 14143179648
+      Parameters: 26253304
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 84.74
+          Top 5 Accuracy: 97.19
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-12-p16_3rdparty-dist_in1k-384px_20230213-ba36c982.pth
+    Config: configs/xcit/xcit-small-12-p16_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p16_384_dist.pth
+  - Name: xcit-medium-24-p16_3rdparty_in1k
+    Metadata:
+      FLOPs: 16129561088
+      Parameters: 84395752
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 82.56
+          Top 5 Accuracy: 95.82
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-medium-24-p16_3rdparty_in1k_20230213-ad0aa92e.pth
+    Config: configs/xcit/xcit-medium-24-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p16_224.pth
+  - Name: xcit-medium-24-p16_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 16129561088
+      Parameters: 84395752
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 84.15
+          Top 5 Accuracy: 96.82
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-medium-24-p16_3rdparty-dist_in1k_20230213-aca5cd0c.pth
+    Config: configs/xcit/xcit-medium-24-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p16_224_dist.pth
+  - Name: xcit-small-12-p8_3rdparty_in1k
+    Metadata:
+      FLOPs: 18691601280
+      Parameters: 26213032
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 83.21
+          Top 5 Accuracy: 96.41
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-12-p8_3rdparty_in1k_20230213-9e364ce3.pth
+    Config: configs/xcit/xcit-small-12-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p8_224.pth
+  - Name: xcit-small-12-p8_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 18691601280
+      Parameters: 26213032
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 83.97
+          Top 5 Accuracy: 96.81
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-12-p8_3rdparty-dist_in1k_20230213-71886580.pth
+    Config: configs/xcit/xcit-small-12-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p8_224_dist.pth
+  - Name: xcit-small-24-p16_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 26721471360
+      Parameters: 47671384
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 85.1
+          Top 5 Accuracy: 97.32
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-24-p16_3rdparty-dist_in1k-384px_20230213-28fa2d0e.pth
+    Config: configs/xcit/xcit-small-24-p16_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p16_384_dist.pth
+  - Name: xcit-tiny-24-p8_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 27052135872
+      Parameters: 12107128
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 83.77
+          Top 5 Accuracy: 96.72
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-tiny-24-p8_3rdparty-dist_in1k-384px_20230213-30d5e5ec.pth
+    Config: configs/xcit/xcit-tiny-24-p8_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p8_384_dist.pth
+  - Name: xcit-small-24-p8_3rdparty_in1k
+    Metadata:
+      FLOPs: 35812053888
+      Parameters: 47631112
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 83.62
+          Top 5 Accuracy: 96.51
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-24-p8_3rdparty_in1k_20230213-280ebcc7.pth
+    Config: configs/xcit/xcit-small-24-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p8_224.pth
+  - Name: xcit-small-24-p8_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 35812053888
+      Parameters: 47631112
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 84.68
+          Top 5 Accuracy: 97.07
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-24-p8_3rdparty-dist_in1k_20230213-f2773c78.pth
+    Config: configs/xcit/xcit-small-24-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p8_224_dist.pth
+  - Name: xcit-large-24-p16_3rdparty_in1k
+    Metadata:
+      FLOPs: 35855948544
+      Parameters: 189096136
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 82.97
+          Top 5 Accuracy: 95.86
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-large-24-p16_3rdparty_in1k_20230214-d29d2529.pth
+    Config: configs/xcit/xcit-large-24-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p16_224.pth
+  - Name: xcit-large-24-p16_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 35855948544
+      Parameters: 189096136
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 84.61
+          Top 5 Accuracy: 97.07
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-large-24-p16_3rdparty-dist_in1k_20230214-4fea599c.pth
+    Config: configs/xcit/xcit-large-24-p16_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p16_224_dist.pth
+  - Name: xcit-medium-24-p16_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 47388932608
+      Parameters: 84395752
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 85.47
+          Top 5 Accuracy: 97.49
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-medium-24-p16_3rdparty-dist_in1k-384px_20230214-6c23a201.pth
+    Config: configs/xcit/xcit-medium-24-p16_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p16_384_dist.pth
+  - Name: xcit-small-12-p8_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 54923537280
+      Parameters: 26213032
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 85.12
+          Top 5 Accuracy: 97.31
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-12-p8_3rdparty-dist_in1k-384px_20230214-9f2178bc.pth
+    Config: configs/xcit/xcit-small-12-p8_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p8_384_dist.pth
+  - Name: xcit-medium-24-p8_3rdparty_in1k
+    Metadata:
+      FLOPs: 63524706816
+      Parameters: 84323624
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 83.61
+          Top 5 Accuracy: 96.23
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-medium-24-p8_3rdparty_in1k_20230214-c362850b.pth
+    Config: configs/xcit/xcit-medium-24-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p8_224.pth
+  - Name: xcit-medium-24-p8_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 63524706816
+      Parameters: 84323624
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 85.0
+          Top 5 Accuracy: 97.16
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-medium-24-p8_3rdparty-dist_in1k_20230214-625c953b.pth
+    Config: configs/xcit/xcit-medium-24-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p8_224_dist.pth
+  - Name: xcit-small-24-p8_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 105236704128
+      Parameters: 47631112
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 85.57
+          Top 5 Accuracy: 97.6
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-small-24-p8_3rdparty-dist_in1k-384px_20230214-57298eca.pth
+    Config: configs/xcit/xcit-small-24-p8_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p8_384_dist.pth
+  - Name: xcit-large-24-p16_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 105345095424
+      Parameters: 189096136
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 85.78
+          Top 5 Accuracy: 97.6
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-large-24-p16_3rdparty-dist_in1k-384px_20230214-bd515a34.pth
+    Config: configs/xcit/xcit-large-24-p16_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p16_384_dist.pth
+  - Name: xcit-large-24-p8_3rdparty_in1k
+    Metadata:
+      FLOPs: 141225699072
+      Parameters: 188932648
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 84.23
+          Top 5 Accuracy: 96.58
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-large-24-p8_3rdparty_in1k_20230214-08f2f664.pth
+    Config: configs/xcit/xcit-large-24-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p8_224.pth
+  - Name: xcit-large-24-p8_3rdparty-dist_in1k
+    Metadata:
+      FLOPs: 141225699072
+      Parameters: 188932648
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 85.14
+          Top 5 Accuracy: 97.32
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-large-24-p8_3rdparty-dist_in1k_20230214-8c092b34.pth
+    Config: configs/xcit/xcit-large-24-p8_8xb128_in1k.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p8_224_dist.pth
+  - Name: xcit-medium-24-p8_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 186672626176
+      Parameters: 84323624
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 85.87
+          Top 5 Accuracy: 97.61
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-medium-24-p8_3rdparty-dist_in1k-384px_20230214-5db925e0.pth
+    Config: configs/xcit/xcit-medium-24-p8_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p8_384_dist.pth
+  - Name: xcit-large-24-p8_3rdparty-dist_in1k-384px
+    Metadata:
+      FLOPs: 415003137792
+      Parameters: 188932648
+      Training Data: ImageNet-1k
+    In Collection: XCiT
+    Results:
+      - Dataset: ImageNet-1k
+        Metrics:
+          Top 1 Accuracy: 86.13
+          Top 5 Accuracy: 97.75
+        Task: Image Classification
+    Weights: https://download.openmmlab.com/mmclassification/v0/xcit/xcit-large-24-p8_3rdparty-dist_in1k-384px_20230214-9f718b1a.pth
+    Config: configs/xcit/xcit-large-24-p8_8xb128_in1k-384px.py
+    Converted From:
+      Code: https://github.com/facebookresearch/xcit
+      Weights: https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p8_384_dist.pth
--- a/configs/xcit/xcit-large-24-p16_8xb128_in1k-384px.py
+++ b/configs/xcit/xcit-large-24-p16_8xb128_in1k-384px.py
+# XCiT image-classification config. Going by the file name: large variant,
+# depth 24, patch size 16, ImageNet-1k at 384px.
+# Base config files (dataset, schedule and runtime, judging by their paths);
+# the assignments below are layered on top of them.
+_base_ = [
+    '../_base_/datasets/imagenet_bs64_swin_384.py',
+    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
+    '../_base_/default_runtime.py',
+]
+
+# Classifier definition: an XCiT backbone feeding a linear classification head.
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='XCiT',
+        patch_size=16,
+        embed_dims=768,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        layer_scale_init_value=1e-5,
+        tokens_norm=True,
+        out_type='cls_token',
+    ),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        # same value as the backbone's embed_dims above
+        in_channels=768,
+        loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
+    ),
+    # Mixup and CutMix are listed as the training-time augments.
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8),
+        dict(type='CutMix', alpha=1.0),
+    ]),
+)
+
+# dataset settings
+# Sets the training dataloader batch size to 128 (presumably per device,
+# matching "8xb128" in the file name -- TODO confirm).
+train_dataloader = dict(batch_size=128)
--- a/configs/xcit/xcit-large-24-p16_8xb128_in1k.py
+++ b/configs/xcit/xcit-large-24-p16_8xb128_in1k.py
+# XCiT image-classification config. Going by the file name: large variant,
+# depth 24, patch size 16, ImageNet-1k (224px dataset base).
+# Base config files (dataset, schedule and runtime, judging by their paths);
+# the assignments below are layered on top of them.
+_base_ = [
+    '../_base_/datasets/imagenet_bs64_swin_224.py',
+    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
+    '../_base_/default_runtime.py',
+]
+
+# Classifier definition: an XCiT backbone feeding a linear classification head.
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='XCiT',
+        patch_size=16,
+        embed_dims=768,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        layer_scale_init_value=1e-5,
+        tokens_norm=True,
+        out_type='cls_token',
+    ),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        # same value as the backbone's embed_dims above
+        in_channels=768,
+        loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
+    ),
+    # Mixup and CutMix are listed as the training-time augments.
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8),
+        dict(type='CutMix', alpha=1.0),
+    ]),
+)
+
+# dataset settings
+# Sets the training dataloader batch size to 128 (presumably per device,
+# matching "8xb128" in the file name -- TODO confirm).
+train_dataloader = dict(batch_size=128)
--- a/configs/xcit/xcit-large-24-p8_8xb128_in1k-384px.py
+++ b/configs/xcit/xcit-large-24-p8_8xb128_in1k-384px.py
+# XCiT image-classification config. Going by the file name: large variant,
+# depth 24, patch size 8, ImageNet-1k at 384px.
+# Base config files (dataset, schedule and runtime, judging by their paths);
+# the assignments below are layered on top of them.
+_base_ = [
+    '../_base_/datasets/imagenet_bs64_swin_384.py',
+    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
+    '../_base_/default_runtime.py',
+]
+
+# Classifier definition: an XCiT backbone feeding a linear classification head.
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='XCiT',
+        patch_size=8,
+        embed_dims=768,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        layer_scale_init_value=1e-5,
+        tokens_norm=True,
+        out_type='cls_token',
+    ),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        # same value as the backbone's embed_dims above
+        in_channels=768,
+        loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
+    ),
+    # Mixup and CutMix are listed as the training-time augments.
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8),
+        dict(type='CutMix', alpha=1.0),
+    ]),
+)
+
+# dataset settings
+# Sets the training dataloader batch size to 128 (presumably per device,
+# matching "8xb128" in the file name -- TODO confirm).
+train_dataloader = dict(batch_size=128)
--- a/configs/xcit/xcit-large-24-p8_8xb128_in1k.py
+++ b/configs/xcit/xcit-large-24-p8_8xb128_in1k.py
+# XCiT image-classification config. Going by the file name: large variant,
+# depth 24, patch size 8, ImageNet-1k (224px dataset base).
+# Base config files (dataset, schedule and runtime, judging by their paths);
+# the assignments below are layered on top of them.
+_base_ = [
+    '../_base_/datasets/imagenet_bs64_swin_224.py',
+    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
+    '../_base_/default_runtime.py',
+]
+
+# Classifier definition: an XCiT backbone feeding a linear classification head.
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='XCiT',
+        patch_size=8,
+        embed_dims=768,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        layer_scale_init_value=1e-5,
+        tokens_norm=True,
+        out_type='cls_token',
+    ),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        # same value as the backbone's embed_dims above
+        in_channels=768,
+        loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
+    ),
+    # Mixup and CutMix are listed as the training-time augments.
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8),
+        dict(type='CutMix', alpha=1.0),
+    ]),
+)
+
+# dataset settings
+# Sets the training dataloader batch size to 128 (presumably per device,
+# matching "8xb128" in the file name -- TODO confirm).
+train_dataloader = dict(batch_size=128)
--- a/configs/xcit/xcit-medium-24-p16_8xb128_in1k-384px.py
+++ b/configs/xcit/xcit-medium-24-p16_8xb128_in1k-384px.py
+# XCiT image-classification config. Going by the file name: medium variant,
+# depth 24, patch size 16, ImageNet-1k at 384px.
+# Base config files (dataset, schedule and runtime, judging by their paths);
+# the assignments below are layered on top of them.
+_base_ = [
+    '../_base_/datasets/imagenet_bs64_swin_384.py',
+    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
+    '../_base_/default_runtime.py',
+]
+
+# Classifier definition: an XCiT backbone feeding a linear classification head.
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='XCiT',
+        patch_size=16,
+        embed_dims=512,
+        depth=24,
+        num_heads=8,
+        mlp_ratio=4,
+        qkv_bias=True,
+        layer_scale_init_value=1e-5,
+        tokens_norm=True,
+        out_type='cls_token',
+    ),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        # same value as the backbone's embed_dims above
+        in_channels=512,
+        loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
+    ),
+    # Mixup and CutMix are listed as the training-time augments.
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8),
+        dict(type='CutMix', alpha=1.0),
+    ]),
+)
+
+# dataset settings
+# Sets the training dataloader batch size to 128 (presumably per device,
+# matching "8xb128" in the file name -- TODO confirm).
+train_dataloader = dict(batch_size=128)
--- a/configs/xcit/xcit-medium-24-p16_8xb128_in1k.py
+++ b/configs/xcit/xcit-medium-24-p16_8xb128_in1k.py
+# XCiT image-classification config. Going by the file name: medium variant,
+# depth 24, patch size 16, ImageNet-1k (224px dataset base).
+# Base config files (dataset, schedule and runtime, judging by their paths);
+# the assignments below are layered on top of them.
+_base_ = [
+    '../_base_/datasets/imagenet_bs64_swin_224.py',
+    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
+    '../_base_/default_runtime.py',
+]
+
+# Classifier definition: an XCiT backbone feeding a linear classification head.
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='XCiT',
+        patch_size=16,
+        embed_dims=512,
+        depth=24,
+        num_heads=8,
+        mlp_ratio=4,
+        qkv_bias=True,
+        layer_scale_init_value=1e-5,
+        tokens_norm=True,
+        out_type='cls_token',
+    ),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        # same value as the backbone's embed_dims above
+        in_channels=512,
+        loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
+    ),
+    # Mixup and CutMix are listed as the training-time augments.
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8),
+        dict(type='CutMix', alpha=1.0),
+    ]),
+)
+
+# dataset settings
+# Sets the training dataloader batch size to 128 (presumably per device,
+# matching "8xb128" in the file name -- TODO confirm).
+train_dataloader = dict(batch_size=128)