Commit dff2c686 authored by renzhc

first commit

parent 8f9dd0ed
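# Note: the 'headless' EVA02 configs below define only the pre-trained backbone
# (neck=None, head=None), matching the *_headless.py entries in the accompanying
# metafile; they are intended for feature extraction or as a starting point for
# fine-tuning rather than for direct evaluation.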
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='b',
img_size=224,
patch_size=14,
sub_ln=True,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
# convert image from BGR to RGB
to_rgb=True,
)
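# For reference: the mean/std above are the CLIP normalization constants scaled
# to the 0-255 pixel range (e.g. 0.48145466 * 255 = 122.770938...); the same
# values appear precomputed in the Flamingo configs' data_preprocessor settings.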
_base_ = [
'../_base_/datasets/imagenet_bs16_eva_448.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='b',
img_size=448,
patch_size=14,
sub_ln=True,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='l',
img_size=224,
patch_size=14,
sub_ln=True,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
# convert image from BGR to RGB
to_rgb=True,
)
_base_ = [
'../_base_/datasets/imagenet_bs16_eva_448.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='l',
img_size=448,
patch_size=14,
sub_ln=True,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=1024,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='s',
img_size=224,
patch_size=14,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
# convert image from BGR to RGB
to_rgb=True,
)
_base_ = [
'../_base_/datasets/imagenet_bs16_eva_336.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='s',
img_size=336,
patch_size=14,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=384,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='t',
img_size=224,
patch_size=14,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=None,
)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
# convert image from BGR to RGB
to_rgb=True,
)
_base_ = [
'../_base_/datasets/imagenet_bs16_eva_336.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
model = dict(
type='ImageClassifier',
backbone=dict(
type='ViTEVA02',
arch='t',
img_size=336,
patch_size=14,
final_norm=False,
out_type='avg_featmap'),
neck=None,
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=192,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=.02),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
],
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
Collections:
- Name: EVA02
Metadata:
Architecture:
- Rotary Position Embedding
- Sub Layer Normalization
- SwiGLU
Paper:
Title: 'EVA-02: A Visual Representation for Neon Genesis'
URL: https://arxiv.org/abs/2303.11331
README: configs/eva02/README.md
Models:
- Name: vit-tiny-p14_eva02-pre_in21k
Metadata:
FLOPs: 1703439360
Parameters: 5504064
Training Data:
- ImageNet-21k
In Collection: EVA02
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-tiny-p14_pre_in21k_20230505-d703e7b1.pth
Config: configs/eva02/eva02-tiny-p14_headless.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/pt/eva02_Ti_pt_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
Downstream:
- vit-tiny-p14_eva02-in21k-pre_3rdparty_in1k-336px
- Name: vit-tiny-p14_eva02-in21k-pre_3rdparty_in1k-336px
Metadata:
FLOPs: 4675416000
Parameters: 5758888
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: EVA02
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 80.69
Top 5 Accuracy: 95.54
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-tiny-p14_in21k-pre_3rdparty_in1k-336px_20230505-a4e8708a.pth
Config: configs/eva02/eva02-tiny-p14_in1k.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/cls/in1k/eva02_Ti_pt_in21k_ft_in1k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
- Name: vit-small-p14_eva02-pre_in21k
Metadata:
FLOPs: 6135404544
Parameters: 21624960
Training Data:
- ImageNet-21k
In Collection: EVA02
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-small-p14_pre_in21k_20230505-3175f463.pth
Config: configs/eva02/eva02-small-p14_headless.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/pt/eva02_S_pt_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
Downstream:
- vit-small-p14_eva02-in21k-pre_3rdparty_in1k-336px
- Name: vit-small-p14_eva02-in21k-pre_3rdparty_in1k-336px
Metadata:
FLOPs: 15476744064
Parameters: 22133608
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: EVA02
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 85.78
Top 5 Accuracy: 97.60
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-small-p14_in21k-pre_3rdparty_in1k-336px_20230505-9c5b0e85.pth
Config: configs/eva02/eva02-small-p14_in1k.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/cls/in1k/eva02_S_pt_in21k_ft_in1k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
- Name: vit-base-p14_eva02-pre_in21k
Metadata:
FLOPs: 23216492544
Parameters: 85766400
Training Data:
- ImageNet-21k
In Collection: EVA02
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-base-p14_pre_in21k_20230505-2f2d4d3c.pth
Config: configs/eva02/eva02-base-p14_headless.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/pt/eva02_B_pt_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
Downstream:
- vit-base-p14_eva02-in21k-pre_3rdparty_in1k-448px
- vit-base-p14_eva02-in21k-pre_in21k-medft_3rdparty_in1k-448px
- Name: vit-base-p14_eva02-in21k-pre_3rdparty_in1k-448px
Metadata:
FLOPs: 107105984256
Parameters: 87126760
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: EVA02
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 88.29
Top 5 Accuracy: 98.53
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-base-p14_in21k-pre_3rdparty_in1k-448px_20230505-8ad211c5.pth
Config: configs/eva02/eva02-base-p14_in1k.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/cls/in1k/eva02_B_pt_in21k_ft_in1k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
- Name: vit-base-p14_eva02-in21k-pre_in21k-medft_3rdparty_in1k-448px
Metadata:
FLOPs: 107105984256
Parameters: 87126760
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: EVA02
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 88.47
Top 5 Accuracy: 98.62
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-base-p14_in21k-pre_in21k-medft_3rdparty_in1k-448px_20230505-5cd4d87f.pth
Config: configs/eva02/eva02-base-p14_in1k.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/cls/in21k/eva02_B_pt_in21k_medft_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
- Name: vit-large-p14_eva02-pre_in21k
Metadata:
FLOPs: 81146703792
Parameters: 303291328
Training Data:
- ImageNet-21k
In Collection: EVA02
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-large-p14_pre_in21k_20230505-9072de5d.pth
Config: configs/eva02/eva02-large-p14_headless.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/pt/eva02_L_pt_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
Downstream:
- vit-large-p14_eva02-in21k-pre_in21k-medft_3rdparty_in1k-448px
- Name: vit-large-p14_eva02-in21k-pre_in21k-medft_3rdparty_in1k-448px
Metadata:
FLOPs: 362333836208
Parameters: 305104808
Training Data:
- ImageNet-21k
- ImageNet-1k
In Collection: EVA02
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 89.65
Top 5 Accuracy: 98.95
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-large-p14_in21k-pre_in21k-medft_3rdparty_in1k-448px_20230505-926d1599.pth
Config: configs/eva02/eva02-large-p14_in1k.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/cls/in21k/eva02_L_pt_in21k_medft_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
- Name: vit-large-p14_eva02-pre_m38m
Metadata:
FLOPs: 81146703792
Parameters: 303291328
Training Data:
- Merged-38M
In Collection: EVA02
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-large-p14_pre_m38m_20230505-b8a1a261.pth
Config: configs/eva02/eva02-large-p14_headless.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/pt/eva02_L_pt_m38m_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
Downstream:
- vit-large-p14_eva02_m38m-pre_in21k-medft_3rdparty_in1k-448px
- Name: vit-large-p14_eva02_m38m-pre_in21k-medft_3rdparty_in1k-448px
Metadata:
FLOPs: 362333836208
Parameters: 305104808
Training Data:
- Merged-38M
- ImageNet-21k
- ImageNet-1k
In Collection: EVA02
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 89.83
Top 5 Accuracy: 99.00
Weights: https://download.openmmlab.com/mmpretrain/v1.0/eva02/eva02-large-p14_m38m-pre_in21k-medft_3rdparty_in1k-448px_20230505-150dc5ed.pth
Config: configs/eva02/eva02-large-p14_in1k.py
Converted From:
Weights: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/cls/in21k/eva02_L_pt_m38m_medft_in21k_p14.pt
Code: https://github.com/baaivision/EVA/tree/master/EVA-02
# Flamingo
> [Flamingo: a Visual Language Model for Few-Shot Learning](https://arxiv.org/abs/2204.14198)
<!-- [ALGORITHM] -->
## Abstract
Building models that can be rapidly adapted to novel tasks using only a handful of annotated examples is an open challenge for multimodal machine learning research. We introduce Flamingo, a family of Visual Language Models (VLM) with this ability. We propose key architectural innovations to: (i) bridge powerful pretrained vision-only and language-only models, (ii) handle sequences of arbitrarily interleaved visual and textual data, and (iii) seamlessly ingest images or videos as inputs. Thanks to their flexibility, Flamingo models can be trained on large-scale multimodal web corpora containing arbitrarily interleaved text and images, which is key to endow them with in-context few-shot learning capabilities. We perform a thorough evaluation of our models, exploring and measuring their ability to rapidly adapt to a variety of image and video tasks. These include open-ended tasks such as visual question-answering, where the model is prompted with a question which it has to answer; captioning tasks, which evaluate the ability to describe a scene or an event; and close-ended tasks such as multiple-choice visual question-answering. For tasks lying anywhere on this spectrum, a single Flamingo model can achieve a new state of the art with few-shot learning, simply by prompting the model with task-specific examples. On numerous benchmarks, Flamingo outperforms models fine-tuned on thousands of times more task-specific data.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/236371424-3b9d2e16-3966-4c64-8b87-e33fd6348824.png" width="80%"/>
</div>
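The interleaved few-shot interface described above surfaces in the configs as plain prompt templates. Below is a minimal sketch of how a few-shot caption prompt could be assembled from the `shot_prompt_tmpl` and `final_prompt_tmpl` values used in the few-shot caption config (illustrative only, not the library's internal implementation; the example captions are the ones used in the zero-shot caption prompt):

```python
# Illustrative sketch: assembling an interleaved few-shot caption prompt from the
# template strings defined in the Flamingo configs.
shot_prompt_tmpl = '<image>Output:{caption}<|endofchunk|>'
final_prompt_tmpl = '<image>Output:'

# Example captions (taken from the zero-shot prompt in the caption config); in the
# actual pipeline these are presumably filled from the sampled support set
# (num_shots in the dataset config).
shots = [
    'A child holding a flowered umbrella and petting a yak.',
    'The child is holding a brush close to his mouth.',
]

prompt = ''.join(shot_prompt_tmpl.format(caption=c) for c in shots) + final_prompt_tmpl
print(prompt)
# -> '<image>Output:A child holding ...<|endofchunk|><image>Output:The child ...<|endofchunk|><image>Output:'
```

Each `<image>` token marks where a vision-encoded image is attended to by the frozen language model through the gated cross-attention layers.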
## How to use it?
<!-- [TABS-BEGIN] -->
**Use the model**
```python
from mmpretrain import inference_model
result = inference_model('flamingo_3rdparty-zeroshot_caption', 'demo/cat-dog.png')
print(result)
# {'pred_caption': 'A dog and a cat are looking at each other. '}
```
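The zero-shot VQA variant from the table below can be called through the same entry point. A hedged sketch (assuming the VQA inferencer accepts the question as the argument following the image path):

```python
from mmpretrain import inference_model

# Sketch: model name taken from the metafile in this folder; the question string
# is assumed to be accepted right after the image path.
result = inference_model('flamingo_3rdparty-zeroshot_vqa',
                         'demo/cat-dog.png',
                         'What animals are in the picture?')
print(result)
```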
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/flamingo/flamingo_zeroshot_caption.py https://download.openmmlab.com/mmclassification/v1/flamingo/openflamingo-9b-adapter_20230505-554310c8.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Caption on COCO
| Model | Params (G) | CIDER | Config | Download |
| :------------------------------------- | :--------: | :---: | :------------------------------------: | :-----------------------------------------------------------------------------------------------------------: |
| `flamingo_3rdparty-zeroshot_caption`\* | 8.220 | 65.50 | [config](flamingo_zeroshot_caption.py) | [model](https://download.openmmlab.com/mmclassification/v1/flamingo/openflamingo-9b-adapter_20230505-554310c8.pth) |
*Models with * are converted from the [openflamingo](https://github.com/mlfoundations/open_flamingo) repository. The config files of these models are only for inference. We haven't reproduced the training results.*
### Visual Question Answering on VQAv2
| Model | Params (G) | Accuracy | Config | Download |
| :--------------------------------- | :--------: | :------: | :--------------------------------: | :----------------------------------------------------------------------------------------------------------------: |
| `flamingo_3rdparty-zeroshot_vqa`\* | 8.22 | 43.50 | [config](flamingo_zeroshot_vqa.py) | [model](https://download.openmmlab.com/mmclassification/v1/flamingo/openflamingo-9b-adapter_20230505-554310c8.pth) |
*Models with * are converted from the [openflamingo](https://github.com/mlfoundations/open_flamingo) repository. The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{Alayrac2022FlamingoAV,
title={Flamingo: a Visual Language Model for Few-Shot Learning},
author={Jean-Baptiste Alayrac and Jeff Donahue and Pauline Luc and Antoine Miech and Iain Barr and Yana Hasson and Karel Lenc and Arthur Mensch and Katie Millican and Malcolm Reynolds and Roman Ring and Eliza Rutherford and Serkan Cabi and Tengda Han and Zhitao Gong and Sina Samangooei and Marianne Monteiro and Jacob Menick and Sebastian Borgeaud and Andy Brock and Aida Nematzadeh and Sahand Sharifzadeh and Mikolaj Binkowski and Ricardo Barreira and Oriol Vinyals and Andrew Zisserman and Karen Simonyan},
journal={ArXiv},
year={2022},
volume={abs/2204.14198}
}
```
```bibtex
@software{anas_awadalla_2023_7733589,
author = {Awadalla, Anas and Gao, Irena and Gardner, Joshua and Hessel, Jack and Hanafy, Yusuf and Zhu, Wanrong and Marathe, Kalyani and Bitton, Yonatan and Gadre, Samir and Jitsev, Jenia and Kornblith, Simon and Koh, Pang Wei and Ilharco, Gabriel and Wortsman, Mitchell and Schmidt, Ludwig},
title = {OpenFlamingo},
month = mar,
year = 2023,
publisher = {Zenodo},
version = {v0.1.1},
doi = {10.5281/zenodo.7733589},
url = {https://doi.org/10.5281/zenodo.7733589}
}
```
_base_ = [
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='Flamingo',
tokenizer=dict(
type='LlamaTokenizer', name_or_path='decapoda-research/llama-7b-hf'),
vision_encoder=dict(
type='VisionTransformer',
arch='l',
patch_size=14,
pre_norm=True,
norm_cfg=dict(type='LN', eps=1e-5),
layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
final_norm=False,
out_type='raw',
pretrained=(
'https://download.openmmlab.com/mmclassification/v0/clip/'
'vit-large-p14_clip-openai-pre_3rdparty_20230517-95e2af0b.pth'),
),
lang_encoder=dict(
base=dict(
type='AutoModelForCausalLM',
name_or_path='decapoda-research/llama-7b-hf',
local_files_only=True),
adapter=dict(
type='FlamingoLMAdapter',
vis_hidden_size=1024,
cross_attn_every_n_layers=4,
use_media_placement_augmentation=False),
),
task='caption',
shot_prompt_tmpl='<image>Output:{caption}<|endofchunk|>',
final_prompt_tmpl='<image>Output:',
generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0))
# data settings
data_preprocessor = dict(
mean=[122.770938, 116.7460125, 104.09373615],
std=[68.5005327, 66.6321579, 70.32316305],
to_rgb=True,
)
test_pipeline = [
dict(
type='ApplyToList',
        # Flamingo needs to load multiple images during few-shot inference.
scatter_key='img_path',
transforms=[
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=224,
interpolation='bicubic',
backend='pillow'),
dict(type='CenterCrop', crop_size=(224, 224)),
],
collate_keys=['img', 'scale_factor', 'ori_shape'],
),
dict(
type='PackInputs',
algorithm_keys=['gt_caption', 'shots'],
meta_keys=['image_id']),
]
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOCaption',
data_root='data/coco',
ann_file='annotations/captions_train2014.json',
data_prefix=dict(img_path='train2014'),
pipeline=test_pipeline,
num_shots=2,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
val_evaluator = dict(
type='COCOCaption',
ann_file='data/coco/annotations/captions_train2014.json')
# If you want standard test, please manually configure the test dataset
test_dataloader = val_dataloader
test_evaluator = val_evaluator
# schedule settings
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='Flamingo',
tokenizer=dict(
type='LlamaTokenizer', name_or_path='decapoda-research/llama-7b-hf'),
vision_encoder=dict(
type='VisionTransformer',
arch='l',
patch_size=14,
pre_norm=True,
norm_cfg=dict(type='LN', eps=1e-5),
layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
final_norm=False,
out_type='raw',
pretrained=(
'https://download.openmmlab.com/mmclassification/v0/clip/'
'vit-large-p14_clip-openai-pre_3rdparty_20230517-95e2af0b.pth'),
),
lang_encoder=dict(
base=dict(
type='AutoModelForCausalLM',
name_or_path='decapoda-research/llama-7b-hf',
local_files_only=True),
adapter=dict(
type='FlamingoLMAdapter',
vis_hidden_size=1024,
cross_attn_every_n_layers=4,
use_media_placement_augmentation=False),
),
task='vqa',
shot_prompt_tmpl=
'<image>Question:{question} Short Answer:{answer}<|endofchunk|>',
final_prompt_tmpl='<image>Question:{question} Short Answer:',
generation_cfg=dict(num_beams=3, max_new_tokens=5, length_penalty=-2.0))
# data settings
data_preprocessor = dict(
mean=[122.770938, 116.7460125, 104.09373615],
std=[68.5005327, 66.6321579, 70.32316305],
to_rgb=True,
)
test_pipeline = [
dict(
type='ApplyToList',
        # Flamingo needs to load multiple images during few-shot inference.
scatter_key='img_path',
transforms=[
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=224,
interpolation='bicubic',
backend='pillow'),
dict(type='CenterCrop', crop_size=(224, 224)),
],
collate_keys=['img', 'scale_factor', 'ori_shape'],
),
dict(
type='PackInputs',
algorithm_keys=['question', 'gt_answer', 'gt_answer_weight', 'shots'],
meta_keys=['image_id']),
]
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOVQA',
data_root='data/coco',
data_prefix='val2014',
question_file='annotations/v2_OpenEnded_mscoco_val2014_questions.json',
ann_file='annotations/v2_mscoco_val2014_annotations.json',
pipeline=test_pipeline,
num_shots=2,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
val_evaluator = dict(type='VQAAcc')
test_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOVQA',
data_root='data/coco',
data_prefix='test2015',
question_file=
'annotations/v2_OpenEnded_mscoco_test-dev2015_questions.json',
pipeline=test_pipeline,
num_shots=0,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
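# Note: the VQAv2 test-dev split has no public answers, so the evaluator below
# only dumps predictions to a JSON file (for submission to the evaluation server)
# instead of computing accuracy directly.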
test_evaluator = dict(type='ReportVQA', file_path='vqa_test-dev.json')
# schedule settings
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/default_runtime.py',
]
zeroshot_prompt = (
'Output:A child holding a flowered umbrella and petting a yak.<|endofchunk|>' # noqa: E501
'Output:The child is holding a brush close to his mouth.<|endofchunk|>' # noqa: E501
)
# model settings
model = dict(
type='Flamingo',
tokenizer=dict(
type='LlamaTokenizer', name_or_path='decapoda-research/llama-7b-hf'),
vision_encoder=dict(
type='VisionTransformer',
arch='l',
patch_size=14,
pre_norm=True,
norm_cfg=dict(type='LN', eps=1e-5),
layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
final_norm=False,
out_type='raw',
pretrained=(
'https://download.openmmlab.com/mmclassification/v0/clip/'
'vit-large-p14_clip-openai-pre_3rdparty_20230517-95e2af0b.pth'),
),
lang_encoder=dict(
base=dict(
type='AutoModelForCausalLM',
name_or_path='decapoda-research/llama-7b-hf',
local_files_only=True),
adapter=dict(
type='FlamingoLMAdapter',
vis_hidden_size=1024,
cross_attn_every_n_layers=4,
use_media_placement_augmentation=False),
),
task='caption',
zeroshot_prompt=zeroshot_prompt,
final_prompt_tmpl='<image>Output:',
generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0),
)
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[122.770938, 116.7460125, 104.09373615],
std=[68.5005327, 66.6321579, 70.32316305],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=224,
interpolation='bicubic',
backend='pillow'),
dict(type='CenterCrop', crop_size=(224, 224)),
dict(
type='PackInputs',
algorithm_keys=['gt_caption'],
meta_keys=['image_id'],
),
]
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOCaption',
data_root='data/coco',
ann_file='annotations/captions_train2014.json',
data_prefix=dict(img_path='train2014'),
pipeline=test_pipeline,
num_shots=0,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
val_evaluator = dict(
type='COCOCaption',
ann_file='data/coco/annotations/captions_train2014.json')
# If you want standard test, please manually configure the test dataset
test_dataloader = val_dataloader
test_evaluator = val_evaluator
# schedule settings
val_cfg = dict()
test_cfg = dict()
_base_ = [
'../_base_/default_runtime.py',
]
zeroshot_prompt = (
'Question:What is this photo taken looking through? Short Answer:pitcher<|endofchunk|>' # noqa: E501
'Question:How many people are wearing shorts in the forefront of this photo? Short Answer:4<|endofchunk|>' # noqa: E501
)
# model settings
model = dict(
type='Flamingo',
tokenizer=dict(
type='LlamaTokenizer', name_or_path='decapoda-research/llama-7b-hf'),
vision_encoder=dict(
type='VisionTransformer',
arch='l',
patch_size=14,
pre_norm=True,
norm_cfg=dict(type='LN', eps=1e-5),
layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
final_norm=False,
out_type='raw',
pretrained=(
'https://download.openmmlab.com/mmclassification/v0/clip/'
'vit-large-p14_clip-openai-pre_3rdparty_20230517-95e2af0b.pth'),
),
lang_encoder=dict(
base=dict(
type='AutoModelForCausalLM',
name_or_path='decapoda-research/llama-7b-hf',
local_files_only=True),
adapter=dict(
type='FlamingoLMAdapter',
vis_hidden_size=1024,
cross_attn_every_n_layers=4,
use_media_placement_augmentation=False),
),
task='vqa',
zeroshot_prompt=zeroshot_prompt,
final_prompt_tmpl='<image>Question:{question} Short Answer:',
generation_cfg=dict(num_beams=3, max_new_tokens=5, length_penalty=-2.0))
# data settings
data_preprocessor = dict(
type='MultiModalDataPreprocessor',
mean=[122.770938, 116.7460125, 104.09373615],
std=[68.5005327, 66.6321579, 70.32316305],
to_rgb=True,
)
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=224,
interpolation='bicubic',
backend='pillow'),
dict(type='CenterCrop', crop_size=(224, 224)),
dict(
type='PackInputs',
algorithm_keys=['question', 'gt_answer', 'gt_answer_weight', 'shots'],
meta_keys=['image_id'],
),
]
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOVQA',
data_root='data/coco',
data_prefix='val2014',
question_file='annotations/v2_OpenEnded_mscoco_val2014_questions.json',
ann_file='annotations/v2_mscoco_val2014_annotations.json',
pipeline=test_pipeline,
num_shots=0,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
val_evaluator = dict(type='VQAAcc')
test_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
type='FlamingoEvalCOCOVQA',
data_root='data/coco',
data_prefix='test2015',
question_file=
'annotations/v2_OpenEnded_mscoco_test-dev2015_questions.json',
pipeline=test_pipeline,
num_shots=0,
num_support_examples=2048,
num_query_examples=5000,
),
sampler=dict(type='DefaultSampler', shuffle=False),
persistent_workers=True,
)
test_evaluator = dict(type='ReportVQA', file_path='vqa_test-dev.json')
# schedule settings
val_cfg = dict()
test_cfg = dict()
Collections:
- Name: Flamingo
Metadata:
Architecture:
- Transformer
- Gated Cross-Attention Dense
Paper:
Title: 'Flamingo: a Visual Language Model for Few-Shot Learning'
URL: https://arxiv.org/abs/2204.14198
README: configs/flamingo/README.md
Models:
- Name: flamingo_3rdparty-zeroshot_caption
Metadata:
FLOPs: null
Parameters: 8220452880
In Collection: Flamingo
Results:
- Task: Image Caption
Dataset: COCO
Metrics:
CIDER: 65.50 # Report from the official repo
Weights: https://download.openmmlab.com/mmclassification/v1/flamingo/openflamingo-9b-adapter_20230505-554310c8.pth
Config: configs/flamingo/flamingo_zeroshot_caption.py
Converted From:
Weights: https://huggingface.co/openflamingo/OpenFlamingo-9B
Code: https://github.com/mlfoundations/open_flamingo
- Name: flamingo_3rdparty-zeroshot_vqa
Metadata:
FLOPs: null
Parameters: 8220452880
In Collection: Flamingo
Results:
- Task: Visual Question Answering
Dataset: VQAv2
Metrics:
Accuracy: 43.50 # Report from the official repo
Weights: https://download.openmmlab.com/mmclassification/v1/flamingo/openflamingo-9b-adapter_20230505-554310c8.pth
Config: configs/flamingo/flamingo_zeroshot_vqa.py
Converted From:
Weights: https://huggingface.co/openflamingo/OpenFlamingo-9B
Code: https://github.com/mlfoundations/open_flamingo
# GLIP
> [Grounded Language-Image Pre-training](https://arxiv.org/abs/2112.03857)
<!-- [ALGORITHM] -->
## Abstract
This paper presents a grounded language-image pre-training (GLIP) model for learning object-level, language-aware, and semantic-rich visual representations. GLIP unifies object detection and phrase grounding for pre-training. The unification brings two benefits: 1) it allows GLIP to learn from both detection and grounding data to improve both tasks and bootstrap a good grounding model; 2) GLIP can leverage massive image-text pairs by generating grounding boxes in a self-training fashion, making the learned representation semantic-rich. In our experiments, we pre-train GLIP on 27M grounding data, including 3M human-annotated and 24M web-crawled image-text pairs. The learned representations demonstrate strong zero-shot and few-shot transferability to various object-level recognition tasks. 1) When directly evaluated on COCO and LVIS (without seeing any images in COCO during pre-training), GLIP achieves 49.8 AP and 26.9 AP, respectively, surpassing many supervised baselines. 2) After fine-tuned on COCO, GLIP achieves 60.8 AP on val and 61.5 AP on test-dev, surpassing prior SoTA. 3) When transferred to 13 downstream object detection tasks, a 1-shot GLIP rivals with a fully-supervised Dynamic Head.
<div align="center">
<img src="https://github.com/microsoft/GLIP/blob/main/docs/lead.png" width="70%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('swin-t_glip-pre_3rdparty', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
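The 384px GLIP-L backbone from the table below can be loaded the same way; a sketch, feeding an input at the 384x384 resolution it was pre-trained with:

```python
import torch
from mmpretrain import get_model

# Sketch: model name and resolution taken from the table/metafile below.
model = get_model('swin-l_glip-pre_3rdparty_384px', pretrained=True)
feats = model.extract_feat(torch.rand(1, 3, 384, 384))
print(type(feats))
```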
<!-- [TABS-END] -->
## Results and models
### Pre-trained models
The pre-trained models are only used for fine-tuning, and therefore don't have evaluation results.
| Model | Pretrain | resolution | Download |
| :------------------------------------------ | :------------------------: | :--------: | :-------------------------------------------------------------------------------------------------------------------: |
| GLIP-T (`swin-t_glip-pre_3rdparty`)\* | O365,GoldG,CC3M,SBU | 224x224 | [model](https://download.openmmlab.com/mmclassification/v1/glip/swin-t_glip-pre_3rdparty_20230413-d85813b5.pth) |
| GLIP-L (`swin-l_glip-pre_3rdparty_384px`)\* | FourODs,GoldG,CC3M+12M,SBU | 384x384 | [model](https://download.openmmlab.com/mmclassification/v1/glip/swin-l_glip-pre_3rdparty_384px_20230413-04b198e8.pth) |
*Models with * are converted from the [official repo](https://github.com/microsoft/GLIP).*
## Citation
```bibtex
@inproceedings{li2021grounded,
title={Grounded Language-Image Pre-training},
author={Liunian Harold Li* and Pengchuan Zhang* and Haotian Zhang* and Jianwei Yang and Chunyuan Li and Yiwu Zhong and Lijuan Wang and Lu Yuan and Lei Zhang and Jenq-Neng Hwang and Kai-Wei Chang and Jianfeng Gao},
year={2022},
booktitle={CVPR},
}
```
model = dict(
type='ImageClassifier',
backbone=dict(
type='SwinTransformer',
arch='large',
img_size=384,
out_indices=(1, 2, 3), # original weight is for detection
stage_cfgs=dict(block_cfgs=dict(window_size=12))),
neck=None,
head=None)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[103.53, 116.28, 123.675],
std=[57.375, 57.12, 58.395],
    # keep the original BGR channel order (no BGR-to-RGB conversion)
to_rgb=False,
)
model = dict(
type='ImageClassifier',
backbone=dict(
type='SwinTransformer',
arch='tiny',
img_size=224,
out_indices=(1, 2, 3), # original weight is for detection
),
neck=None,
head=None)
data_preprocessor = dict(
# RGB format normalization parameters
mean=[103.53, 116.28, 123.675],
std=[57.375, 57.12, 58.395],
    # keep the original BGR channel order (no BGR-to-RGB conversion)
to_rgb=False,
)
Collections:
- Name: GLIP
Metadata:
Training Techniques:
- AdamW
- Weight Decay
Architecture:
- Shift Window Multihead Self Attention
Paper:
URL: https://arxiv.org/abs/2112.03857
Title: "Grounded Language-Image Pre-training"
README: configs/glip/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/main/mmpretrain/models/backbones/vit.py
Version: v1.0.0rc8
Models:
- Name: swin-t_glip-pre_3rdparty
In Collection: GLIP
Metadata:
FLOPs: 4508464128
Parameters: 29056354
Training Data:
- O365
- GoldG
- CC3M
- SBU
Results: null
Weights: https://download.openmmlab.com/mmclassification/v1/glip/swin-t_glip-pre_3rdparty_20230413-d85813b5.pth
Converted From:
Weights: https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/glip_tiny_model_o365_goldg_cc_sbu.pth
Code: https://github.com/microsoft/GLIP
Config: configs/glip/glip-t_headless.py
- Name: swin-l_glip-pre_3rdparty_384px
In Collection: GLIP
Metadata:
FLOPs: 104080343040
Parameters: 196735516
Training Data:
- FourODs
- GoldG
- CC3M+12M
- SBU
Results: null
Weights: https://download.openmmlab.com/mmclassification/v1/glip/swin-l_glip-pre_3rdparty_384px_20230413-04b198e8.pth
Converted From:
Weights: https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/glip_large_model.pth
Code: https://github.com/microsoft/GLIP
Config: configs/glip/glip-l_headless.py
# HiViT
> [HiViT: A Simple and More Efficient Design of Hierarchical Vision Transformer](https://arxiv.org/abs/2205.14949)
<!-- [ALGORITHM] -->
## Abstract
Recently, masked image modeling (MIM) has offered a new methodology of self-supervised pre-training of vision transformers. A key idea of efficient implementation is to discard the masked image patches (or tokens) throughout the target network (encoder), which requires the encoder to be a plain vision transformer (e.g., ViT), albeit hierarchical vision transformers (e.g., Swin Transformer) have potentially better properties in formulating vision inputs. In this paper, we offer a new design of hierarchical vision transformers named HiViT (short for Hierarchical ViT) that enjoys both high efficiency and good performance in MIM. The key is to remove the unnecessary "local inter-unit operations", deriving structurally simple hierarchical vision transformers in which mask-units can be serialized like plain vision transformers. For this purpose, we start with Swin Transformer and (i) set the masking unit size to be the token size in the main stage of Swin Transformer, (ii) switch off inter-unit self-attentions before the main stage, and (iii) eliminate all operations after the main stage. Empirical studies demonstrate the advantageous performance of HiViT in terms of fully-supervised, self-supervised, and transfer learning. In particular, in running MAE on ImageNet-1K, HiViT-B reports a +0.6% accuracy gain over ViT-B and a 1.9$\times$ speed-up over Swin-B, and the performance gain generalizes to downstream tasks of detection and segmentation. Code will be made publicly available.
<div align=center>
<img src="https://github.com/open-mmlab/mmpretrain/assets/36138628/4a99cf9d-15df-4866-8750-bd2c3db5d894" width="80%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
<!-- **Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('hivit-tiny-p16_16xb64_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
``` -->
<!-- **Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('hivit-tiny-p16_16xb64_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
``` -->
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/hivit/hivit-tiny-p16_16xb64_in1k.py
```
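The `16xb64` part of the config name encodes the intended schedule (16 GPUs with a batch size of 64 per GPU). To reproduce it with the standard OpenMMLab distributed launcher (a sketch; adjust the GPU count to your setup):

```shell
bash tools/dist_train.sh configs/hivit/hivit-tiny-p16_16xb64_in1k.py 16
```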
<!-- Test:
```shell
python tools/test.py configs/hivit/hivit-tiny-p16_16xb64_in1k.py None
``` -->
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Config | Download |
| :---------------------------- | :----------: | :--------: | :-------: | :-------: | :--------------------------------------: | :------: |
| `hivit-tiny-p16_16xb64_in1k` | From scratch | 19.18 | 4.60 | 82.10 | [config](hivit-tiny-p16_16xb64_in1k.py) | N/A |
| `hivit-small-p16_16xb64_in1k` | From scratch | 37.53 | 9.07 | N/A | [config](hivit-small-p16_16xb64_in1k.py) | N/A |
| `hivit-base-p16_16xb64_in1k` | From scratch | 79.05 | 18.47 | N/A | [config](hivit-base-p16_16xb64_in1k.py) | N/A |
## Citation
```bibtex
@inproceedings{zhanghivit,
title={HiViT: A Simpler and More Efficient Design of Hierarchical Vision Transformer},
author={Zhang, Xiaosong and Tian, Yunjie and Xie, Lingxi and Huang, Wei and Dai, Qi and Ye, Qixiang and Tian, Qi},
booktitle={International Conference on Learning Representations},
year={2023},
}
```