Commit dff2c686 authored by renzhc

first commit

parent 8f9dd0ed
_base_ = [
'../../_base_/models/resnet50.py',
'../../_base_/datasets/imagenet_bs32_pil_resize.py',
'../../_base_/schedules/imagenet_sgd_coslr_100e.py',
'../../_base_/default_runtime.py',
]
# dataset settings
train_dataloader = dict(batch_size=128)
model = dict(
backbone=dict(
frozen_stages=4,
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')))
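# NOTE: the empty `checkpoint` above is a placeholder for the MoCo v3
# pre-trained weights. A typical (hypothetical) way to fill it in without
# editing the file is the standard MMEngine override, e.g.:
#   python tools/train.py <this_config.py> \
#       --cfg-options model.backbone.init_cfg.checkpoint=path/to/mocov3_pretrain.pth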
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=0.4, momentum=0.9, weight_decay=0.))
# learning rate scheduler
param_scheduler = [
dict(type='CosineAnnealingLR', T_max=90, by_epoch=True, begin=0, end=90)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=90)
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
_base_ = [
'../../_base_/datasets/imagenet_bs32_pil_resize.py',
'../../_base_/default_runtime.py',
]
# dataset settings
train_dataloader = dict(batch_size=128)
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='MoCoV3ViT',
arch='base', # embed_dim = 768
img_size=224,
patch_size=16,
stop_grad_conv1=True,
frozen_stages=12,
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')),
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
init_cfg=dict(type='Normal', std=0.01, layer='Linear'),
))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=12, momentum=0.9, weight_decay=0.))
# learning rate scheduler
param_scheduler = [
dict(type='CosineAnnealingLR', T_max=90, by_epoch=True, begin=0, end=90)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=90)
val_cfg = dict()
test_cfg = dict()
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/default_runtime.py',
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='base',
img_size=224,
patch_size=16,
drop_path_rate=0.1,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')),
neck=None,
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
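# The `augments` list above is applied as a batch augmentation: for each
# training batch one of Mixup (alpha=0.8) or CutMix (alpha=1.0) is sampled,
# which is the common recipe for fine-tuning ViT-style classifiers.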
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW', lr=5e-4, eps=1e-8, betas=(0.9, 0.999),
weight_decay=0.05),
clip_grad=dict(max_norm=5.0),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
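# The `paramwise_cfg` above removes weight decay from normalization layers,
# biases, the class token and the position embedding, which is the usual
# practice when fine-tuning ViT backbones with AdamW.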
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-3,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=145,
eta_min=1e-5,
by_epoch=True,
begin=5,
end=150,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=150)
val_cfg = dict()
test_cfg = dict()
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
randomness = dict(seed=0)
_base_ = [
'../../_base_/datasets/imagenet_bs64_swin_224.py',
'../../_base_/default_runtime.py',
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='VisionTransformer',
arch='large',
img_size=224,
patch_size=16,
drop_path_rate=0.5,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')),
neck=None,
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=1024,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
]),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0)
]))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
type='AdamW', lr=5e-4, eps=1e-8, betas=(0.9, 0.999),
weight_decay=0.05),
clip_grad=dict(max_norm=5.0),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-3,
begin=0,
end=5,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=95,
eta_min=1e-5,
by_epoch=True,
begin=5,
end=100,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100)
val_cfg = dict()
test_cfg = dict()
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
randomness = dict(seed=0)
_base_ = [
'../../_base_/datasets/imagenet_bs32_pil_resize.py',
'../../_base_/default_runtime.py',
]
# dataset settings
train_dataloader = dict(batch_size=128)
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='MoCoV3ViT',
arch='mocov3-small', # embed_dim = 384
img_size=224,
patch_size=16,
stop_grad_conv1=True,
frozen_stages=12,
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='', prefix='backbone.')),
head=dict(
type='VisionTransformerClsHead',
num_classes=1000,
in_channels=384,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
init_cfg=dict(type='Normal', std=0.01, layer='Linear'),
))
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='SGD', lr=12, momentum=0.9, weight_decay=0.))
# learning rate scheduler
param_scheduler = [
dict(type='CosineAnnealingLR', T_max=90, by_epoch=True, begin=0, end=90)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=90)
val_cfg = dict()
test_cfg = dict()
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3))
Collections:
- Name: MoCoV3
Metadata:
Training Data: ImageNet-1k
Training Techniques:
- LARS
Training Resources: 32x V100 GPUs
Architecture:
- ResNet
- ViT
- MoCo
Paper:
Title: An Empirical Study of Training Self-Supervised Vision Transformers
URL: https://arxiv.org/abs/2104.02057
README: configs/mocov3/README.md
Models:
- Name: mocov3_resnet50_8xb512-amp-coslr-100e_in1k
Metadata:
Epochs: 100
Batch Size: 4096
FLOPs: 4109364224
Parameters: 68012160
Training Data: ImageNet-1k
In Collection: MoCoV3
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/mocov3_resnet50_8xb512-amp-coslr-100e_in1k_20220927-f1144efa.pth
Config: configs/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k.py
Downstream:
- resnet50_mocov3-100e-pre_8xb128-linear-coslr-90e_in1k
- Name: mocov3_resnet50_8xb512-amp-coslr-300e_in1k
Metadata:
Epochs: 300
Batch Size: 4096
FLOPs: 4109364224
Parameters: 68012160
Training Data: ImageNet-1k
In Collection: MoCoV3
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/mocov3_resnet50_8xb512-amp-coslr-300e_in1k_20220927-1e4f3304.pth
Config: configs/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k.py
Downstream:
- resnet50_mocov3-300e-pre_8xb128-linear-coslr-90e_in1k
- Name: mocov3_resnet50_8xb512-amp-coslr-800e_in1k
Metadata:
Epochs: 800
Batch Size: 4096
FLOPs: 4109364224
Parameters: 68012160
Training Data: ImageNet-1k
In Collection: MoCoV3
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/mocov3_resnet50_8xb512-amp-coslr-800e_in1k_20220927-e043f51a.pth
Config: configs/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k.py
Downstream:
- resnet50_mocov3-800e-pre_8xb128-linear-coslr-90e_in1k
- Name: resnet50_mocov3-100e-pre_8xb128-linear-coslr-90e_in1k
Metadata:
Epochs: 90
Batch Size: 1024
FLOPs: 4109464576
Parameters: 25557032
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 69.6
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-100e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-8f7d937e.pth
Config: configs/mocov3/benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py
- Name: resnet50_mocov3-300e-pre_8xb128-linear-coslr-90e_in1k
Metadata:
Epochs: 90
Batch Size: 1024
FLOPs: 4109464576
Parameters: 25557032
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 72.8
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-300e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-d21ddac2.pth
Config: configs/mocov3/benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py
- Name: resnet50_mocov3-800e-pre_8xb128-linear-coslr-90e_in1k
Metadata:
Epochs: 90
Batch Size: 1024
FLOPs: 4109464576
Parameters: 25557032
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 74.4
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/resnet50_linear-8xb128-coslr-90e_in1k/resnet50_linear-8xb128-coslr-90e_in1k_20220927-0e97a483.pth
Config: configs/mocov3/benchmarks/resnet50_8xb128-linear-coslr-90e_in1k.py
- Name: mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k
Metadata:
Epochs: 300
Batch Size: 4096
FLOPs: 4607954304
Parameters: 84266752
Training Data: ImageNet-1k
In Collection: MoCoV3
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k-224_20220826-08bc52f7.pth
Config: configs/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k.py
Downstream:
- vit-small-p16_mocov3-pre_8xb128-linear-coslr-90e_in1k
- Name: vit-small-p16_mocov3-pre_8xb128-linear-coslr-90e_in1k
Metadata:
Epochs: 90
Batch Size: 1024
FLOPs: 4607954304
Parameters: 22050664
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 73.6
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-small-p16_16xb256-amp-coslr-300e_in1k/vit-small-p16_linear-8xb128-coslr-90e_in1k/vit-small-p16_linear-8xb128-coslr-90e_in1k_20220826-376674ef.pth
Config: configs/mocov3/benchmarks/vit-small-p16_8xb128-linear-coslr-90e_in1k.py
- Name: mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k
Metadata:
Epochs: 300
Batch Size: 4096
FLOPs: 17581972224
Parameters: 215678464
Training Data: ImageNet-1k
In Collection: MoCoV3
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k-224_20220826-25213343.pth
Config: configs/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k.py
Downstream:
- vit-base-p16_mocov3-pre_8xb128-linear-coslr-90e_in1k
- vit-base-p16_mocov3-pre_8xb64-coslr-150e_in1k
- Name: vit-base-p16_mocov3-pre_8xb64-coslr-150e_in1k
Metadata:
Epochs: 150
Batch Size: 512
FLOPs: 17581972224
Parameters: 86567656
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.0
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/vit-base-p16_ft-8xb64-coslr-150e_in1k/vit-base-p16_ft-8xb64-coslr-150e_in1k_20220826-f1e6c442.pth
Config: configs/mocov3/benchmarks/vit-base-p16_8xb64-coslr-150e_in1k.py
- Name: vit-base-p16_mocov3-pre_8xb128-linear-coslr-90e_in1k
Metadata:
Epochs: 90
Batch Size: 1024
FLOPs: 17581972224
Parameters: 86567656
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 76.9
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-base-p16_16xb256-amp-coslr-300e_in1k/vit-base-p16_linear-8xb128-coslr-90e_in1k/vit-base-p16_linear-8xb128-coslr-90e_in1k_20220826-83be7758.pth
Config: configs/mocov3/benchmarks/vit-base-p16_8xb128-linear-coslr-90e_in1k.py
- Name: mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k
Metadata:
Epochs: 300
Batch Size: 4096
FLOPs: 61603111936
Parameters: 652781568
Training Data: ImageNet-1k
In Collection: MoCoV3
Results: null
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k-224_20220829-9b88a442.pth
Config: configs/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k.py
Downstream:
- vit-large-p16_mocov3-pre_8xb64-coslr-100e_in1k
- Name: vit-large-p16_mocov3-pre_8xb64-coslr-100e_in1k
Metadata:
Epochs: 100
Batch Size: 512
FLOPs: 61603111936
Parameters: 304326632
Training Data: ImageNet-1k
In Collection: MoCoV3
Results:
- Task: Image Classification
Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.7
Weights: https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_vit-large-p16_64xb64-amp-coslr-300e_in1k/vit-large-p16_ft-8xb64-coslr-100e_in1k/vit-large-p16_ft-8xb64-coslr-100e_in1k_20220829-878a2f7f.pth
Config: configs/mocov3/benchmarks/vit-large-p16_8xb64-coslr-100e_in1k.py
_base_ = [
'../_base_/datasets/imagenet_bs512_mocov3.py',
'../_base_/default_runtime.py',
]
# model settings
temperature = 1.0
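# MoCo v3 scales its symmetrized InfoNCE loss by 2 * tau, hence the
# `loss_weight=2 * temperature` in the head below.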
model = dict(
type='MoCoV3',
base_momentum=0.01, # 0.01 for 100e and 300e, 0.004 for 1000e
backbone=dict(
type='ResNet',
depth=50,
norm_cfg=dict(type='SyncBN'),
zero_init_residual=False),
neck=dict(
type='NonLinearNeck',
in_channels=2048,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=True),
head=dict(
type='MoCoV3Head',
predictor=dict(
type='NonLinearNeck',
in_channels=256,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=False,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
loss=dict(type='CrossEntropyLoss', loss_weight=2 * temperature),
temperature=temperature))
# optimizer
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(type='LARS', lr=9.6, weight_decay=1e-6, momentum=0.9),
paramwise_cfg=dict(
custom_keys={
'bn': dict(decay_mult=0, lars_exclude=True),
'bias': dict(decay_mult=0, lars_exclude=True),
# bn layer in ResNet block downsample module
'downsample.1': dict(decay_mult=0, lars_exclude=True),
}),
)
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=90,
by_epoch=True,
begin=10,
end=100,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100)
# only keeps the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
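# A rough sketch of what the auto-scaling does (assuming MMEngine's default
# linear scaling rule): scaled_lr = lr * actual_batch_size / base_batch_size.
# This config assumes 8 GPUs x 512 images = 4096; launching with, say,
# 8 x 256 = 2048 and LR auto-scaling enabled would use 9.6 * 2048 / 4096 = 4.8.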
_base_ = [
'../_base_/datasets/imagenet_bs512_mocov3.py',
'../_base_/default_runtime.py',
]
# model settings
temperature = 1.0
model = dict(
type='MoCoV3',
base_momentum=0.01, # 0.01 for 100e and 300e, 0.004 for 1000e
backbone=dict(
type='ResNet',
depth=50,
norm_cfg=dict(type='SyncBN'),
zero_init_residual=False),
neck=dict(
type='NonLinearNeck',
in_channels=2048,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=True),
head=dict(
type='MoCoV3Head',
predictor=dict(
type='NonLinearNeck',
in_channels=256,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=False,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
loss=dict(type='CrossEntropyLoss', loss_weight=2 * temperature),
temperature=temperature))
# optimizer
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(type='LARS', lr=4.8, weight_decay=1e-6, momentum=0.9),
paramwise_cfg=dict(
custom_keys={
'bn': dict(decay_mult=0, lars_exclude=True),
'bias': dict(decay_mult=0, lars_exclude=True),
# bn layer in ResNet block downsample module
'downsample.1': dict(decay_mult=0, lars_exclude=True),
}),
)
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=290,
by_epoch=True,
begin=10,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
# only keeps the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/datasets/imagenet_bs512_mocov3.py',
'../_base_/default_runtime.py',
]
# model settings
temperature = 1.0
model = dict(
type='MoCoV3',
base_momentum=0.004, # 0.01 for 100e and 300e, 0.004 for 800 and 1000e
backbone=dict(
type='ResNet',
depth=50,
norm_cfg=dict(type='SyncBN'),
zero_init_residual=False),
neck=dict(
type='NonLinearNeck',
in_channels=2048,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=True),
head=dict(
type='MoCoV3Head',
predictor=dict(
type='NonLinearNeck',
in_channels=256,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=False,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
loss=dict(type='CrossEntropyLoss', loss_weight=2 * temperature),
temperature=temperature))
# optimizer
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(type='LARS', lr=4.8, weight_decay=1.5e-6, momentum=0.9),
paramwise_cfg=dict(
custom_keys={
'bn': dict(decay_mult=0, lars_exclude=True),
'bias': dict(decay_mult=0, lars_exclude=True),
# bn layer in ResNet block downsample module
'downsample.1': dict(decay_mult=0, lars_exclude=True),
}),
)
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=10,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=790,
by_epoch=True,
begin=10,
end=800,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=800)
# only keeps the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/datasets/imagenet_bs512_mocov3.py',
'../_base_/default_runtime.py',
]
# dataset settings
# the difference between ResNet50 and ViT pipeline is the `scale` in
# `RandomResizedCrop`, `scale=(0.08, 1.)` in ViT pipeline
view_pipeline1 = [
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.08, 1.),
backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(
type='GaussianBlur',
magnitude_range=(0.1, 2.0),
magnitude_std='inf',
prob=1.),
dict(type='Solarize', thr=128, prob=0.),
dict(type='RandomFlip', prob=0.5),
]
view_pipeline2 = [
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.08, 1.),
backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(
type='GaussianBlur',
magnitude_range=(0.1, 2.0),
magnitude_std='inf',
prob=0.1),
dict(type='Solarize', thr=128, prob=0.2),
dict(type='RandomFlip', prob=0.5),
]
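# Note that the two views are deliberately asymmetric (view 1: blur always,
# no solarization; view 2: blur with prob 0.1, solarization with prob 0.2),
# following the BYOL-style augmentation recipe adopted by MoCo v3.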
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiView',
num_views=[1, 1],
transforms=[view_pipeline1, view_pipeline2]),
dict(type='PackInputs')
]
train_dataloader = dict(batch_size=256, dataset=dict(pipeline=train_pipeline))
# model settings
temperature = 0.2
model = dict(
type='MoCoV3',
base_momentum=0.01,
backbone=dict(
type='MoCoV3ViT',
arch='base', # embed_dim = 768
img_size=224,
patch_size=16,
stop_grad_conv1=True),
neck=dict(
type='NonLinearNeck',
in_channels=768,
hid_channels=4096,
out_channels=256,
num_layers=3,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
head=dict(
type='MoCoV3Head',
predictor=dict(
type='NonLinearNeck',
in_channels=256,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
loss=dict(type='CrossEntropyLoss', loss_weight=2 * temperature),
temperature=temperature))
# optimizer
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(type='AdamW', lr=2.4e-3, weight_decay=0.1))
find_unused_parameters = True
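# `find_unused_parameters` is presumably needed because `stop_grad_conv1=True`
# detaches the patch-embedding gradients, so DDP must tolerate parameters that
# never receive a gradient.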
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=260,
by_epoch=True,
begin=40,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
# only keeps the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/datasets/imagenet_bs512_mocov3.py',
'../_base_/default_runtime.py',
]
# dataset settings
# the difference between ResNet50 and ViT pipeline is the `scale` in
# `RandomResizedCrop`, `scale=(0.08, 1.)` in ViT pipeline
view_pipeline1 = [
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.08, 1.),
backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(
type='GaussianBlur',
magnitude_range=(0.1, 2.0),
magnitude_std='inf',
prob=1.),
dict(type='Solarize', thr=128, prob=0.),
dict(type='RandomFlip', prob=0.5),
]
view_pipeline2 = [
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.08, 1.),
backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(
type='GaussianBlur',
magnitude_range=(0.1, 2.0),
magnitude_std='inf',
prob=0.1),
dict(type='Solarize', thr=128, prob=0.2),
dict(type='RandomFlip', prob=0.5),
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiView',
num_views=[1, 1],
transforms=[view_pipeline1, view_pipeline2]),
dict(type='PackInputs')
]
train_dataloader = dict(batch_size=64, dataset=dict(pipeline=train_pipeline))
# model settings
temperature = 0.2
model = dict(
type='MoCoV3',
base_momentum=0.01,
backbone=dict(
type='MoCoV3ViT',
arch='large', # embed_dim = 1024
img_size=224,
patch_size=16,
stop_grad_conv1=True),
neck=dict(
type='NonLinearNeck',
in_channels=1024,
hid_channels=4096,
out_channels=256,
num_layers=3,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
head=dict(
type='MoCoV3Head',
predictor=dict(
type='NonLinearNeck',
in_channels=256,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
loss=dict(type='CrossEntropyLoss', loss_weight=2 * temperature),
temperature=temperature))
# optimizer
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
clip_grad=dict(max_norm=5.0, error_if_nonfinite=False),
optimizer=dict(type='AdamW', lr=2.4e-3, weight_decay=0.1))
find_unused_parameters = True
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=260,
by_epoch=True,
begin=40,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
# only keeps the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
randomness = dict(seed=0)
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
_base_ = [
'../_base_/datasets/imagenet_bs512_mocov3.py',
'../_base_/default_runtime.py',
]
# dataset settings
# the difference between ResNet50 and ViT pipeline is the `scale` in
# `RandomResizedCrop`, `scale=(0.08, 1.)` in ViT pipeline
view_pipeline1 = [
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.08, 1.),
backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(
type='GaussianBlur',
magnitude_range=(0.1, 2.0),
magnitude_std='inf',
prob=1.),
dict(type='Solarize', thr=128, prob=0.),
dict(type='RandomFlip', prob=0.5),
]
view_pipeline2 = [
dict(
type='RandomResizedCrop',
scale=224,
crop_ratio_range=(0.08, 1.),
backend='pillow'),
dict(
type='RandomApply',
transforms=[
dict(
type='ColorJitter',
brightness=0.4,
contrast=0.4,
saturation=0.2,
hue=0.1)
],
prob=0.8),
dict(
type='RandomGrayscale',
prob=0.2,
keep_channels=True,
channel_weights=(0.114, 0.587, 0.2989)),
dict(
type='GaussianBlur',
magnitude_range=(0.1, 2.0),
magnitude_std='inf',
prob=0.1),
dict(type='Solarize', thr=128, prob=0.2),
dict(type='RandomFlip', prob=0.5),
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiView',
num_views=[1, 1],
transforms=[view_pipeline1, view_pipeline2]),
dict(type='PackInputs')
]
train_dataloader = dict(batch_size=256, dataset=dict(pipeline=train_pipeline))
# model settings
temperature = 0.2
model = dict(
type='MoCoV3',
base_momentum=0.01,
backbone=dict(
type='MoCoV3ViT',
arch='mocov3-small', # embed_dim = 384
img_size=224,
patch_size=16,
stop_grad_conv1=True),
neck=dict(
type='NonLinearNeck',
in_channels=384,
hid_channels=4096,
out_channels=256,
num_layers=3,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
head=dict(
type='MoCoV3Head',
predictor=dict(
type='NonLinearNeck',
in_channels=256,
hid_channels=4096,
out_channels=256,
num_layers=2,
with_bias=False,
with_last_bn=True,
with_last_bn_affine=False,
with_last_bias=False,
with_avg_pool=False),
loss=dict(type='CrossEntropyLoss', loss_weight=2 * temperature),
temperature=temperature))
# optimizer
optim_wrapper = dict(
type='AmpOptimWrapper',
loss_scale='dynamic',
optimizer=dict(type='AdamW', lr=2.4e-3, weight_decay=0.1))
find_unused_parameters = True
# learning rate scheduler
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-4,
by_epoch=True,
begin=0,
end=40,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=260,
by_epoch=True,
begin=40,
end=300,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300)
# only keeps the latest 3 checkpoints
default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=4096)
# MViT V2
> [MViTv2: Improved Multiscale Vision Transformers for Classification and Detection](http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf)
<!-- [ALGORITHM] -->
## Abstract
In this paper, we study Multiscale Vision Transformers (MViTv2) as a unified architecture for image and video
classification, as well as object detection. We present an improved version of MViT that incorporates
decomposed relative positional embeddings and residual pooling connections. We instantiate this architecture
in five sizes and evaluate it for ImageNet classification, COCO detection and Kinetics video recognition where
it outperforms prior work. We further compare MViTv2s' pooling attention to window attention mechanisms where
it outperforms the latter in accuracy/compute. Without bells-and-whistles, MViTv2 has state-of-the-art
performance in 3 domains: 88.8% accuracy on ImageNet classification, 58.7 boxAP on COCO object detection as
well as 86.1% on Kinetics-400 video classification.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/180376227-755243fa-158e-4068-940a-416036519665.png" width="50%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('mvitv2-tiny_3rdparty_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('mvitv2-tiny_3rdparty_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/mvit/mvitv2-tiny_8xb256_in1k.py https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :----------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :-----------------------------------: | :----------------------------------------------------------------------------------: |
| `mvitv2-tiny_3rdparty_in1k`\* | From scratch | 24.17 | 4.70 | 82.33 | 96.15 | [config](mvitv2-tiny_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth) |
| `mvitv2-small_3rdparty_in1k`\* | From scratch | 34.87 | 7.00 | 83.63 | 96.51 | [config](mvitv2-small_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-small_3rdparty_in1k_20220722-986bd741.pth) |
| `mvitv2-base_3rdparty_in1k`\* | From scratch | 51.47 | 10.16 | 84.34 | 96.86 | [config](mvitv2-base_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-base_3rdparty_in1k_20220722-9c4f0a17.pth) |
| `mvitv2-large_3rdparty_in1k`\* | From scratch | 217.99 | 43.87 | 85.25 | 97.14 | [config](mvitv2-large_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-large_3rdparty_in1k_20220722-2b57b983.pth) |
*Models with * are converted from the [official repo](https://github.com/facebookresearch/mvit). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@inproceedings{li2021improved,
title={MViTv2: Improved multiscale vision transformers for classification and detection},
author={Li, Yanghao and Wu, Chao-Yuan and Fan, Haoqi and Mangalam, Karttikeya and Xiong, Bo and Malik, Jitendra and Feichtenhofer, Christoph},
booktitle={CVPR},
year={2022}
}
```
Collections:
- Name: MViT V2
Metadata:
Architecture:
- Attention Dropout
- Convolution
- Dense Connections
- GELU
- Layer Normalization
- Scaled Dot-Product Attention
- Attention Pooling
Paper:
URL: http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf
Title: 'MViTv2: Improved Multiscale Vision Transformers for Classification and Detection'
README: configs/mvit/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/v0.24.0/mmcls/models/backbones/mvit.py
Version: v0.24.0
Models:
- Name: mvitv2-tiny_3rdparty_in1k
In Collection: MViT V2
Metadata:
FLOPs: 4703510768
Parameters: 24173320
Training Data:
- ImageNet-1k
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 82.33
Top 5 Accuracy: 96.15
Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth
Converted From:
Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_T_in1k.pyth
Code: https://github.com/facebookresearch/mvit
Config: configs/mvit/mvitv2-tiny_8xb256_in1k.py
- Name: mvitv2-small_3rdparty_in1k
In Collection: MViT V2
Metadata:
FLOPs: 6997555136
Parameters: 34870216
Training Data:
- ImageNet-1k
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 83.63
Top 5 Accuracy: 96.51
Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-small_3rdparty_in1k_20220722-986bd741.pth
Converted From:
Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_S_in1k.pyth
Code: https://github.com/facebookresearch/mvit
Config: configs/mvit/mvitv2-small_8xb256_in1k.py
- Name: mvitv2-base_3rdparty_in1k
In Collection: MViT V2
Metadata:
FLOPs: 10157964400
Parameters: 51472744
Training Data:
- ImageNet-1k
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 84.34
Top 5 Accuracy: 96.86
Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-base_3rdparty_in1k_20220722-9c4f0a17.pth
Converted From:
Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_B_in1k.pyth
Code: https://github.com/facebookresearch/mvit
Config: configs/mvit/mvitv2-base_8xb256_in1k.py
- Name: mvitv2-large_3rdparty_in1k
In Collection: MViT V2
Metadata:
FLOPs: 43868151412
Parameters: 217992952
Training Data:
- ImageNet-1k
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 85.25
Top 5 Accuracy: 97.14
Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-large_3rdparty_in1k_20220722-2b57b983.pth
Converted From:
Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_L_in1k.pyth
Code: https://github.com/facebookresearch/mvit
Config: configs/mvit/mvitv2-large_8xb256_in1k.py
_base_ = [
'../_base_/models/mvit/mvitv2-base.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# dataset settings
train_dataloader = dict(batch_size=256)
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=2.5e-4),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.pos_embed': dict(decay_mult=0.0),
'.rel_pos_h': dict(decay_mult=0.0),
'.rel_pos_w': dict(decay_mult=0.0)
}),
clip_grad=dict(max_norm=1.0),
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=70,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=70)
]
# NOTE: `auto_scale_lr` is for automatically scaling LR,
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
_base_ = [
'../_base_/models/mvit/mvitv2-large.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
# dataset settings
train_dataloader = dict(batch_size=256)
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=2.5e-4),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.pos_embed': dict(decay_mult=0.0),
'.rel_pos_h': dict(decay_mult=0.0),
'.rel_pos_w': dict(decay_mult=0.0)
}),
clip_grad=dict(max_norm=1.0),
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=70,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=70)
]
# NOTE: `auto_scale_lr` is for automatically scaling LR,
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
_base_ = [
'../_base_/models/mvit/mvitv2-small.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
# dataset settings
train_dataloader = dict(batch_size=256)
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=2.5e-4),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.pos_embed': dict(decay_mult=0.0),
'.rel_pos_h': dict(decay_mult=0.0),
'.rel_pos_w': dict(decay_mult=0.0)
}),
clip_grad=dict(max_norm=1.0),
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=70,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=70)
]
# NOTE: `auto_scale_lr` is for automatically scaling LR,
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
_base_ = [
'../_base_/models/mvit/mvitv2-tiny.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs2048_AdamW.py',
'../_base_/default_runtime.py'
]
# dataset settings
train_dataloader = dict(batch_size=256)
val_dataloader = dict(batch_size=256)
test_dataloader = dict(batch_size=256)
# schedule settings
optim_wrapper = dict(
optimizer=dict(lr=2.5e-4),
paramwise_cfg=dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.pos_embed': dict(decay_mult=0.0),
'.rel_pos_h': dict(decay_mult=0.0),
'.rel_pos_w': dict(decay_mult=0.0)
}),
clip_grad=dict(max_norm=1.0),
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=70,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=70)
]
# NOTE: `auto_scale_lr` is for automatically scaling LR,
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
# OFA
> [OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework](https://arxiv.org/abs/2202.03052)
<!-- [ALGORITHM] -->
## Abstract
In this work, we pursue a unified paradigm for multimodal pretraining to break the scaffolds of complex task/modality-specific customization. We propose OFA, a Task-Agnostic and Modality-Agnostic framework that supports Task Comprehensiveness. OFA unifies a diverse set of cross-modal and unimodal tasks, including image generation, visual grounding, image captioning, image classification, language modeling, etc., in a simple sequence-to-sequence learning framework. OFA follows the instruction-based learning in both pretraining and finetuning stages, requiring no extra task-specific layers for downstream tasks. In comparison with the recent state-of-the-art vision & language models that rely on extremely large cross-modal datasets, OFA is pretrained on only 20M publicly available image-text pairs. Despite its simplicity and relatively small-scale training data, OFA achieves new SOTAs in a series of cross-modal tasks while attaining highly competitive performances on uni-modal tasks. Our further analysis indicates that OFA can also effectively transfer to unseen tasks and unseen domains.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/236164275-2429bf20-6e2a-4325-acc2-6117f9b53a53.png" width="80%"/>
</div>
## How to use it?
<!-- [TABS-BEGIN] -->
**Use the model**
```python
from mmpretrain import inference_model
result = inference_model('ofa-base_3rdparty-finetuned_caption', 'demo/cat-dog.png')
print(result)
# {'pred_caption': 'a dog and a kitten sitting next to each other'}
```
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/ofa/ofa-base_finetuned_refcoco.py https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_refcoco_20230418-2797d3ab.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Caption on COCO
| Model | Params (M) | BLEU-4 | CIDER | Config | Download |
| :-------------------------------------- | :--------: | :----: | :----: | :-------------------------------------: | :--------------------------------------------------------------------------------------------------: |
| `ofa-base_3rdparty-finetuned_caption`\* | 182.24 | 42.64 | 144.50 | [config](ofa-base_finetuned_caption.py) | [model](https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_coco-caption_20230418-de18914e.pth) |
*Models with * are converted from the [official repo](https://github.com/OFA-Sys/OFA). The config files of these models are only for inference. We haven't reproduced the training results.*
### Visual Grounding on RefCOCO
| Model | Params (M) | Accuracy (testA) | Accuracy (testB) | Config | Download |
| :-------------------------------------- | :--------: | :--------------: | :--------------: | :-------------------------------------: | :------------------------------------------------------------------------------: |
| `ofa-base_3rdparty-finetuned_refcoco`\* | 182.24 | 90.49 | 83.63 | [config](ofa-base_finetuned_refcoco.py) | [model](https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_refcoco_20230418-2797d3ab.pth) |
*Models with * are converted from the [official repo](https://github.com/OFA-Sys/OFA). The config files of these models are only for inference. We haven't reproduced the training results.*
### Visual Question Answering on VQAv2
| Model | Params (M) | Accuracy | Config | Download |
| :---------------------------------- | :--------: | :------: | :---------------------------------: | :--------------------------------------------------------------------------------------------------------------: |
| `ofa-base_3rdparty-finetuned_vqa`\* | 182.24 | 78.00 | [config](ofa-base_finetuned_vqa.py) | [model](https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_coco-vqa_20230418-f38539a5.pth) |
| `ofa-base_3rdparty-zeroshot_vqa`\* | 182.24 | 58.32 | [config](ofa-base_zeroshot_vqa.py) | [model](https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_pretrain_20230418-dccfc07f.pth) |
*Models with * are converted from the [official repo](https://github.com/OFA-Sys/OFA). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@article{wang2022ofa,
author = {Peng Wang and
An Yang and
Rui Men and
Junyang Lin and
Shuai Bai and
Zhikang Li and
Jianxin Ma and
Chang Zhou and
Jingren Zhou and
Hongxia Yang},
title = {OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence
Learning Framework},
journal = {CoRR},
volume = {abs/2202.03052},
year = {2022}
}
```
Collections:
- Name: OFA
Metadata:
Architecture:
- ResNet
- Transformer
Training Data:
- CC12M
- CC3M
- SBU
- COCO
- VG
- VQAv2
- GQA
- RefCOCO
- OpenImages
- Object365
- YFCC100M
- ImageNet-21K
- Pile
Paper:
Title: 'OFA: Unifying Architectures, Tasks, and Modalities Through a Simple
Sequence-to-Sequence Learning Framework'
URL: https://arxiv.org/abs/2202.03052
README: configs/ofa/README.md
Models:
- Name: ofa-base_3rdparty-finetuned_refcoco
Metadata:
FLOPs: null
Parameters: 182238536
In Collection: OFA
Results:
- Task: Visual Grounding
Dataset: RefCOCO
Metrics:
Accuracy (testA): 90.49
Accuracy (testB): 83.63
Weights: https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_refcoco_20230418-2797d3ab.pth
Config: configs/ofa/ofa-base_finetuned_refcoco.py
Converted From:
Weights: https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/refcoco_base_best.pt
Code: https://github.com/OFA-Sys/OFA
- Name: ofa-base_3rdparty-finetuned_vqa
Metadata:
FLOPs: null
Parameters: 182238536
In Collection: OFA
Results:
- Task: Visual Question Answering
Dataset: VQAv2
Metrics:
Accuracy: 78.00 # Report from the official repo
Weights: https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_coco-vqa_20230418-f38539a5.pth
Config: configs/ofa/ofa-base_finetuned_vqa.py
Converted From:
Weights: https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/vqa_large_best.pt
Code: https://github.com/OFA-Sys/OFA
- Name: ofa-base_3rdparty-finetuned_caption
Metadata:
FLOPs: null
Parameters: 182238536
In Collection: OFA
Results:
- Task: Image Caption
Dataset: COCO
Metrics:
BLEU-4: 42.64
CIDER: 144.50
Weights: https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_coco-caption_20230418-de18914e.pth
Config: configs/ofa/ofa-base_finetuned_caption.py
Converted From:
Weights: https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/caption_base_best.pt
Code: https://github.com/OFA-Sys/OFA
- Name: ofa-base_3rdparty-zeroshot_vqa
Metadata:
FLOPs: null
Parameters: 182238536
In Collection: OFA
Results:
- Task: Visual Question Answering
Dataset: VQAv2
Metrics:
Accuracy: 58.32
Weights: https://download.openmmlab.com/mmclassification/v1/ofa/ofa-base_3rdparty_pretrain_20230418-dccfc07f.pth
Config: configs/ofa/ofa-base_zeroshot_vqa.py
Converted From:
Weights: https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/ofa_base.pt
Code: https://github.com/OFA-Sys/OFA