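# configs/convnext_v2/convnext-v2-large_32xb32_in1k.py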
_base_ = [
'../_base_/models/convnext_v2/large.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=2.5e-3),
clip_grad=None,
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=20,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=20)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]
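# configs/convnext_v2/convnext-v2-nano_32xb32_in1k-384px.py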
_base_ = [
'../_base_/models/convnext_v2/nano.py',
'../_base_/datasets/imagenet_bs64_swin_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=8e-4, weight_decay=0.3),
clip_grad=None,
)
# learning policy
param_scheduler = [dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True)]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]
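# configs/convnext_v2/convnext-v2-nano_32xb32_in1k.py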
_base_ = [
'../_base_/models/convnext_v2/nano.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=8e-4, weight_decay=0.3),
clip_grad=None,
)
# learning policy
param_scheduler = [dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True)]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]
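# configs/convnext_v2/convnext-v2-pico_32xb32_in1k.py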
_base_ = [
'../_base_/models/convnext_v2/pico.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=8e-4, weight_decay=0.3),
clip_grad=None,
)
# learning policy
param_scheduler = [dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True)]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]
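# configs/convnext_v2/convnext-v2-tiny_32xb32_in1k-384px.py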
_base_ = [
'../_base_/models/convnext_v2/tiny.py',
'../_base_/datasets/imagenet_bs64_swin_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=3.2e-3),
clip_grad=None,
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=40,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=40)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=300, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]
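# configs/convnext_v2/convnext-v2-tiny_32xb32_in1k.py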
_base_ = [
'../_base_/models/convnext_v2/tiny.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=3.2e-3),
clip_grad=None,
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=40,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=40)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=300, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]
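# Model index (metafile) for the ConvNeXt V2 configs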
Collections:
- Name: ConvNeXt V2
Metadata:
Architecture:
- Global Response Normalization
Paper:
Title: Co-designing and Scaling ConvNets with Masked Autoencoders
URL: http://arxiv.org/abs/2301.00808
README: configs/convnext_v2/README.md
Models:
- Name: convnext-v2-atto_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 551718080
Parameters: 3708400
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-atto_3rdparty-fcmae_in1k_20230104-07514db4.pth
Config: configs/convnext_v2/convnext-v2-atto_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_atto_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-atto_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 551718080
Parameters: 3708400
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 76.64
Top 5 Accuracy: 93.04
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-atto_fcmae-pre_3rdparty_in1k_20230104-23765f83.pth
Config: configs/convnext_v2/convnext-v2-atto_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_atto_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-femto_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 784892544
Parameters: 5233240
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-femto_3rdparty-fcmae_in1k_20230104-adbe2082.pth
Config: configs/convnext_v2/convnext-v2-femto_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_femto_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-femto_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 784892544
Parameters: 5233240
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 78.48
Top 5 Accuracy: 93.98
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-femto_fcmae-pre_3rdparty_in1k_20230104-92a75d75.pth
Config: configs/convnext_v2/convnext-v2-femto_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_femto_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-pico_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 1374072320
Parameters: 9066280
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-pico_3rdparty-fcmae_in1k_20230104-147b1b59.pth
Config: configs/convnext_v2/convnext-v2-pico_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_pico_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-pico_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 1374072320
Parameters: 9066280
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 80.31
Top 5 Accuracy: 95.08
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-pico_fcmae-pre_3rdparty_in1k_20230104-d20263ca.pth
Config: configs/convnext_v2/convnext-v2-pico_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_pico_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-nano_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 2454926720
Parameters: 15623800
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-nano_3rdparty-fcmae_in1k_20230104-3dd1f29e.pth
Config: configs/convnext_v2/convnext-v2-nano_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_nano_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-nano_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 2454926720
Parameters: 15623800
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.86
Top 5 Accuracy: 95.75
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-nano_fcmae-pre_3rdparty_in1k_20230104-fe1aaaf2.pth
Config: configs/convnext_v2/convnext-v2-nano_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_nano_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-nano_fcmae-in21k-pre_3rdparty_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 2454926720
Parameters: 15623800
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.04
Top 5 Accuracy: 96.16
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-nano_fcmae-in21k-pre_3rdparty_in1k_20230104-91fa8ae2.pth
Config: configs/convnext_v2/convnext-v2-nano_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-tiny_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 4469631744
Parameters: 28635496
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-tiny_3rdparty-fcmae_in1k_20230104-80513adc.pth
Config: configs/convnext_v2/convnext-v2-tiny_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_tiny_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-tiny_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 4469631744
Parameters: 28635496
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.94
Top 5 Accuracy: 96.29
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-tiny_fcmae-pre_3rdparty_in1k_20230104-471a86de.pth
Config: configs/convnext_v2/convnext-v2-tiny_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_tiny_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-tiny_fcmae-in21k-pre_3rdparty_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 4469631744
Parameters: 28635496
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.89
Top 5 Accuracy: 96.96
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-tiny_fcmae-in21k-pre_3rdparty_in1k_20230104-8cc8b8f2.pth
Config: configs/convnext_v2/convnext-v2-tiny_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-nano_fcmae-in21k-pre_3rdparty_in1k-384px
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 7214472320
Parameters: 15623800
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.36
Top 5 Accuracy: 96.75
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-nano_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-f951ae87.pth
Config: configs/convnext_v2/convnext-v2-nano_32xb32_in1k-384px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_384_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-tiny_fcmae-in21k-pre_3rdparty_in1k-384px
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 13135236864
Parameters: 28635496
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.09
Top 5 Accuracy: 97.63
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-tiny_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-d8579f84.pth
Config: configs/convnext_v2/convnext-v2-tiny_32xb32_in1k-384px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_384_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-base_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 15382561792
Parameters: 88717800
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_3rdparty-fcmae_in1k_20230104-8a798eaf.pth
Config: configs/convnext_v2/convnext-v2-base_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_base_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-base_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 15382561792
Parameters: 88717800
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 84.87
Top 5 Accuracy: 97.08
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_fcmae-pre_3rdparty_in1k_20230104-00a70fa4.pth
Config: configs/convnext_v2/convnext-v2-base_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_base_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-base_fcmae-in21k-pre_3rdparty_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 15382561792
Parameters: 88717800
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 86.74
Top 5 Accuracy: 98.02
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_fcmae-in21k-pre_3rdparty_in1k_20230104-c48d16a5.pth
Config: configs/convnext_v2/convnext-v2-base_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-large_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 34403182080
Parameters: 197956840
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-large_3rdparty-fcmae_in1k_20230104-bf38df92.pth
Config: configs/convnext_v2/convnext-v2-large_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_large_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-large_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 34403182080
Parameters: 197956840
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.76
Top 5 Accuracy: 97.59
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-large_fcmae-pre_3rdparty_in1k_20230104-ef393013.pth
Config: configs/convnext_v2/convnext-v2-large_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_large_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-large_fcmae-in21k-pre_3rdparty_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 34403182080
Parameters: 197956840
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 87.26
Top 5 Accuracy: 98.24
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-large_fcmae-in21k-pre_3rdparty_in1k_20230104-d9c4dc0c.pth
Config: configs/convnext_v2/convnext-v2-large_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-base_fcmae-in21k-pre_3rdparty_in1k-384px
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 45205885952
Parameters: 88717800
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 87.63
Top 5 Accuracy: 98.42
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-379425cc.pth
Config: configs/convnext_v2/convnext-v2-base_32xb32_in1k-384px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_384_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-large_fcmae-in21k-pre_3rdparty_in1k-384px
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 101103214080
Parameters: 197956840
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 88.18
Top 5 Accuracy: 98.52
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-large_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-9139a1f3.pth
Config: configs/convnext_v2/convnext-v2-large_32xb32_in1k-384px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_384_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-huge_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 114998639360
Parameters: 660289640
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-huge_3rdparty-fcmae_in1k_20230104-fe43ae6c.pth
Config: configs/convnext_v2/convnext-v2-huge_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_huge_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-huge_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 114998639360
Parameters: 660289640
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 86.25
Top 5 Accuracy: 97.75
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-huge_fcmae-pre_3rdparty_in1k_20230104-f795e5b8.pth
Config: configs/convnext_v2/convnext-v2-huge_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_huge_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-huge_fcmae-in21k-pre_3rdparty_in1k-384px
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 337955157760
Parameters: 660289640
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 88.68
Top 5 Accuracy: 98.73
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-huge_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-02a4eb35.pth
Config: configs/convnext_v2/convnext-v2-huge_32xb32_in1k-384px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_384_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-huge_fcmae-in21k-pre_3rdparty_in1k-512px
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 600809158400
Parameters: 660289640
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 88.86
Top 5 Accuracy: 98.74
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-huge_fcmae-in21k-pre_3rdparty_in1k-512px_20230104-ce32e63c.pth
Config: configs/convnext_v2/convnext-v2-huge_32xb32_in1k-512px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_512_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
# CSPNet
> [CSPNet: A New Backbone that can Enhance Learning Capability of CNN](https://arxiv.org/abs/1911.11929)
<!-- [ALGORITHM] -->
## Abstract
Neural networks have enabled state-of-the-art approaches to achieve incredible results on computer vision tasks such as object detection. However, such success greatly relies on costly computation resources, which hinders people with cheap devices from appreciating the advanced technology. In this paper, we propose Cross Stage Partial Network (CSPNet) to mitigate the problem that previous works require heavy inference computations from the network architecture perspective. We attribute the problem to the duplicate gradient information within network optimization. The proposed networks respect the variability of the gradients by integrating feature maps from the beginning and the end of a network stage, which, in our experiments, reduces computations by 20% with equivalent or even superior accuracy on the ImageNet dataset, and significantly outperforms state-of-the-art approaches in terms of AP50 on the MS COCO object detection dataset. The CSPNet is easy to implement and general enough to cope with architectures based on ResNet, ResNeXt, and DenseNet. Source code is at this https URL.
<div align=center>
<img src="https://user-images.githubusercontent.com/18586273/159420842-6147c687-a488-460c-8bb2-4ea5276c26c7.png" width="60%"/>
</div>
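The cross-stage design above can be summarized in a few lines: split a stage's feature map into two parts, send only one part through the stage's blocks, and merge the two paths with a transition layer at the end of the stage. The following is a minimal PyTorch sketch of that idea with a generic convolutional block; it is illustrative only and does not mirror the exact `CSPDarkNet`/`CSPResNet` backbones used in the configs below.

```python
import torch
import torch.nn as nn


class CSPStage(nn.Module):
    """Conceptual cross-stage-partial stage (simplified sketch)."""

    def __init__(self, channels: int, num_blocks: int):
        super().__init__()
        half = channels // 2
        # stand-in for the stage's usual residual/dense blocks
        self.blocks = nn.Sequential(*[
            nn.Sequential(
                nn.Conv2d(half, half, 3, padding=1, bias=False),
                nn.BatchNorm2d(half),
                nn.ReLU(inplace=True),
            ) for _ in range(num_blocks)
        ])
        # transition conv that fuses the two gradient paths
        self.transition = nn.Conv2d(channels, channels, 1, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # split the feature map: only `part2` goes through the blocks,
        # `part1` is routed directly to the end of the stage
        part1, part2 = x.chunk(2, dim=1)
        part2 = self.blocks(part2)
        return self.transition(torch.cat([part1, part2], dim=1))


feats = CSPStage(channels=64, num_blocks=2)(torch.rand(1, 64, 56, 56))
print(feats.shape)  # torch.Size([1, 64, 56, 56])
```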
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('cspdarknet50_3rdparty_8xb32_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('cspdarknet50_3rdparty_8xb32_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/cspnet/cspdarknet50_8xb32_in1k.py https://download.openmmlab.com/mmclassification/v0/cspnet/cspdarknet50_3rdparty_8xb32_in1k_20220329-bd275287.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :----------------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :----------------------------------: | :-----------------------------------------------------------------------------: |
| `cspdarknet50_3rdparty_8xb32_in1k`\* | From scratch | 27.64 | 5.04 | 80.05 | 95.07 | [config](cspdarknet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspdarknet50_3rdparty_8xb32_in1k_20220329-bd275287.pth) |
| `cspresnet50_3rdparty_8xb32_in1k`\* | From scratch | 21.62 | 3.48 | 79.55 | 94.68 | [config](cspresnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnet50_3rdparty_8xb32_in1k_20220329-dd6dddfb.pth) |
| `cspresnext50_3rdparty_8xb32_in1k`\* | From scratch | 20.57 | 3.11 | 79.96 | 94.96 | [config](cspresnext50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnext50_3rdparty_8xb32_in1k_20220329-2cc84d21.pth) |
*Models with * are converted from the [official repo](https://github.com/rwightman/pytorch-image-models). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@inproceedings{wang2020cspnet,
title={CSPNet: A new backbone that can enhance learning capability of CNN},
author={Wang, Chien-Yao and Liao, Hong-Yuan Mark and Wu, Yueh-Hua and Chen, Ping-Yang and Hsieh, Jun-Wei and Yeh, I-Hau},
booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops},
pages={390--391},
year={2020}
}
```
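# configs/cspnet/cspdarknet50_8xb32_in1k.py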
_base_ = [
'../_base_/datasets/imagenet_bs32.py',
'../_base_/schedules/imagenet_bs256.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(type='CSPDarkNet', depth=53),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=1024,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# dataset settings
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(type='PackInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=288,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=256),
dict(type='PackInputs'),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
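# configs/cspnet/cspresnet50_8xb32_in1k.py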
_base_ = [
'../_base_/datasets/imagenet_bs32.py',
'../_base_/schedules/imagenet_bs256.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(type='CSPResNet', depth=50),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=1024,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# dataset settings
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(type='PackInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=288,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=256),
dict(type='PackInputs'),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
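# configs/cspnet/cspresnext50_8xb32_in1k.py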
_base_ = [
'../_base_/datasets/imagenet_bs32.py',
'../_base_/schedules/imagenet_bs256.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(type='CSPResNeXt', depth=50),
neck=dict(type='GlobalAveragePooling'),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=2048,
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
))
# dataset settings
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=224,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(type='PackInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='ResizeEdge',
scale=256,
edge='short',
backend='pillow',
interpolation='bicubic'),
dict(type='CenterCrop', crop_size=224),
dict(type='PackInputs'),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
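# Model index (metafile) for the CSPNet configs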
Collections:
- Name: CSPNet
Metadata:
Training Data: ImageNet-1k
Architecture:
- Cross Stage Partial Stage
Paper:
URL: https://arxiv.org/abs/1911.11929
Title: 'CSPNet: A New Backbone that can Enhance Learning Capability of CNN'
README: configs/cspnet/README.md
Code:
Version: v0.22.0
URL: https://github.com/open-mmlab/mmpretrain/blob/v0.22.0/mmcls/models/backbones/cspnet.py
Models:
- Name: cspdarknet50_3rdparty_8xb32_in1k
Metadata:
FLOPs: 5040000000
Parameters: 27640000
In Collection: CSPNet
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 80.05
Top 5 Accuracy: 95.07
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/cspnet/cspdarknet50_3rdparty_8xb32_in1k_20220329-bd275287.pth
Config: configs/cspnet/cspdarknet50_8xb32_in1k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspdarknet53_ra_256-d05c7c21.pth
Code: https://github.com/rwightman/pytorch-image-models
- Name: cspresnet50_3rdparty_8xb32_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 3480000000
Parameters: 21620000
In Collection: CSPNet
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 79.55
Top 5 Accuracy: 94.68
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnet50_3rdparty_8xb32_in1k_20220329-dd6dddfb.pth
Config: configs/cspnet/cspresnet50_8xb32_in1k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspresnet50_ra-d3e8d487.pth
Code: https://github.com/rwightman/pytorch-image-models
- Name: cspresnext50_3rdparty_8xb32_in1k
Metadata:
FLOPs: 3110000000
Parameters: 20570000
In Collection: CSPNet
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 79.96
Top 5 Accuracy: 94.96
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/cspnet/cspresnext50_3rdparty_8xb32_in1k_20220329-2cc84d21.pth
Config: configs/cspnet/cspresnext50_8xb32_in1k.py
Converted From:
Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspresnext50_ra_224-648b4713.pth
Code: https://github.com/rwightman/pytorch-image-models
# CSRA
> [Residual Attention: A Simple but Effective Method for Multi-Label Recognition](https://arxiv.org/abs/2108.02456)
<!-- [ALGORITHM] -->
## Abstract
Multi-label image recognition is a challenging computer vision task of practical use. Progresses in this area, however, are often characterized by complicated methods, heavy computations, and lack of intuitive explanations. To effectively capture different spatial regions occupied by objects from different categories, we propose an embarrassingly simple module, named class-specific residual attention (CSRA). CSRA generates class-specific features for every category by proposing a simple spatial attention score, and then combines it with the class-agnostic average pooling feature. CSRA achieves state-of-the-art results on multilabel recognition, and at the same time is much simpler than them. Furthermore, with only 4 lines of code, CSRA also leads to consistent improvement across many diverse pretrained models and datasets without any extra training. CSRA is both easy to implement and light in computations, which also enjoys intuitive explanations and visualizations.
<div align=center>
<img src="https://user-images.githubusercontent.com/84259897/176982245-3ffcff56-a4ea-4474-9967-bc2b612bbaa3.png" width="80%"/>
</div>
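The residual attention above amounts to only a few lines of code: per-location classifier scores are average-pooled into a class-agnostic logit, a class-specific spatial softmax produces an attention-pooled logit, and the two are combined with a small residual weight. Below is a minimal single-head sketch assuming backbone features of shape `(B, C, H, W)`; the names `lam` and `T` follow the description above, not the exact `CSRAClsHead` API.

```python
import torch
import torch.nn as nn


class SimpleCSRAHead(nn.Module):
    """Minimal single-head sketch of class-specific residual attention."""

    def __init__(self, in_channels: int, num_classes: int,
                 lam: float = 0.1, T: float = 1.0):
        super().__init__()
        # per-location classifier scores, one score map per class
        self.classifier = nn.Conv2d(in_channels, num_classes, 1, bias=False)
        self.lam = lam
        self.T = T

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (B, C, H, W)
        score = self.classifier(x).flatten(2)        # (B, num_classes, H*W)
        base_logit = score.mean(dim=-1)              # class-agnostic average pooling
        attn = (self.T * score).softmax(dim=-1)      # class-specific spatial attention
        attn_logit = (score * attn).sum(dim=-1)      # attention-pooled class score
        return base_logit + self.lam * attn_logit    # residual combination


logits = SimpleCSRAHead(2048, 20)(torch.rand(2, 2048, 14, 14))
print(logits.shape)  # torch.Size([2, 20])
```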
## How to use it?
<!-- [TABS-BEGIN] -->
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('resnet101-csra_1xb16_voc07-448px', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Train/Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Train:
```shell
python tools/train.py configs/csra/resnet101-csra_1xb16_voc07-448px.py
```
Test:
```shell
python tools/test.py configs/csra/resnet101-csra_1xb16_voc07-448px.py https://download.openmmlab.com/mmclassification/v0/csra/resnet101-csra_1xb16_voc07-448px_20220722-29efb40a.pth
```
<!-- [TABS-END] -->
## Models and results
### Multi-Label Classification on PASCAL VOC 2007
| Model | Pretrain | Params (M) | Flops (G) | CF1 | OF1 | mAP | Config | Download |
| :--------------------------------- | :----------: | :--------: | :-------: | :---: | :---: | :---: | :-------------------------------------------: | :-------------------------------------------------------------------------: |
| `resnet101-csra_1xb16_voc07-448px` | From scratch | 23.55 | 4.12 | 89.16 | 90.80 | 94.98 | [config](resnet101-csra_1xb16_voc07-448px.py) | [model](https://download.openmmlab.com/mmclassification/v0/csra/resnet101-csra_1xb16_voc07-448px_20220722-29efb40a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/csra/resnet101-csra_1xb16_voc07-448px_20220722-29efb40a.json) |
## Citation
```bibtex
@misc{https://doi.org/10.48550/arxiv.2108.02456,
doi = {10.48550/ARXIV.2108.02456},
url = {https://arxiv.org/abs/2108.02456},
author = {Zhu, Ke and Wu, Jianxin},
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {Residual Attention: A Simple but Effective Method for Multi-Label Recognition},
publisher = {arXiv},
year = {2021},
copyright = {arXiv.org perpetual, non-exclusive license}
}
```
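# Model index (metafile) for the CSRA configs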
Collections:
- Name: CSRA
Metadata:
Training Data: PASCAL VOC 2007
Architecture:
- Class-specific Residual Attention
Paper:
URL: https://arxiv.org/abs/2108.02456
Title: 'Residual Attention: A Simple but Effective Method for Multi-Label Recognition'
README: configs/csra/README.md
Code:
Version: v0.24.0
URL: https://github.com/open-mmlab/mmpretrain/blob/v0.24.0/mmcls/models/heads/multi_label_csra_head.py
Models:
- Name: resnet101-csra_1xb16_voc07-448px
Metadata:
FLOPs: 4120000000
Parameters: 23550000
In Collection: CSRA
Results:
- Dataset: PASCAL VOC 2007
Metrics:
mAP: 94.98
OF1: 90.80
CF1: 89.16
Task: Multi-Label Classification
Weights: https://download.openmmlab.com/mmclassification/v0/csra/resnet101-csra_1xb16_voc07-448px_20220722-29efb40a.pth
Config: configs/csra/resnet101-csra_1xb16_voc07-448px.py
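# configs/csra/resnet101-csra_1xb16_voc07-448px.py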
_base_ = ['../_base_/datasets/voc_bs16.py', '../_base_/default_runtime.py']
# Pre-trained Checkpoint Path
checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_8xb32_in1k_20210831-539c63f8.pth' # noqa
# If you want to use the pre-trained ResNet101-CutMix weight from the
# original repo (https://github.com/Kevinz-code/CSRA), the script
# 'tools/model_converters/torchvision_to_mmpretrain.py' can help you convert
# the weight into mmpretrain format. The mAP would reach 95.5 with that
# weight. checkpoint = 'PATH/TO/PRE-TRAINED_WEIGHT'
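# A hypothetical example of the conversion (the converter script is assumed to
# take a source checkpoint path followed by a destination path):
#   python tools/model_converters/torchvision_to_mmpretrain.py \
#       PATH/TO/ORIGINAL_WEIGHT.pth PATH/TO/CONVERTED_WEIGHT.pth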
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(3, ),
style='pytorch',
init_cfg=dict(
type='Pretrained', checkpoint=checkpoint, prefix='backbone')),
neck=None,
head=dict(
type='CSRAClsHead',
num_classes=20,
in_channels=2048,
num_heads=1,
lam=0.1,
loss=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)))
# dataset setting
data_preprocessor = dict(
# RGB format normalization parameters
mean=[0, 0, 0],
std=[255, 255, 255])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='RandomResizedCrop', scale=448, crop_ratio_range=(0.7, 1.0)),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(type='PackInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='Resize', scale=448),
dict(
type='PackInputs',
# `gt_label_difficult` is needed for VOC evaluation
meta_keys=('sample_idx', 'img_path', 'ori_shape', 'img_shape',
'scale_factor', 'flip', 'flip_direction',
'gt_label_difficult')),
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader
# optimizer
# the lr of classifier.head is 10 * base_lr, which helps convergence.
optim_wrapper = dict(
optimizer=dict(type='SGD', lr=0.0002, momentum=0.9, weight_decay=0.0001),
paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10)}))
param_scheduler = [
dict(
type='LinearLR',
start_factor=1e-7,
by_epoch=True,
begin=0,
end=1,
convert_to_iter_based=True),
dict(type='StepLR', by_epoch=True, step_size=6, gamma=0.1)
]
train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=1)
val_cfg = dict()
test_cfg = dict()
# DaViT
> [DaViT: Dual Attention Vision Transformers](https://arxiv.org/abs/2204.03645v1)
<!-- [ALGORITHM] -->
## Abstract
In this work, we introduce Dual Attention Vision Transformers (DaViT), a simple yet effective vision transformer architecture that is able to capture global context while maintaining computational efficiency. We propose approaching the problem from an orthogonal angle: exploiting self-attention mechanisms with both "spatial tokens" and "channel tokens". With spatial tokens, the spatial dimension defines the token scope, and the channel dimension defines the token feature dimension. With channel tokens, we have the inverse: the channel dimension defines the token scope, and the spatial dimension defines the token feature dimension. We further group tokens along the sequence direction for both spatial and channel tokens to maintain the linear complexity of the entire model. We show that these two self-attentions complement each other: (i) since each channel token contains an abstract representation of the entire image, the channel attention naturally captures global interactions and representations by taking all spatial positions into account when computing attention scores between channels; (ii) the spatial attention refines the local representations by performing fine-grained interactions across spatial locations, which in turn helps the global information modeling in channel attention. Extensive experiments show our DaViT achieves state-of-the-art performance on four different tasks with efficient computations. Without extra data, DaViT-Tiny, DaViT-Small, and DaViT-Base achieve 82.8%, 84.2%, and 84.6% top-1 accuracy on ImageNet-1K with 28.3M, 49.7M, and 87.9M parameters, respectively. When we further scale up DaViT with 1.5B weakly supervised image and text pairs, DaViT-Giant reaches 90.4% top-1 accuracy on ImageNet-1K.
<div align=center>
<img src="https://user-images.githubusercontent.com/24734142/196125065-e232409b-f710-4729-b657-4e5f9158f2d1.png" width="90%"/>
</div>
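The channel-token attention described above can be sketched by transposing the usual token layout: channels (grouped to keep linear complexity) act as tokens, and the spatial dimension provides their features. The snippet below is a conceptual sketch with the projection layers omitted and a simple `N ** -0.5` scaling assumed; it is not the official DaViT module.

```python
import torch


def channel_group_attention(x: torch.Tensor, num_groups: int = 8) -> torch.Tensor:
    """Conceptual channel-token self-attention over (B, N, C) inputs."""
    B, N, C = x.shape
    g = C // num_groups
    # treat each channel inside a group as a token: (B, num_groups, g, N)
    tokens = x.transpose(1, 2).reshape(B, num_groups, g, N)
    q = k = v = tokens                               # projections omitted for brevity
    attn = (q * N ** -0.5) @ k.transpose(-1, -2)     # (B, num_groups, g, g)
    attn = attn.softmax(dim=-1)
    out = attn @ v                                   # (B, num_groups, g, N)
    return out.reshape(B, C, N).transpose(1, 2)      # back to (B, N, C)


y = channel_group_attention(torch.rand(2, 196, 64), num_groups=8)
print(y.shape)  # torch.Size([2, 196, 64])
```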
## How to use it?
<!-- [TABS-BEGIN] -->
**Predict image**
```python
from mmpretrain import inference_model
predict = inference_model('davit-tiny_3rdparty_in1k', 'demo/bird.JPEG')
print(predict['pred_class'])
print(predict['pred_score'])
```
**Use the model**
```python
import torch
from mmpretrain import get_model
model = get_model('davit-tiny_3rdparty_in1k', pretrained=True)
inputs = torch.rand(1, 3, 224, 224)
out = model(inputs)
print(type(out))
# To extract features.
feats = model.extract_feat(inputs)
print(type(feats))
```
**Test Command**
Prepare your dataset according to the [docs](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#prepare-dataset).
Test:
```shell
python tools/test.py configs/davit/davit-tiny_4xb256_in1k.py https://download.openmmlab.com/mmclassification/v0/davit/davit-tiny_3rdparty_in1k_20221116-700fdf7d.pth
```
<!-- [TABS-END] -->
## Models and results
### Image Classification on ImageNet-1k
| Model | Pretrain | Params (M) | Flops (G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :---------------------------- | :----------: | :--------: | :-------: | :-------: | :-------: | :----------------------------------: | :------------------------------------------------------------------------------------: |
| `davit-tiny_3rdparty_in1k`\* | From scratch | 28.36 | 4.54 | 82.24 | 96.13 | [config](davit-tiny_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/davit/davit-tiny_3rdparty_in1k_20221116-700fdf7d.pth) |
| `davit-small_3rdparty_in1k`\* | From scratch | 49.75 | 8.80 | 83.61 | 96.75 | [config](davit-small_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/davit/davit-small_3rdparty_in1k_20221116-51a849a6.pth) |
| `davit-base_3rdparty_in1k`\* | From scratch | 87.95 | 15.51 | 84.09 | 96.82 | [config](davit-base_4xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/davit/davit-base_3rdparty_in1k_20221116-19e0d956.pth) |
*Models with * are converted from the [official repo](https://github.com/dingmyu/davit/blob/main/mmdet/mmdet/models/backbones/davit.py#L355). The config files of these models are only for inference. We haven't reproduced the training results.*
## Citation
```bibtex
@inproceedings{ding2022davit,
title={DaViT: Dual Attention Vision Transformer},
author={Ding, Mingyu and Xiao, Bin and Codella, Noel and Luo, Ping and Wang, Jingdong and Yuan, Lu},
booktitle={ECCV},
year={2022},
}
```
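# configs/davit/davit-base_4xb256_in1k.py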
_base_ = [
'../_base_/models/davit/davit-base.py',
'../_base_/datasets/imagenet_bs256_davit_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# data settings
train_dataloader = dict(batch_size=256)
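# configs/davit/davit-small_4xb256_in1k.py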
_base_ = [
'../_base_/models/davit/davit-small.py',
'../_base_/datasets/imagenet_bs256_davit_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# data settings
train_dataloader = dict(batch_size=256)
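# configs/davit/davit-tiny_4xb256_in1k.py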
_base_ = [
'../_base_/models/davit/davit-tiny.py',
'../_base_/datasets/imagenet_bs256_davit_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py'
]
# data settings
train_dataloader = dict(batch_size=256)
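# Model index (metafile) for the DaViT configs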
Collections:
- Name: DaViT
Metadata:
Architecture:
- GELU
- Layer Normalization
- Multi-Head Attention
- Scaled Dot-Product Attention
Paper:
URL: https://arxiv.org/abs/2204.03645v1
Title: 'DaViT: Dual Attention Vision Transformers'
README: configs/davit/README.md
Code:
URL: https://github.com/open-mmlab/mmpretrain/blob/v1.0.0rc3/mmcls/models/backbones/davit.py
Version: v1.0.0rc3
Models:
- Name: davit-tiny_3rdparty_in1k
In Collection: DaViT
Metadata:
FLOPs: 4539698688
Parameters: 28360168
Training Data:
- ImageNet-1k
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 82.24
Top 5 Accuracy: 96.13
Weights: https://download.openmmlab.com/mmclassification/v0/davit/davit-tiny_3rdparty_in1k_20221116-700fdf7d.pth
Converted From:
Weights: https://drive.google.com/file/d/1RSpi3lxKaloOL5-or20HuG975tbPwxRZ/view?usp=sharing
Code: https://github.com/dingmyu/davit/blob/main/mmdet/mmdet/models/backbones/davit.py#L355
Config: configs/davit/davit-tiny_4xb256_in1k.py
- Name: davit-small_3rdparty_in1k
In Collection: DaViT
Metadata:
FLOPs: 8799942144
Parameters: 49745896
Training Data:
- ImageNet-1k
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 83.61
Top 5 Accuracy: 96.75
Weights: https://download.openmmlab.com/mmclassification/v0/davit/davit-small_3rdparty_in1k_20221116-51a849a6.pth
Converted From:
Weights: https://drive.google.com/file/d/1q976ruj45mt0RhO9oxhOo6EP_cmj4ahQ/view?usp=sharing
Code: https://github.com/dingmyu/davit/blob/main/mmdet/mmdet/models/backbones/davit.py#L355
Config: configs/davit/davit-small_4xb256_in1k.py
- Name: davit-base_3rdparty_in1k
In Collection: DaViT
Metadata:
FLOPs: 15509702656
Parameters: 87954408
Training Data:
- ImageNet-1k
Results:
- Dataset: ImageNet-1k
Task: Image Classification
Metrics:
Top 1 Accuracy: 84.09
Top 5 Accuracy: 96.82
Weights: https://download.openmmlab.com/mmclassification/v0/davit/davit-base_3rdparty_in1k_20221116-19e0d956.pth
Converted From:
Weights: https://drive.google.com/file/d/1u9sDBEueB-YFuLigvcwf4b2YyA4MIVsZ/view?usp=sharing
Code: https://github.com/dingmyu/davit/blob/main/mmdet/mmdet/models/backbones/davit.py#L355
Config: configs/davit/davit-base_4xb256_in1k.py