Commit 1baf0566 authored by limm's avatar limm
Browse files

add tests part

parent 495d9ed9
Pipeline #2800 canceled with stages
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmpretrain.models.backbones import ResNeXt
from mmpretrain.models.backbones.resnext import Bottleneck as BottleneckX
def test_bottleneck():
    """Sanity checks for the ResNeXt bottleneck block."""
    # An unsupported style keyword must be rejected.
    with pytest.raises(AssertionError):
        # Style must be in ['pytorch', 'caffe']
        BottleneckX(64, 64, groups=32, width_per_group=4, style='tensorflow')

    # Structure: the grouped 3x3 conv carries the stride and group count.
    block = BottleneckX(
        64, 256, groups=32, width_per_group=4, stride=2, style='pytorch')
    assert block.conv2.stride == (2, 2)
    assert block.conv2.groups == 32
    assert block.conv2.out_channels == 128

    # A stride-1 forward pass keeps the input shape.
    block = BottleneckX(64, 64, base_channels=16, groups=32, width_per_group=4)
    inputs = torch.randn(1, 64, 56, 56)
    outputs = block(inputs)
    assert outputs.shape == torch.Size([1, 64, 56, 56])
def test_resnext():
    """Sanity checks for the ResNeXt backbone."""
    # Unsupported depths raise a KeyError.
    with pytest.raises(KeyError):
        # ResNeXt depth should be in [50, 101, 152]
        ResNeXt(depth=18)

    # 32x4d configuration with all four stages requested.
    model = ResNeXt(
        depth=50, groups=32, width_per_group=4, out_indices=(0, 1, 2, 3))
    for m in model.modules():
        if isinstance(m, BottleneckX):
            assert m.conv2.groups == 32
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 4
    expected = [(1, 256, 56, 56), (1, 512, 28, 28), (1, 1024, 14, 14),
                (1, 2048, 7, 7)]
    for out, shape in zip(feat, expected):
        assert out.shape == torch.Size(shape)

    # Only the final stage requested.
    model = ResNeXt(depth=50, groups=32, width_per_group=4, out_indices=(3, ))
    for m in model.modules():
        if isinstance(m, BottleneckX):
            assert m.conv2.groups == 32
    model.init_weights()
    model.train()

    feat = model(torch.randn(1, 3, 224, 224))
    assert len(feat) == 1
    assert feat[0].shape == torch.Size([1, 2048, 7, 7])
# Copyright (c) OpenMMLab. All rights reserved.
import os
import tempfile
from copy import deepcopy
from unittest import TestCase
import torch
from mmengine.runner import load_checkpoint, save_checkpoint
from mmpretrain.models.backbones import RevVisionTransformer
from .utils import timm_resize_pos_embed
class TestRevVisionTransformer(TestCase):
    """Unit tests for the RevVisionTransformer backbone."""

    def setUp(self):
        # Base config shared by every test; per-case copies are mutated.
        self.cfg = dict(
            arch='b', img_size=224, patch_size=16, drop_path_rate=0.1)

    def test_structure(self):
        # An unknown arch string must be rejected.
        with self.assertRaisesRegex(AssertionError, 'not in default archs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = 'unknown'
            RevVisionTransformer(**cfg)

        # A custom arch dict missing required keys must be rejected.
        with self.assertRaisesRegex(AssertionError, 'Custom arch needs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = {
                'num_layers': 24,
                'num_heads': 16,
                'feedforward_channels': 4096
            }
            RevVisionTransformer(**cfg)

        # A complete custom arch is honoured field-by-field.
        cfg = deepcopy(self.cfg)
        cfg['arch'] = {
            'embed_dims': 128,
            'num_layers': 24,
            'num_heads': 16,
            'feedforward_channels': 1024
        }
        model = RevVisionTransformer(**cfg)
        self.assertEqual(model.embed_dims, 128)
        self.assertEqual(model.num_layers, 24)
        for layer in model.layers:
            self.assertEqual(layer.attn.num_heads, 16)
            self.assertEqual(layer.ffn.feedforward_channels, 1024)

        # Default 'b' arch: 12 layers with ViT-B dimensions.
        cfg = deepcopy(self.cfg)
        model = RevVisionTransformer(**cfg)
        self.assertEqual(len(model.layers), 12)
        dpr_inc = 0.1 / (12 - 1)
        dpr = 0
        for layer in model.layers:
            self.assertEqual(layer.attn.embed_dims, 768)
            self.assertEqual(layer.attn.num_heads, 12)
            self.assertEqual(layer.ffn.feedforward_channels, 3072)
            # self.assertAlmostEqual(layer.attn.out_drop.drop_prob, dpr)
            # self.assertAlmostEqual(layer.ffn.dropout_layer.drop_prob, dpr)
            dpr += dpr_inc

    def test_init_weights(self):
        # A custom init_cfg should be applied by init_weights().
        cfg = deepcopy(self.cfg)
        cfg['init_cfg'] = [
            dict(
                type='Kaiming',
                layer='Conv2d',
                mode='fan_in',
                nonlinearity='linear')
        ]
        model = RevVisionTransformer(**cfg)
        ori_weight = model.patch_embed.projection.weight.clone().detach()
        # The pos_embed is all zero before initialization.
        self.assertTrue(torch.allclose(model.pos_embed, torch.tensor(0.)))

        model.init_weights()
        initialized_weight = model.patch_embed.projection.weight
        self.assertFalse(torch.allclose(ori_weight, initialized_weight))
        self.assertFalse(torch.allclose(model.pos_embed, torch.tensor(0.)))

        # Round-trip through a checkpoint preserves pos_embed.
        pretrain_pos_embed = model.pos_embed.clone().detach()
        tmpdir = tempfile.gettempdir()
        checkpoint = os.path.join(tmpdir, 'test.pth')
        save_checkpoint(model.state_dict(), checkpoint)
        cfg = deepcopy(self.cfg)
        model = RevVisionTransformer(**cfg)
        load_checkpoint(model, checkpoint, strict=True)
        self.assertTrue(torch.allclose(model.pos_embed, pretrain_pos_embed))

        # Loading into a different img_size resizes pos_embed.
        cfg = deepcopy(self.cfg)
        cfg['img_size'] = 384
        model = RevVisionTransformer(**cfg)
        load_checkpoint(model, checkpoint, strict=True)
        resized_pos_embed = timm_resize_pos_embed(
            pretrain_pos_embed, model.pos_embed, num_tokens=0)
        self.assertTrue(torch.allclose(model.pos_embed, resized_pos_embed))

        os.remove(checkpoint)

    def test_forward(self):
        imgs = torch.randn(1, 3, 224, 224)

        # avg_featmap output without a cls token.
        cfg = deepcopy(self.cfg)
        cfg['with_cls_token'] = False
        cfg['out_type'] = 'avg_featmap'
        model = RevVisionTransformer(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        patch_token = outs[-1]
        # The reversible ViT concatenates two streams -> doubled channels.
        self.assertEqual(patch_token.shape, (1, 768 * 2))

        # Forward with dynamic input sizes.
        imgs1 = torch.randn(1, 3, 224, 224)
        imgs2 = torch.randn(1, 3, 256, 256)
        imgs3 = torch.randn(1, 3, 256, 309)
        cfg = deepcopy(self.cfg)
        model = RevVisionTransformer(**cfg)
        for imgs in [imgs1, imgs2, imgs3]:
            outs = model(imgs)
            self.assertIsInstance(outs, tuple)
            self.assertEqual(len(outs), 1)
            avg_featmap = outs[-1]
            self.assertEqual(avg_featmap.shape, (1, 768 * 2))
# Copyright (c) OpenMMLab. All rights reserved.
from copy import deepcopy
from unittest import TestCase
import torch
import torch.nn as nn
from mmpretrain.models.backbones import RIFormer
from mmpretrain.models.backbones.riformer import RIFormerBlock
class TestRIFormer(TestCase):
    """Unit tests for the RIFormer backbone."""

    def setUp(self):
        # Base config shared by every test; per-case copies are mutated.
        arch = 's12'
        self.cfg = dict(arch=arch, drop_path_rate=0.1)
        self.arch = RIFormer.arch_settings[arch]

    def test_arch(self):
        # Test invalid default arch.
        with self.assertRaisesRegex(AssertionError, 'Unavailable arch'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = 'unknown'
            RIFormer(**cfg)

        # Test invalid custom arch (missing the required 'layers' key).
        with self.assertRaisesRegex(AssertionError, 'must have "layers"'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = {
                'embed_dims': 96,
                'num_heads': [3, 6, 12, 16],
            }
            RIFormer(**cfg)

        # Test a complete custom arch, checked stage-by-stage.
        cfg = deepcopy(self.cfg)
        layers = [2, 2, 4, 2]
        embed_dims = [6, 12, 6, 12]
        mlp_ratios = [2, 3, 4, 4]
        layer_scale_init_value = 1e-4
        cfg['arch'] = dict(
            layers=layers,
            embed_dims=embed_dims,
            mlp_ratios=mlp_ratios,
            layer_scale_init_value=layer_scale_init_value,
        )
        model = RIFormer(**cfg)
        for i, stage in enumerate(model.network):
            if not isinstance(stage, RIFormerBlock):
                continue
            self.assertEqual(len(stage), layers[i])
            self.assertEqual(stage[0].mlp.fc1.in_channels, embed_dims[i])
            self.assertEqual(stage[0].mlp.fc1.out_channels,
                             embed_dims[i] * mlp_ratios[i])
            self.assertTrue(
                torch.allclose(stage[0].layer_scale_1,
                               torch.tensor(layer_scale_init_value)))
            self.assertTrue(
                torch.allclose(stage[0].layer_scale_2,
                               torch.tensor(layer_scale_init_value)))

    def test_init_weights(self):
        # A custom init_cfg should be applied by init_weights().
        cfg = deepcopy(self.cfg)
        cfg['init_cfg'] = [
            dict(
                type='Kaiming',
                layer='Conv2d',
                mode='fan_in',
                nonlinearity='linear')
        ]
        model = RIFormer(**cfg)
        ori_weight = model.patch_embed.proj.weight.clone().detach()

        model.init_weights()
        initialized_weight = model.patch_embed.proj.weight
        self.assertFalse(torch.allclose(ori_weight, initialized_weight))

    def test_forward(self):
        imgs = torch.randn(1, 3, 224, 224)

        # Default: only the final stage feature map is returned.
        cfg = deepcopy(self.cfg)
        model = RIFormer(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        feat = outs[-1]
        self.assertEqual(feat.shape, (1, 512, 7, 7))

        # Test multiple output indices.
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = (0, 2, 4, 6)
        model = RIFormer(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 4)
        for dim, stride, out in zip(self.arch['embed_dims'], [1, 2, 4, 8],
                                    outs):
            self.assertEqual(out.shape, (1, dim, 56 // stride, 56 // stride))

    def test_repameterization(self):
        # Compare eval outputs of "train" mode and "deploy" mode: the
        # re-parameterized model must reproduce the original predictions.
        imgs = torch.randn(1, 3, 224, 224)
        gap = nn.AdaptiveAvgPool2d(output_size=(1))
        fc = nn.Linear(self.arch['embed_dims'][3], 10)

        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = (0, 2, 4, 6)
        model = RIFormer(**cfg)
        model.eval()
        feats = model(imgs)
        self.assertIsInstance(feats, tuple)
        feat = feats[-1]
        pred = fc(gap(feat).flatten(1))

        model.switch_to_deploy()
        for m in model.modules():
            if isinstance(m, RIFormerBlock):
                assert m.deploy is True
        feats_deploy = model(imgs)
        pred_deploy = fc(gap(feats_deploy[-1]).flatten(1))
        # BUG FIX: previously the torch.allclose results were computed but
        # discarded, so equivalence was never actually verified. Assert
        # them (with a small tolerance for re-parameterization round-off).
        for i in range(4):
            self.assertTrue(
                torch.allclose(feats[i], feats_deploy[i], atol=1e-5))
        self.assertTrue(torch.allclose(pred, pred_deploy, atol=1e-5))

    def test_structure(self):
        # Test drop_path_rate decay across blocks.
        cfg = deepcopy(self.cfg)
        cfg['drop_path_rate'] = 0.2
        model = RIFormer(**cfg)
        layers = self.arch['layers']
        for i, block in enumerate(model.network):
            expect_prob = 0.2 / (sum(layers) - 1) * i
            if hasattr(block, 'drop_path'):
                if expect_prob == 0:
                    self.assertIsInstance(block.drop_path, torch.nn.Identity)
                else:
                    self.assertAlmostEqual(block.drop_path.drop_prob,
                                           expect_prob)

        # Test with the first stage frozen.
        cfg = deepcopy(self.cfg)
        frozen_stages = 1
        cfg['frozen_stages'] = frozen_stages
        cfg['out_indices'] = (0, 2, 4, 6)
        model = RIFormer(**cfg)
        model.init_weights()
        model.train()

        # The patch_embed and first stage should not require grad.
        self.assertFalse(model.patch_embed.training)
        for param in model.patch_embed.parameters():
            self.assertFalse(param.requires_grad)
        for i in range(frozen_stages):
            module = model.network[i]
            for param in module.parameters():
                self.assertFalse(param.requires_grad)
        for param in model.norm0.parameters():
            self.assertFalse(param.requires_grad)

        # The later stages should require grad.
        for i in range(frozen_stages + 1, 7):
            module = model.network[i]
            for param in module.parameters():
                self.assertTrue(param.requires_grad)
            if hasattr(model, f'norm{i}'):
                norm = getattr(model, f'norm{i}')
                for param in norm.parameters():
                    self.assertTrue(param.requires_grad)
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from torch.nn.modules import AvgPool2d
from torch.nn.modules.batchnorm import _BatchNorm
from mmpretrain.models.backbones import SEResNet
from mmpretrain.models.backbones.resnet import ResLayer
from mmpretrain.models.backbones.seresnet import SEBottleneck, SELayer
def all_zeros(modules):
    """Return True if the module's weight (and bias, if present) is all zero."""
    zero_weight = torch.equal(modules.weight.data,
                              torch.zeros_like(modules.weight.data))
    zero_bias = True
    if hasattr(modules, 'bias'):
        zero_bias = torch.equal(modules.bias.data,
                                torch.zeros_like(modules.bias.data))
    return zero_weight and zero_bias
def check_norm_state(modules, train_state):
    """Check if norm layer is in correct train state."""
    return all(mod.training == train_state for mod in modules
               if isinstance(mod, _BatchNorm))
def test_selayer():
    """Test the squeeze-and-excitation layer."""
    x = torch.randn(1, 64, 56, 56)

    # Default reduction ratio: shape is preserved.
    layer = SELayer(64)
    assert layer(x).shape == torch.Size([1, 64, 56, 56])

    # Custom reduction ratio: shape is still preserved.
    layer = SELayer(64, ratio=8)
    assert layer(x).shape == torch.Size([1, 64, 56, 56])
def test_bottleneck():
    """Test the SEBottleneck block."""
    # An unsupported style keyword must be rejected.
    with pytest.raises(AssertionError):
        # Style must be in ['pytorch', 'caffe']
        SEBottleneck(64, 64, style='tensorflow')

    # Gradient checkpointing forward.
    block = SEBottleneck(64, 64, with_cp=True)
    assert block.with_cp
    out = block(torch.randn(1, 64, 56, 56))
    assert out.shape == torch.Size([1, 64, 56, 56])

    # 'pytorch' style strides on conv2, 'caffe' on conv1.
    block = SEBottleneck(64, 256, stride=2, style='pytorch')
    assert block.conv1.stride == (1, 1)
    assert block.conv2.stride == (2, 2)
    block = SEBottleneck(64, 256, stride=2, style='caffe')
    assert block.conv1.stride == (2, 2)
    assert block.conv2.stride == (1, 1)

    # A plain forward pass keeps the input shape.
    block = SEBottleneck(64, 64)
    out = block(torch.randn(1, 64, 56, 56))
    assert out.shape == torch.Size([1, 64, 56, 56])
def test_res_layer():
    """Test ResLayer built from SEBottleneck units."""
    # Three blocks, w/o downsample (in == out channels).
    layer = ResLayer(SEBottleneck, 3, 64, 64, se_ratio=16)
    assert len(layer) == 3
    for i in range(len(layer)):
        assert layer[i].conv1.in_channels == 64
        assert layer[i].conv1.out_channels == 16
        assert layer[i].downsample is None
    x = torch.randn(1, 64, 56, 56)
    assert layer(x).shape == torch.Size([1, 64, 56, 56])

    # Channel expansion triggers a downsample on the first block only.
    layer = ResLayer(SEBottleneck, 3, 64, 256, se_ratio=16)
    assert layer[0].downsample[0].out_channels == 256
    for i in range(1, len(layer)):
        assert layer[i].downsample is None
    x = torch.randn(1, 64, 56, 56)
    assert layer(x).shape == torch.Size([1, 256, 56, 56])

    # stride=2 halves the spatial resolution.
    layer = ResLayer(SEBottleneck, 3, 64, 256, stride=2, se_ratio=8)
    assert layer[0].downsample[0].out_channels == 256
    assert layer[0].downsample[0].stride == (2, 2)
    for i in range(1, len(layer)):
        assert layer[i].downsample is None
    x = torch.randn(1, 64, 56, 56)
    assert layer(x).shape == torch.Size([1, 256, 28, 28])

    # avg_down=True puts an AvgPool2d before a stride-1 conv.
    layer = ResLayer(
        SEBottleneck, 3, 64, 256, stride=2, avg_down=True, se_ratio=8)
    assert isinstance(layer[0].downsample[0], AvgPool2d)
    assert layer[0].downsample[1].out_channels == 256
    assert layer[0].downsample[1].stride == (1, 1)
    for i in range(1, len(layer)):
        assert layer[i].downsample is None
    x = torch.randn(1, 64, 56, 56)
    assert layer(x).shape == torch.Size([1, 256, 28, 28])
def test_seresnet():
    """Test SEResNet backbone."""
    stage_shapes = [(1, 256, 56, 56), (1, 512, 28, 28), (1, 1024, 14, 14),
                    (1, 2048, 7, 7)]

    with pytest.raises(KeyError):
        # SEResNet depth should be in [50, 101, 152]
        SEResNet(20)

    with pytest.raises(AssertionError):
        # In SEResNet: 1 <= num_stages <= 4
        SEResNet(50, num_stages=0)

    with pytest.raises(AssertionError):
        # In SEResNet: 1 <= num_stages <= 4
        SEResNet(50, num_stages=5)

    with pytest.raises(AssertionError):
        # len(strides) == len(dilations) == num_stages
        SEResNet(50, strides=(1, ), dilations=(1, 1), num_stages=3)

    with pytest.raises(TypeError):
        # pretrained must be a string path
        model = SEResNet(50)
        model.init_weights(pretrained=0)

    with pytest.raises(AssertionError):
        # Style must be in ['pytorch', 'caffe']
        SEResNet(50, style='tensorflow')

    # norm_eval=True keeps BatchNorm layers in eval mode after train().
    model = SEResNet(50, norm_eval=True)
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), False)

    # Loading torchvision pretrained weights.
    model = SEResNet(
        depth=50,
        norm_eval=True,
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'))
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), False)

    # With the first stage frozen, its params and BN stats stay fixed.
    frozen_stages = 1
    model = SEResNet(50, frozen_stages=frozen_stages)
    model.init_weights()
    model.train()
    assert model.norm1.training is False
    for layer in [model.conv1, model.norm1]:
        for param in layer.parameters():
            assert param.requires_grad is False
    for i in range(1, frozen_stages + 1):
        layer = getattr(model, f'layer{i}')
        for mod in layer.modules():
            if isinstance(mod, _BatchNorm):
                assert mod.training is False
        for param in layer.parameters():
            assert param.requires_grad is False

    imgs = torch.randn(1, 3, 224, 224)

    # BatchNorm forward with all four stages out.
    model = SEResNet(50, out_indices=(0, 1, 2, 3))
    model.init_weights()
    model.train()
    feat = model(imgs)
    assert len(feat) == 4
    for out, shape in zip(feat, stage_shapes):
        assert out.shape == torch.Size(shape)

    # First three stages out.
    model = SEResNet(50, out_indices=(0, 1, 2))
    model.init_weights()
    model.train()
    feat = model(imgs)
    assert len(feat) == 3
    for out, shape in zip(feat, stage_shapes[:3]):
        assert out.shape == torch.Size(shape)

    # Top feature map only.
    model = SEResNet(50, out_indices=(3, ))
    model.init_weights()
    model.train()
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size([1, 2048, 7, 7])

    # Gradient checkpointing forward.
    model = SEResNet(50, out_indices=(0, 1, 2, 3), with_cp=True)
    for m in model.modules():
        if isinstance(m, SEBottleneck):
            assert m.with_cp
    model.init_weights()
    model.train()
    feat = model(imgs)
    assert len(feat) == 4
    for out, shape in zip(feat, stage_shapes):
        assert out.shape == torch.Size(shape)

    # zero_init_residual zeroes the last norm of each bottleneck.
    model = SEResNet(50, out_indices=(0, 1, 2, 3), zero_init_residual=True)
    model.init_weights()
    for m in model.modules():
        if isinstance(m, SEBottleneck):
            assert all_zeros(m.norm3)
    model.train()
    feat = model(imgs)
    assert len(feat) == 4
    for out, shape in zip(feat, stage_shapes):
        assert out.shape == torch.Size(shape)
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmpretrain.models.backbones import SEResNeXt
from mmpretrain.models.backbones.seresnext import SEBottleneck as SEBottleneckX
def test_bottleneck():
    """Test the SEResNeXt bottleneck block."""
    # An unsupported style keyword must be rejected.
    with pytest.raises(AssertionError):
        # Style must be in ['pytorch', 'caffe']
        SEBottleneckX(64, 64, groups=32, width_per_group=4, style='tensorflow')

    # Grouped structure, 32x4d.
    block = SEBottleneckX(
        64, 256, groups=32, width_per_group=4, stride=2, style='pytorch')
    assert block.width_per_group == 4
    assert block.conv2.stride == (2, 2)
    assert block.conv2.groups == 32
    assert block.conv2.out_channels == 128
    assert block.conv2.out_channels == block.mid_channels

    # groups=1 falls back to the plain bottleneck width.
    block = SEBottleneckX(
        64, 256, groups=1, width_per_group=4, stride=2, style='pytorch')
    assert block.conv2.stride == (2, 2)
    assert block.conv2.groups == 1
    assert block.conv2.out_channels == 64
    assert block.mid_channels == 64
    assert block.conv2.out_channels == block.mid_channels

    # Forward pass keeps the input shape.
    block = SEBottleneckX(
        64, 64, base_channels=16, groups=32, width_per_group=4)
    out = block(torch.randn(1, 64, 56, 56))
    assert out.shape == torch.Size([1, 64, 56, 56])
def test_seresnext():
    """Test the SEResNeXt backbone."""
    with pytest.raises(KeyError):
        # SEResNeXt depth should be in [50, 101, 152]
        SEResNeXt(depth=18)

    imgs = torch.randn(1, 3, 224, 224)

    # 32x4d configuration with all four stages requested.
    model = SEResNeXt(
        depth=50, groups=32, width_per_group=4, out_indices=(0, 1, 2, 3))
    for m in model.modules():
        if isinstance(m, SEBottleneckX):
            assert m.conv2.groups == 32
    model.init_weights()
    model.train()
    feat = model(imgs)
    assert len(feat) == 4
    expected = [(1, 256, 56, 56), (1, 512, 28, 28), (1, 1024, 14, 14),
                (1, 2048, 7, 7)]
    for out, shape in zip(feat, expected):
        assert out.shape == torch.Size(shape)

    # Only the final stage requested.
    model = SEResNeXt(
        depth=50, groups=32, width_per_group=4, out_indices=(3, ))
    for m in model.modules():
        if isinstance(m, SEBottleneckX):
            assert m.conv2.groups == 32
    model.init_weights()
    model.train()
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size([1, 2048, 7, 7])
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from torch.nn.modules import GroupNorm
from torch.nn.modules.batchnorm import _BatchNorm
from mmpretrain.models.backbones import ShuffleNetV1
from mmpretrain.models.backbones.shufflenet_v1 import ShuffleUnit
def is_block(modules):
    """Check if is ResNet building block."""
    return isinstance(modules, (ShuffleUnit, ))
def is_norm(modules):
    """Check if is one of the norms."""
    return isinstance(modules, (GroupNorm, _BatchNorm))
def check_norm_state(modules, train_state):
    """Check if norm layer is in correct train state."""
    return all(mod.training == train_state for mod in modules
               if isinstance(mod, _BatchNorm))
def test_shufflenetv1_shuffleuint():
    """Test the ShuffleUnit building block of ShuffleNetV1."""
    with pytest.raises(ValueError):
        # combine must be in ['add', 'concat']
        ShuffleUnit(24, 16, groups=3, first_block=True, combine='test')

    with pytest.raises(AssertionError):
        # in_channels must be equal to out_channels when combine='add'
        ShuffleUnit(64, 24, groups=4, first_block=True, combine='add')

    # combine='add' keeps both channels and resolution.
    block = ShuffleUnit(24, 24, groups=3, first_block=True, combine='add')
    out = block(torch.randn(1, 24, 56, 56))
    assert out.shape == torch.Size((1, 24, 56, 56))

    # combine='concat' grows channels and halves the resolution.
    block = ShuffleUnit(24, 240, groups=3, first_block=True, combine='concat')
    out = block(torch.randn(1, 24, 56, 56))
    assert out.shape == torch.Size((1, 240, 28, 28))

    # Gradient checkpointing path (requires a grad-enabled input).
    block = ShuffleUnit(
        24, 24, groups=3, first_block=True, combine='add', with_cp=True)
    assert block.with_cp
    x = torch.randn(1, 24, 56, 56)
    x.requires_grad = True
    assert block(x).shape == torch.Size((1, 24, 56, 56))
def test_shufflenetv1_backbone():
    """Test the ShuffleNetV1 backbone."""
    with pytest.raises(ValueError):
        # frozen_stages must be in range(-1, 4)
        ShuffleNetV1(frozen_stages=10)

    with pytest.raises(ValueError):
        # the item in out_indices must be in range(0, 4)
        ShuffleNetV1(out_indices=[5])

    with pytest.raises(ValueError):
        # groups must be in [1, 2, 3, 4, 8]
        ShuffleNetV1(groups=10)

    with pytest.raises(TypeError):
        # pretrained must be str or None
        model = ShuffleNetV1()
        model.init_weights(pretrained=1)

    # A freshly trained model keeps BatchNorm in training mode.
    model = ShuffleNetV1()
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), True)

    # With the first stage frozen, its params and BN stats stay fixed.
    frozen_stages = 1
    model = ShuffleNetV1(frozen_stages=frozen_stages, out_indices=(0, 1, 2))
    model.init_weights()
    model.train()
    for param in model.conv1.parameters():
        assert param.requires_grad is False
    for i in range(frozen_stages):
        layer = model.layers[i]
        for mod in layer.modules():
            if isinstance(mod, _BatchNorm):
                assert mod.training is False
        for param in layer.parameters():
            assert param.requires_grad is False

    # Forward pass for every supported group number; the copy-pasted
    # per-group sections are folded into one table-driven loop with the
    # exact same assertions.
    stage_channels = {
        1: (144, 288, 576),
        2: (200, 400, 800),
        3: (240, 480, 960),
        4: (272, 544, 1088),
        8: (384, 768, 1536),
    }
    imgs = torch.randn(1, 3, 224, 224)
    for groups, channels in stage_channels.items():
        model = ShuffleNetV1(groups=groups, out_indices=(0, 1, 2))
        model.init_weights()
        model.train()
        for m in model.modules():
            if is_norm(m):
                assert isinstance(m, _BatchNorm)
        feat = model(imgs)
        assert len(feat) == 3
        for out, ch, size in zip(feat, channels, (28, 14, 7)):
            assert out.shape == torch.Size((1, ch, size, size))

    # GroupNorm instead of BatchNorm.
    model = ShuffleNetV1(
        groups=3,
        norm_cfg=dict(type='GN', num_groups=2, requires_grad=True),
        out_indices=(0, 1, 2))
    model.init_weights()
    model.train()
    for m in model.modules():
        if is_norm(m):
            assert isinstance(m, GroupNorm)
    feat = model(imgs)
    assert len(feat) == 3
    for out, ch, size in zip(feat, (240, 480, 960), (28, 14, 7)):
        assert out.shape == torch.Size((1, ch, size, size))

    # Subset of output stages (1 and 2).
    model = ShuffleNetV1(groups=3, out_indices=(1, 2))
    model.init_weights()
    model.train()
    for m in model.modules():
        if is_norm(m):
            assert isinstance(m, _BatchNorm)
    feat = model(imgs)
    assert len(feat) == 2
    assert feat[0].shape == torch.Size((1, 480, 14, 14))
    assert feat[1].shape == torch.Size((1, 960, 7, 7))

    # Final stage only.
    model = ShuffleNetV1(groups=3, out_indices=(2, ))
    model.init_weights()
    model.train()
    for m in model.modules():
        if is_norm(m):
            assert isinstance(m, _BatchNorm)
    feat = model(imgs)
    assert len(feat) == 1
    assert isinstance(feat[0], torch.Tensor)
    assert feat[0].shape == torch.Size((1, 960, 7, 7))

    # The with_cp flag is propagated to every block.
    model = ShuffleNetV1(groups=3, with_cp=True)
    for m in model.modules():
        if is_block(m):
            assert m.with_cp

    # norm_eval=True keeps BatchNorm layers in eval mode after train().
    model = ShuffleNetV1(norm_eval=True)
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), False)
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from torch.nn.modules import GroupNorm
from torch.nn.modules.batchnorm import _BatchNorm
from mmpretrain.models.backbones import ShuffleNetV2
from mmpretrain.models.backbones.shufflenet_v2 import InvertedResidual
def is_block(modules):
    """Check if is ResNet building block."""
    return isinstance(modules, (InvertedResidual, ))
def is_norm(modules):
    """Check if is one of the norms."""
    return isinstance(modules, (GroupNorm, _BatchNorm))
def check_norm_state(modules, train_state):
    """Check if norm layer is in correct train state."""
    return all(mod.training == train_state for mod in modules
               if isinstance(mod, _BatchNorm))
def test_shufflenetv2_invertedresidual():
    """Test the InvertedResidual building block of ShuffleNetV2."""
    # when stride==1, in_channels should be equal to out_channels // 2 * 2
    with pytest.raises(AssertionError):
        InvertedResidual(24, 32, stride=1)

    # NOTE(review): this raises-case is identical to the one above; both
    # comment angles describe the same internal assertion. Kept as-is.
    with pytest.raises(AssertionError):
        # when in_channels != out_channels // 2 * 2, stride should not be
        # equal to 1.
        InvertedResidual(24, 32, stride=1)

    # stride=2 path: channels grow, resolution halves.
    block = InvertedResidual(24, 48, stride=2)
    out = block(torch.randn(1, 24, 56, 56))
    assert out.shape == torch.Size((1, 48, 28, 28))

    # Gradient checkpointing path (requires a grad-enabled input).
    block = InvertedResidual(48, 48, stride=1, with_cp=True)
    assert block.with_cp
    x = torch.randn(1, 48, 56, 56)
    x.requires_grad = True
    assert block(x).shape == torch.Size((1, 48, 56, 56))
def test_shufflenetv2_backbone():
    """Test the ShuffleNetV2 backbone."""
    with pytest.raises(ValueError):
        # widen_factor must be in [0.5, 1.0, 1.5, 2.0]
        ShuffleNetV2(widen_factor=3.0)

    with pytest.raises(ValueError):
        # frozen_stages must be in [0, 1, 2, 3]
        ShuffleNetV2(widen_factor=1.0, frozen_stages=4)

    with pytest.raises(ValueError):
        # out_indices must be in [0, 1, 2, 3]
        ShuffleNetV2(widen_factor=1.0, out_indices=(4, ))

    with pytest.raises(TypeError):
        # pretrained must be str or None
        model = ShuffleNetV2()
        model.init_weights(pretrained=1)

    # A freshly trained model keeps BatchNorm in training mode.
    model = ShuffleNetV2()
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), True)

    # With the first stage frozen, its params and BN stats stay fixed.
    frozen_stages = 1
    model = ShuffleNetV2(frozen_stages=frozen_stages)
    model.init_weights()
    model.train()
    for param in model.conv1.parameters():
        assert param.requires_grad is False
    for i in range(0, frozen_stages):
        layer = model.layers[i]
        for mod in layer.modules():
            if isinstance(mod, _BatchNorm):
                assert mod.training is False
        for param in layer.parameters():
            assert param.requires_grad is False

    # norm_eval=True keeps BatchNorm layers in eval mode after train().
    model = ShuffleNetV2(norm_eval=True)
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), False)

    # Forward pass for every supported widen_factor; the copy-pasted
    # sections are folded into one table-driven loop with the exact same
    # assertions. NOTE(review): as in the original, only the first three
    # stage outputs are shape-checked although four stages are requested.
    stage_channels = {
        0.5: (48, 96, 192),
        1.0: (116, 232, 464),
        1.5: (176, 352, 704),
        2.0: (244, 488, 976),
    }
    imgs = torch.randn(1, 3, 224, 224)
    for widen_factor, channels in stage_channels.items():
        model = ShuffleNetV2(
            widen_factor=widen_factor, out_indices=(0, 1, 2, 3))
        model.init_weights()
        model.train()
        for m in model.modules():
            if is_norm(m):
                assert isinstance(m, _BatchNorm)
        feat = model(imgs)
        assert len(feat) == 4
        for out, ch, size in zip(feat, channels, (28, 14, 7)):
            assert out.shape == torch.Size((1, ch, size, size))

    # Final stage only.
    model = ShuffleNetV2(widen_factor=1.0, out_indices=(2, ))
    model.init_weights()
    model.train()
    for m in model.modules():
        if is_norm(m):
            assert isinstance(m, _BatchNorm)
    feat = model(imgs)
    assert len(feat) == 1
    assert isinstance(feat[0], torch.Tensor)
    assert feat[0].shape == torch.Size((1, 464, 7, 7))

    # Stages 1 and 2.
    model = ShuffleNetV2(widen_factor=1.0, out_indices=(1, 2))
    model.init_weights()
    model.train()
    for m in model.modules():
        if is_norm(m):
            assert isinstance(m, _BatchNorm)
    feat = model(imgs)
    assert len(feat) == 2
    assert feat[0].shape == torch.Size((1, 232, 14, 14))
    assert feat[1].shape == torch.Size((1, 464, 7, 7))

    # The with_cp flag is propagated to every block.
    model = ShuffleNetV2(widen_factor=1.0, with_cp=True)
    for m in model.modules():
        if is_block(m):
            assert m.with_cp
# Copyright (c) OpenMMLab. All rights reserved.
import math
import os
import tempfile
from copy import deepcopy
from itertools import chain
from unittest import TestCase
import torch
from mmengine.runner import load_checkpoint, save_checkpoint
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm
from mmpretrain.models.backbones import SwinTransformer
from mmpretrain.models.backbones.swin_transformer import SwinBlock
from .utils import timm_resize_pos_embed
def check_norm_state(modules, train_state):
    """Return True iff every batch-norm layer matches ``train_state``."""
    return all(mod.training == train_state for mod in modules
               if isinstance(mod, _BatchNorm))
class TestSwinTransformer(TestCase):
    """Unit tests for the SwinTransformer backbone."""

    def setUp(self):
        # Base config shared by all tests; each test deep-copies it
        # before making modifications.
        self.cfg = dict(
            arch='tiny', img_size=224, patch_size=4, drop_path_rate=0.1)

    def test_arch(self):
        """Invalid arch names/dicts are rejected; custom dicts are honored."""
        # Test invalid default arch
        with self.assertRaisesRegex(AssertionError, 'not in default archs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = 'unknown'
            SwinTransformer(**cfg)

        # Test invalid custom arch (missing mandatory keys, e.g. 'depths')
        with self.assertRaisesRegex(AssertionError, 'Custom arch needs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = {
                'embed_dims': 96,
                'num_heads': [3, 6, 12, 16],
            }
            SwinTransformer(**cfg)

        # Test custom arch
        cfg = deepcopy(self.cfg)
        depths = [2, 2, 4, 2]
        num_heads = [6, 12, 6, 12]
        cfg['arch'] = {
            'embed_dims': 256,
            'depths': depths,
            'num_heads': num_heads
        }
        model = SwinTransformer(**cfg)
        # Embed dims double at every stage; per-stage depth and head
        # counts must match the requested arch.
        for i, stage in enumerate(model.stages):
            self.assertEqual(stage.embed_dims, 256 * (2**i))
            self.assertEqual(len(stage.blocks), depths[i])
            self.assertEqual(stage.blocks[0].attn.w_msa.num_heads,
                             num_heads[i])

    def test_init_weights(self):
        """Weight init cfg takes effect; v1/v3 checkpoints load correctly."""
        # test weight init cfg
        cfg = deepcopy(self.cfg)
        cfg['use_abs_pos_embed'] = True
        cfg['init_cfg'] = [
            dict(
                type='Kaiming',
                layer='Conv2d',
                mode='fan_in',
                nonlinearity='linear')
        ]
        model = SwinTransformer(**cfg)
        ori_weight = model.patch_embed.projection.weight.clone().detach()
        # The pos_embed is all zero before initialize
        self.assertTrue(
            torch.allclose(model.absolute_pos_embed, torch.tensor(0.)))

        model.init_weights()
        initialized_weight = model.patch_embed.projection.weight
        self.assertFalse(torch.allclose(ori_weight, initialized_weight))
        self.assertFalse(
            torch.allclose(model.absolute_pos_embed, torch.tensor(0.)))

        pretrain_pos_embed = model.absolute_pos_embed.clone().detach()

        tmpdir = tempfile.gettempdir()
        # Save v3 checkpoints
        checkpoint_v3 = os.path.join(tmpdir, 'v3.pth')
        save_checkpoint(model.state_dict(), checkpoint_v3)
        # Save v1 checkpoints: emulate the legacy state-dict layout by
        # restoring the old 'norm' attribute name and the persistent
        # attention-mask buffer before saving.
        setattr(model, 'norm', model.norm3)
        setattr(model.stages[0].blocks[1].attn, 'attn_mask',
                torch.zeros(64, 49, 49))
        model._version = 1
        del model.norm3
        checkpoint_v1 = os.path.join(tmpdir, 'v1.pth')
        save_checkpoint(model.state_dict(), checkpoint_v1)

        # test load v1 checkpoint
        cfg = deepcopy(self.cfg)
        cfg['use_abs_pos_embed'] = True
        model = SwinTransformer(**cfg)
        load_checkpoint(model, checkpoint_v1, strict=True)

        # test load v3 checkpoint
        cfg = deepcopy(self.cfg)
        cfg['use_abs_pos_embed'] = True
        model = SwinTransformer(**cfg)
        load_checkpoint(model, checkpoint_v3, strict=True)

        # test load v3 checkpoint with different img_size
        cfg = deepcopy(self.cfg)
        cfg['img_size'] = 384
        cfg['use_abs_pos_embed'] = True
        model = SwinTransformer(**cfg)
        load_checkpoint(model, checkpoint_v3, strict=True)
        # The absolute position embedding should be interpolated to the
        # new resolution on load.
        resized_pos_embed = timm_resize_pos_embed(
            pretrain_pos_embed, model.absolute_pos_embed, num_tokens=0)
        self.assertTrue(
            torch.allclose(model.absolute_pos_embed, resized_pos_embed))

        os.remove(checkpoint_v1)
        os.remove(checkpoint_v3)

    def test_forward(self):
        """Output shapes, window-size handling, cp forward, dynamic input."""
        imgs = torch.randn(1, 3, 224, 224)

        cfg = deepcopy(self.cfg)
        model = SwinTransformer(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        feat = outs[-1]
        self.assertEqual(feat.shape, (1, 768, 7, 7))

        # test with window_size=12
        cfg = deepcopy(self.cfg)
        cfg['window_size'] = 12
        model = SwinTransformer(**cfg)
        outs = model(torch.randn(1, 3, 384, 384))
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        feat = outs[-1]
        self.assertEqual(feat.shape, (1, 768, 12, 12))
        # A 224 input gives 7x7 feature maps, smaller than the window;
        # without pad_small_map this must raise.
        with self.assertRaisesRegex(AssertionError, r'the window size \(12\)'):
            model(torch.randn(1, 3, 224, 224))

        # test with pad_small_map=True
        cfg = deepcopy(self.cfg)
        cfg['window_size'] = 12
        cfg['pad_small_map'] = True
        model = SwinTransformer(**cfg)
        outs = model(torch.randn(1, 3, 224, 224))
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        feat = outs[-1]
        self.assertEqual(feat.shape, (1, 768, 7, 7))

        # test multiple output indices
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = (0, 1, 2, 3)
        model = SwinTransformer(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 4)
        for stride, out in zip([1, 2, 4, 8], outs):
            self.assertEqual(out.shape,
                             (1, 96 * stride, 56 // stride, 56 // stride))

        # test with checkpoint forward
        cfg = deepcopy(self.cfg)
        cfg['with_cp'] = True
        model = SwinTransformer(**cfg)
        for m in model.modules():
            if isinstance(m, SwinBlock):
                self.assertTrue(m.with_cp)
        model.init_weights()
        model.train()
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        feat = outs[-1]
        self.assertEqual(feat.shape, (1, 768, 7, 7))

        # test with dynamic input shape
        imgs1 = torch.randn(1, 3, 224, 224)
        imgs2 = torch.randn(1, 3, 256, 256)
        imgs3 = torch.randn(1, 3, 256, 309)
        cfg = deepcopy(self.cfg)
        model = SwinTransformer(**cfg)
        for imgs in [imgs1, imgs2, imgs3]:
            outs = model(imgs)
            self.assertIsInstance(outs, tuple)
            self.assertEqual(len(outs), 1)
            feat = outs[-1]
            # The overall downsampling factor is 32 (ceil for odd sizes).
            expect_feat_shape = (math.ceil(imgs.shape[2] / 32),
                                 math.ceil(imgs.shape[3] / 32))
            self.assertEqual(feat.shape, (1, 768, *expect_feat_shape))

    def test_structure(self):
        """Stochastic-depth decay, norm_eval and frozen_stages behavior."""
        # test drop_path_rate decay
        cfg = deepcopy(self.cfg)
        cfg['drop_path_rate'] = 0.2
        model = SwinTransformer(**cfg)
        depths = model.arch_settings['depths']
        blocks = chain(*[stage.blocks for stage in model.stages])
        for i, block in enumerate(blocks):
            # Drop prob grows linearly from 0 to drop_path_rate over blocks.
            expect_prob = 0.2 / (sum(depths) - 1) * i
            self.assertAlmostEqual(block.ffn.dropout_layer.drop_prob,
                                   expect_prob)
            self.assertAlmostEqual(block.attn.drop.drop_prob, expect_prob)

        # test Swin-Transformer with norm_eval=True
        cfg = deepcopy(self.cfg)
        cfg['norm_eval'] = True
        cfg['norm_cfg'] = dict(type='BN')
        cfg['stage_cfgs'] = dict(block_cfgs=dict(norm_cfg=dict(type='BN')))
        model = SwinTransformer(**cfg)
        model.init_weights()
        model.train()
        self.assertTrue(check_norm_state(model.modules(), False))

        # test Swin-Transformer with first stage frozen.
        cfg = deepcopy(self.cfg)
        frozen_stages = 0
        cfg['frozen_stages'] = frozen_stages
        cfg['out_indices'] = (0, 1, 2, 3)
        model = SwinTransformer(**cfg)
        model.init_weights()
        model.train()

        # the patch_embed and first stage should not require grad.
        self.assertFalse(model.patch_embed.training)
        for param in model.patch_embed.parameters():
            self.assertFalse(param.requires_grad)
        for i in range(frozen_stages + 1):
            stage = model.stages[i]
            for param in stage.parameters():
                self.assertFalse(param.requires_grad)
        for param in model.norm0.parameters():
            self.assertFalse(param.requires_grad)

        # the second stage should require grad.
        for i in range(frozen_stages + 1, 4):
            stage = model.stages[i]
            for param in stage.parameters():
                self.assertTrue(param.requires_grad)
            norm = getattr(model, f'norm{i}')
            for param in norm.parameters():
                self.assertTrue(param.requires_grad)
# Copyright (c) OpenMMLab. All rights reserved.
import math
import os
import tempfile
from copy import deepcopy
from itertools import chain
from unittest import TestCase
import torch
from mmengine.runner import load_checkpoint, save_checkpoint
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm
from mmpretrain.models.backbones import SwinTransformerV2
from mmpretrain.models.backbones.swin_transformer import SwinBlock
from .utils import timm_resize_pos_embed
def check_norm_state(modules, train_state):
    """Check that all batch-norm layers are in the given train state."""
    for mod in modules:
        if not isinstance(mod, _BatchNorm):
            continue
        if mod.training != train_state:
            return False
    return True
class TestSwinTransformerV2(TestCase):
    """Unit tests for the SwinTransformerV2 backbone."""

    def setUp(self):
        # Base config shared by every test; tests deep-copy it first.
        self.cfg = dict(
            arch='b', img_size=256, patch_size=4, drop_path_rate=0.1)

    def test_arch(self):
        """Invalid arch names/dicts are rejected; custom dicts are honored."""
        # Test invalid default arch
        with self.assertRaisesRegex(AssertionError, 'not in default archs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = 'unknown'
            SwinTransformerV2(**cfg)

        # Test invalid custom arch (missing mandatory keys, e.g. 'depths')
        with self.assertRaisesRegex(AssertionError, 'Custom arch needs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = {
                'embed_dims': 96,
                'num_heads': [3, 6, 12, 16],
            }
            SwinTransformerV2(**cfg)

        # Test custom arch
        cfg = deepcopy(self.cfg)
        depths = [2, 2, 6, 2]
        num_heads = [6, 12, 6, 12]
        cfg['arch'] = {
            'embed_dims': 256,
            'depths': depths,
            'num_heads': num_heads,
            'extra_norm_every_n_blocks': 2
        }
        model = SwinTransformerV2(**cfg)
        # Channels double per stage; depths and head counts must match.
        for i, stage in enumerate(model.stages):
            self.assertEqual(stage.out_channels, 256 * (2**i))
            self.assertEqual(len(stage.blocks), depths[i])
            self.assertEqual(stage.blocks[0].attn.w_msa.num_heads,
                             num_heads[i])
        # The deepest stage (depth 6) still exposes its last block.
        self.assertIsInstance(model.stages[2].blocks[5], torch.nn.Module)

    def test_init_weights(self):
        """Weight init cfg takes effect; checkpoints load and pos-embed
        resizes for a different img_size."""
        # test weight init cfg
        cfg = deepcopy(self.cfg)
        cfg['use_abs_pos_embed'] = True
        cfg['init_cfg'] = [
            dict(
                type='Kaiming',
                layer='Conv2d',
                mode='fan_in',
                nonlinearity='linear')
        ]
        model = SwinTransformerV2(**cfg)
        ori_weight = model.patch_embed.projection.weight.clone().detach()
        # The pos_embed is all zero before initialize
        self.assertTrue(
            torch.allclose(model.absolute_pos_embed, torch.tensor(0.)))

        model.init_weights()
        initialized_weight = model.patch_embed.projection.weight
        self.assertFalse(torch.allclose(ori_weight, initialized_weight))
        self.assertFalse(
            torch.allclose(model.absolute_pos_embed, torch.tensor(0.)))

        pretrain_pos_embed = model.absolute_pos_embed.clone().detach()

        tmpdir = tempfile.TemporaryDirectory()
        # Save checkpoints
        checkpoint = os.path.join(tmpdir.name, 'checkpoint.pth')
        save_checkpoint(model.state_dict(), checkpoint)

        # test load checkpoint
        cfg = deepcopy(self.cfg)
        cfg['use_abs_pos_embed'] = True
        model = SwinTransformerV2(**cfg)
        load_checkpoint(model, checkpoint, strict=False)

        # test load checkpoint with different img_size
        cfg = deepcopy(self.cfg)
        cfg['img_size'] = 384
        cfg['use_abs_pos_embed'] = True
        model = SwinTransformerV2(**cfg)
        load_checkpoint(model, checkpoint, strict=False)
        # The absolute position embedding should be interpolated to the
        # new resolution on load.
        resized_pos_embed = timm_resize_pos_embed(
            pretrain_pos_embed, model.absolute_pos_embed, num_tokens=0)
        self.assertTrue(
            torch.allclose(model.absolute_pos_embed, resized_pos_embed))

        tmpdir.cleanup()

    def test_forward(self):
        """Output shapes, window-size handling, cp forward, dynamic input."""
        imgs = torch.randn(1, 3, 256, 256)

        cfg = deepcopy(self.cfg)
        model = SwinTransformerV2(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        feat = outs[-1]
        self.assertEqual(feat.shape, (1, 1024, 8, 8))

        # test with window_size=12
        cfg = deepcopy(self.cfg)
        cfg['window_size'] = 12
        model = SwinTransformerV2(**cfg)
        outs = model(torch.randn(1, 3, 384, 384))
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        feat = outs[-1]
        self.assertEqual(feat.shape, (1, 1024, 12, 12))
        # A 256 input gives 8x8 feature maps, smaller than the window;
        # without pad_small_map this must raise.
        with self.assertRaisesRegex(AssertionError, r'the window size \(12\)'):
            model(torch.randn(1, 3, 256, 256))

        # test with pad_small_map=True
        cfg = deepcopy(self.cfg)
        cfg['window_size'] = 12
        cfg['pad_small_map'] = True
        model = SwinTransformerV2(**cfg)
        outs = model(torch.randn(1, 3, 256, 256))
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        feat = outs[-1]
        self.assertEqual(feat.shape, (1, 1024, 8, 8))

        # test multiple output indices
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = (0, 1, 2, 3)
        model = SwinTransformerV2(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 4)
        for stride, out in zip([1, 2, 4, 8], outs):
            self.assertEqual(out.shape,
                             (1, 128 * stride, 64 // stride, 64 // stride))

        # test with checkpoint forward
        cfg = deepcopy(self.cfg)
        cfg['with_cp'] = True
        model = SwinTransformerV2(**cfg)
        # NOTE(review): SwinBlock is imported from the v1 swin_transformer
        # module here — presumably V2 reuses it; verify against the
        # backbone implementation.
        for m in model.modules():
            if isinstance(m, SwinBlock):
                self.assertTrue(m.with_cp)
        model.init_weights()
        model.train()
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        feat = outs[-1]
        self.assertEqual(feat.shape, (1, 1024, 8, 8))

        # test with dynamic input shape
        imgs1 = torch.randn(1, 3, 224, 224)
        imgs2 = torch.randn(1, 3, 256, 256)
        imgs3 = torch.randn(1, 3, 256, 309)
        cfg = deepcopy(self.cfg)
        cfg['pad_small_map'] = True
        model = SwinTransformerV2(**cfg)
        for imgs in [imgs1, imgs2, imgs3]:
            outs = model(imgs)
            self.assertIsInstance(outs, tuple)
            self.assertEqual(len(outs), 1)
            feat = outs[-1]
            # The overall downsampling factor is 32 (ceil for odd sizes).
            expect_feat_shape = (math.ceil(imgs.shape[2] / 32),
                                 math.ceil(imgs.shape[3] / 32))
            self.assertEqual(feat.shape, (1, 1024, *expect_feat_shape))

    def test_structure(self):
        """Stochastic-depth decay, norm_eval and frozen_stages behavior."""
        # test drop_path_rate decay
        cfg = deepcopy(self.cfg)
        cfg['drop_path_rate'] = 0.2
        model = SwinTransformerV2(**cfg)
        depths = model.arch_settings['depths']
        blocks = chain(*[stage.blocks for stage in model.stages])
        for i, block in enumerate(blocks):
            # Drop prob grows linearly from 0 to drop_path_rate over blocks.
            expect_prob = 0.2 / (sum(depths) - 1) * i
            self.assertAlmostEqual(block.ffn.dropout_layer.drop_prob,
                                   expect_prob)
            self.assertAlmostEqual(block.attn.drop.drop_prob, expect_prob)

        # test Swin-Transformer V2 with norm_eval=True
        cfg = deepcopy(self.cfg)
        cfg['norm_eval'] = True
        cfg['norm_cfg'] = dict(type='BN')
        cfg['stage_cfgs'] = dict(block_cfgs=dict(norm_cfg=dict(type='BN')))
        model = SwinTransformerV2(**cfg)
        model.init_weights()
        model.train()
        self.assertTrue(check_norm_state(model.modules(), False))

        # test Swin-Transformer V2 with first stage frozen.
        cfg = deepcopy(self.cfg)
        frozen_stages = 0
        cfg['frozen_stages'] = frozen_stages
        cfg['out_indices'] = (0, 1, 2, 3)
        model = SwinTransformerV2(**cfg)
        model.init_weights()
        model.train()

        # the patch_embed and first stage should not require grad.
        self.assertFalse(model.patch_embed.training)
        for param in model.patch_embed.parameters():
            self.assertFalse(param.requires_grad)
        for i in range(frozen_stages + 1):
            stage = model.stages[i]
            for param in stage.parameters():
                self.assertFalse(param.requires_grad)
        for param in model.norm0.parameters():
            self.assertFalse(param.requires_grad)

        # the second stage should require grad.
        for i in range(frozen_stages + 1, 4):
            stage = model.stages[i]
            for param in stage.parameters():
                self.assertTrue(param.requires_grad)
            norm = getattr(model, f'norm{i}')
            for param in norm.parameters():
                self.assertTrue(param.requires_grad)
# Copyright (c) OpenMMLab. All rights reserved.
import math
import os
import tempfile
from copy import deepcopy
from unittest import TestCase
import torch
from mmengine.runner import load_checkpoint, save_checkpoint
from mmpretrain.models.backbones import T2T_ViT
from .utils import timm_resize_pos_embed
class TestT2TViT(TestCase):
    """Unit tests for the Tokens-to-Token ViT (T2T-ViT) backbone."""

    def setUp(self):
        # Base config (14-layer, 384-dim variant); each test deep-copies
        # it before modification.
        self.cfg = dict(
            img_size=224,
            in_channels=3,
            embed_dims=384,
            t2t_cfg=dict(
                token_dims=64,
                use_performer=False,
            ),
            num_layers=14,
            drop_path_rate=0.1)

    def test_structure(self):
        """Constructor validation and per-layer structural checks."""
        # The performer hasn't been implemented
        cfg = deepcopy(self.cfg)
        cfg['t2t_cfg']['use_performer'] = True
        with self.assertRaises(NotImplementedError):
            T2T_ViT(**cfg)

        # Test out_indices: a dict is the wrong type, and an index beyond
        # num_layers is out of range.
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = {1: 1}
        with self.assertRaisesRegex(AssertionError, "get <class 'dict'>"):
            T2T_ViT(**cfg)
        cfg['out_indices'] = [0, 15]
        with self.assertRaisesRegex(AssertionError, 'Invalid out_indices 15'):
            T2T_ViT(**cfg)

        # Test model structure
        cfg = deepcopy(self.cfg)
        model = T2T_ViT(**cfg)
        self.assertEqual(len(model.encoder), 14)
        # Stochastic depth increases linearly from 0 to drop_path_rate.
        dpr_inc = 0.1 / (14 - 1)
        dpr = 0
        for layer in model.encoder:
            self.assertEqual(layer.attn.embed_dims, 384)
            # The default mlp_ratio is 3
            self.assertEqual(layer.ffn.feedforward_channels, 384 * 3)
            self.assertAlmostEqual(layer.attn.out_drop.drop_prob, dpr)
            self.assertAlmostEqual(layer.ffn.dropout_layer.drop_prob, dpr)
            dpr += dpr_inc

    def test_init_weights(self):
        """Weight init cfg takes effect; checkpoints round-trip and the
        position embedding resizes for a different img_size."""
        # test weight init cfg
        cfg = deepcopy(self.cfg)
        cfg['init_cfg'] = [dict(type='TruncNormal', layer='Linear', std=.02)]
        model = T2T_ViT(**cfg)
        ori_weight = model.tokens_to_token.project.weight.clone().detach()
        model.init_weights()
        initialized_weight = model.tokens_to_token.project.weight
        self.assertFalse(torch.allclose(ori_weight, initialized_weight))

        # test load checkpoint
        pretrain_pos_embed = model.pos_embed.clone().detach()
        tmpdir = tempfile.gettempdir()
        checkpoint = os.path.join(tmpdir, 'test.pth')
        save_checkpoint(model.state_dict(), checkpoint)

        cfg = deepcopy(self.cfg)
        model = T2T_ViT(**cfg)
        load_checkpoint(model, checkpoint, strict=True)
        self.assertTrue(torch.allclose(model.pos_embed, pretrain_pos_embed))

        # test load checkpoint with different img_size
        cfg = deepcopy(self.cfg)
        cfg['img_size'] = 384
        model = T2T_ViT(**cfg)
        load_checkpoint(model, checkpoint, strict=True)
        # The position embedding should be interpolated to the new size.
        resized_pos_embed = timm_resize_pos_embed(pretrain_pos_embed,
                                                  model.pos_embed)
        self.assertTrue(torch.allclose(model.pos_embed, resized_pos_embed))

        os.remove(checkpoint)

    def test_forward(self):
        """Output shapes for each out_type / out_indices configuration."""
        imgs = torch.randn(1, 3, 224, 224)

        # test with_cls_token=False: requesting the cls token without
        # keeping it must fail.
        cfg = deepcopy(self.cfg)
        cfg['with_cls_token'] = False
        cfg['out_type'] = 'cls_token'
        with self.assertRaisesRegex(ValueError, 'must be True'):
            T2T_ViT(**cfg)

        cfg = deepcopy(self.cfg)
        cfg['with_cls_token'] = False
        cfg['out_type'] = 'featmap'
        model = T2T_ViT(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        patch_token = outs[-1]
        self.assertEqual(patch_token.shape, (1, 384, 14, 14))

        # test with output cls_token
        cfg = deepcopy(self.cfg)
        model = T2T_ViT(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        cls_token = outs[-1]
        self.assertEqual(cls_token.shape, (1, 384))

        # Test forward with multi out indices (negative indices count
        # from the last encoder layer).
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = [-3, -2, -1]
        model = T2T_ViT(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 3)
        for out in outs:
            self.assertEqual(out.shape, (1, 384))

        # Test forward with dynamic input size
        imgs1 = torch.randn(1, 3, 224, 224)
        imgs2 = torch.randn(1, 3, 256, 256)
        imgs3 = torch.randn(1, 3, 256, 309)
        cfg = deepcopy(self.cfg)
        cfg['out_type'] = 'featmap'
        model = T2T_ViT(**cfg)
        for imgs in [imgs1, imgs2, imgs3]:
            outs = model(imgs)
            self.assertIsInstance(outs, tuple)
            self.assertEqual(len(outs), 1)
            patch_token = outs[-1]
            # The T2T module downsamples by an overall factor of 16.
            expect_feat_shape = (math.ceil(imgs.shape[2] / 16),
                                 math.ceil(imgs.shape[3] / 16))
            self.assertEqual(patch_token.shape, (1, 384, *expect_feat_shape))
# Copyright (c) OpenMMLab. All rights reserved.
import unittest
import pytest
import torch
from torch import nn
from torch.nn.modules.batchnorm import _BatchNorm
from mmpretrain.models.backbones import TIMMBackbone
def has_timm() -> bool:
    """Return whether the optional ``timm`` package is importable."""
    try:
        import timm  # noqa: F401
    except ImportError:
        return False
    return True
def check_norm_state(modules, train_state):
    """Check whether all batch-norm layers share ``train_state``."""
    mismatch = any(m.training != train_state for m in modules
                   if isinstance(m, _BatchNorm))
    return not mismatch
@unittest.skipIf(not has_timm(), 'timm is not installed')
def test_timm_backbone():
    """Test timm backbones, features_only=False (default).

    Verifies that the TIMMBackbone wrapper strips the classification head
    (global pool / fc / classifier / head become ``nn.Identity``) and
    returns a single feature map of the expected shape.
    """
    with pytest.raises(TypeError):
        # TIMMBackbone has 1 required positional argument: 'model_name'
        model = TIMMBackbone(pretrained=True)
    with pytest.raises(TypeError):
        # pretrained must be bool
        model = TIMMBackbone(model_name='resnet18', pretrained='model.pth')

    # Test resnet18 from timm
    model = TIMMBackbone(model_name='resnet18')
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), True)
    # The head must be replaced by Identity so only features come out.
    assert isinstance(model.timm_model.global_pool.pool, nn.Identity)
    assert isinstance(model.timm_model.fc, nn.Identity)

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size((1, 512, 7, 7))

    # Test efficientnet_b1 with pretrained weights
    # NOTE(review): pretrained=True presumably downloads weights at test
    # time — requires network access.
    model = TIMMBackbone(model_name='efficientnet_b1', pretrained=True)
    model.init_weights()
    model.train()
    assert isinstance(model.timm_model.global_pool.pool, nn.Identity)
    assert isinstance(model.timm_model.classifier, nn.Identity)
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size((1, 1280, 7, 7))

    # Test vit_tiny_patch16_224 with pretrained weights
    model = TIMMBackbone(model_name='vit_tiny_patch16_224', pretrained=True)
    model.init_weights()
    model.train()
    assert isinstance(model.timm_model.head, nn.Identity)
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    # Disable the test since TIMM's behavior changes between 0.5.4 and 0.5.5
    # assert feat[0].shape == torch.Size((1, 197, 192))
@unittest.skipIf(not has_timm(), 'timm is not installed')
def test_timm_backbone_features_only():
    """Test timm backbones, features_only=True.

    Checks multi-level feature extraction for several timm models:
    norm-layer selection, out_indices filtering and the feature-map sizes
    produced under different ``output_stride`` settings.
    """
    # Test different norm_layer, can be: 'SyncBN', 'BN2d', 'GN', 'LN', 'IN'
    # Test resnet18 from timm, norm_layer='BN2d'
    model = TIMMBackbone(
        model_name='resnet18',
        features_only=True,
        pretrained=False,
        output_stride=32,
        norm_layer='BN2d')

    # Test resnet18 from timm, norm_layer='SyncBN'
    model = TIMMBackbone(
        model_name='resnet18',
        features_only=True,
        pretrained=False,
        output_stride=32,
        norm_layer='SyncBN')

    # Test resnet18 from timm, output_stride=32
    model = TIMMBackbone(
        model_name='resnet18',
        features_only=True,
        pretrained=False,
        output_stride=32)
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), True)
    imgs = torch.randn(1, 3, 224, 224)
    feats = model(imgs)
    # Five pyramid levels, halving resolution at each level.
    assert len(feats) == 5
    assert feats[0].shape == torch.Size((1, 64, 112, 112))
    assert feats[1].shape == torch.Size((1, 64, 56, 56))
    assert feats[2].shape == torch.Size((1, 128, 28, 28))
    assert feats[3].shape == torch.Size((1, 256, 14, 14))
    assert feats[4].shape == torch.Size((1, 512, 7, 7))

    # Test resnet18 from timm, output_stride=32, out_indices=(1, 2, 3)
    model = TIMMBackbone(
        model_name='resnet18',
        features_only=True,
        pretrained=False,
        output_stride=32,
        out_indices=(1, 2, 3))
    imgs = torch.randn(1, 3, 224, 224)
    feats = model(imgs)
    # Only the three requested levels come back.
    assert len(feats) == 3
    assert feats[0].shape == torch.Size((1, 64, 56, 56))
    assert feats[1].shape == torch.Size((1, 128, 28, 28))
    assert feats[2].shape == torch.Size((1, 256, 14, 14))

    # Test resnet18 from timm, output_stride=16
    # (the deepest level stays at 1/16 resolution — see last assert)
    model = TIMMBackbone(
        model_name='resnet18',
        features_only=True,
        pretrained=False,
        output_stride=16)
    imgs = torch.randn(1, 3, 224, 224)
    feats = model(imgs)
    assert len(feats) == 5
    assert feats[0].shape == torch.Size((1, 64, 112, 112))
    assert feats[1].shape == torch.Size((1, 64, 56, 56))
    assert feats[2].shape == torch.Size((1, 128, 28, 28))
    assert feats[3].shape == torch.Size((1, 256, 14, 14))
    assert feats[4].shape == torch.Size((1, 512, 14, 14))

    # Test resnet18 from timm, output_stride=8
    # (the last two levels stay at 1/8 resolution)
    model = TIMMBackbone(
        model_name='resnet18',
        features_only=True,
        pretrained=False,
        output_stride=8)
    imgs = torch.randn(1, 3, 224, 224)
    feats = model(imgs)
    assert len(feats) == 5
    assert feats[0].shape == torch.Size((1, 64, 112, 112))
    assert feats[1].shape == torch.Size((1, 64, 56, 56))
    assert feats[2].shape == torch.Size((1, 128, 28, 28))
    assert feats[3].shape == torch.Size((1, 256, 28, 28))
    assert feats[4].shape == torch.Size((1, 512, 28, 28))

    # Test efficientnet_b1 with pretrained weights
    # NOTE(review): pretrained=True presumably downloads weights at test
    # time — requires network access.
    model = TIMMBackbone(
        model_name='efficientnet_b1', features_only=True, pretrained=True)
    imgs = torch.randn(1, 3, 64, 64)
    feats = model(imgs)
    assert len(feats) == 5
    assert feats[0].shape == torch.Size((1, 16, 32, 32))
    assert feats[1].shape == torch.Size((1, 24, 16, 16))
    assert feats[2].shape == torch.Size((1, 40, 8, 8))
    assert feats[3].shape == torch.Size((1, 112, 4, 4))
    assert feats[4].shape == torch.Size((1, 320, 2, 2))

    # Test resnetv2_50x1_bitm from timm, output_stride=8
    model = TIMMBackbone(
        model_name='resnetv2_50x1_bitm',
        features_only=True,
        pretrained=False,
        output_stride=8)
    imgs = torch.randn(1, 3, 8, 8)
    feats = model(imgs)
    assert len(feats) == 5
    assert feats[0].shape == torch.Size((1, 64, 4, 4))
    assert feats[1].shape == torch.Size((1, 256, 2, 2))
    assert feats[2].shape == torch.Size((1, 512, 1, 1))
    assert feats[3].shape == torch.Size((1, 1024, 1, 1))
    assert feats[4].shape == torch.Size((1, 2048, 1, 1))

    # Test resnetv2_50x3_bitm from timm, output_stride=8
    # (x3 width multiplier triples every channel count)
    model = TIMMBackbone(
        model_name='resnetv2_50x3_bitm',
        features_only=True,
        pretrained=False,
        output_stride=8)
    imgs = torch.randn(1, 3, 8, 8)
    feats = model(imgs)
    assert len(feats) == 5
    assert feats[0].shape == torch.Size((1, 192, 4, 4))
    assert feats[1].shape == torch.Size((1, 768, 2, 2))
    assert feats[2].shape == torch.Size((1, 1536, 1, 1))
    assert feats[3].shape == torch.Size((1, 3072, 1, 1))
    assert feats[4].shape == torch.Size((1, 6144, 1, 1))

    # Test resnetv2_101x1_bitm from timm, output_stride=8
    model = TIMMBackbone(
        model_name='resnetv2_101x1_bitm',
        features_only=True,
        pretrained=False,
        output_stride=8)
    imgs = torch.randn(1, 3, 8, 8)
    feats = model(imgs)
    assert len(feats) == 5
    assert feats[0].shape == torch.Size((1, 64, 4, 4))
    assert feats[1].shape == torch.Size((1, 256, 2, 2))
    assert feats[2].shape == torch.Size((1, 512, 1, 1))
    assert feats[3].shape == torch.Size((1, 1024, 1, 1))
    assert feats[4].shape == torch.Size((1, 2048, 1, 1))
# Copyright (c) OpenMMLab. All rights reserved.
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmpretrain.models.backbones import TinyViT
def test_assertion():
    """Invalid constructor arguments must be rejected."""
    # Unknown arch names are not accepted.
    with pytest.raises(AssertionError):
        TinyViT(arch='unknown')

    # out_indices must refer to an existing stage.
    with pytest.raises(AssertionError):
        TinyViT(out_indices=-100)
def test_tinyvit():
    """Forward-pass smoke tests for the TinyViT backbone."""
    # Default '5m' arch: a single pooled feature vector.
    model = TinyViT(arch='5m')
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size([1, 320])

    # Same arch, features requested from every stage.
    model = TinyViT(arch='5m', out_indices=(0, 1, 2, 3))
    feat = model(imgs)
    assert len(feat) == 4
    for out, dim in zip(feat, (128, 160, 320, 320)):
        assert out.shape == torch.Size([1, dim])

    # Custom arch supplied as a dict.
    model = TinyViT(
        arch={
            'depths': [2, 3, 4, 5],
            'channels': [64, 128, 256, 448],
            'num_heads': [4, 4, 4, 4]
        },
        out_indices=(0, 1, 2, 3))
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 4
    for out, dim in zip(feat, (128, 256, 448, 448)):
        assert out.shape == torch.Size([1, dim])

    # Keep spatial maps by disabling GAP before the final norm.
    model = TinyViT(
        arch='21m', out_indices=(0, 1, 2, 3), gap_before_final_norm=False)
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 4
    expected = [(192, 28, 28), (384, 14, 14), (576, 7, 7), (576, 7, 7)]
    for out, shape in zip(feat, expected):
        assert out.shape == torch.Size([1, *shape])

    # The first ``frozen_stages`` stages must be switched to eval mode.
    model = TinyViT(arch='11m', out_indices=(0, 1, 2, 3), frozen_stages=2)
    model.init_weights()
    model.train()
    for i in range(4):
        assert model.stages[i].training == (i >= 2)
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from torch.nn.modules.batchnorm import _BatchNorm
from mmpretrain.models.backbones import TNT
def check_norm_state(modules, train_state):
    """Verify every batch-norm layer's training flag equals ``train_state``."""
    norm_layers = (m for m in modules if isinstance(m, _BatchNorm))
    return all(m.training == train_state for m in norm_layers)
def test_tnt_backbone():
    """Basic forward tests for the TNT backbone."""
    # init_weights must reject a non-string ``pretrained`` value.
    with pytest.raises(TypeError):
        backbone = TNT()
        backbone.init_weights(pretrained=0)

    # Default arch (tnt_base_patch16_224): norms in train mode and a
    # single 640-d output feature.
    backbone = TNT()
    backbone.init_weights()
    backbone.train()
    assert check_norm_state(backbone.modules(), True)

    inputs = torch.randn(1, 3, 224, 224)
    outs = backbone(inputs)
    assert len(outs) == 1
    assert outs[0].shape == torch.Size((1, 640))

    # Custom arch with 768-d outer embeddings.
    custom_arch = {
        'embed_dims_outer': 768,
        'embed_dims_inner': 48,
        'num_layers': 12,
        'num_heads_outer': 6,
        'num_heads_inner': 4
    }
    backbone = TNT(arch=custom_arch)
    backbone.init_weights()
    backbone.train()
    inputs = torch.randn(1, 3, 224, 224)
    outs = backbone(inputs)
    assert len(outs) == 1
    assert outs[0].shape == torch.Size((1, 768))
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import pytest
import torch
import torch.nn as nn
from mmpretrain.models.backbones.twins import (PCPVT, SVT,
GlobalSubsampledAttention,
LocallyGroupedSelfAttention)
def test_LSA_module():
    """LocallyGroupedSelfAttention preserves the token-sequence shape."""
    attn = LocallyGroupedSelfAttention(embed_dims=32, window_size=3)
    tokens = torch.randn(1, 3136, 32)
    result = attn(tokens, (56, 56))
    assert result.shape == torch.Size([1, 3136, 32])
def test_GSA_module():
    """GlobalSubsampledAttention preserves the token-sequence shape."""
    attn = GlobalSubsampledAttention(embed_dims=32, num_heads=8)
    tokens = torch.randn(1, 3136, 32)
    result = attn(tokens, (56, 56))
    assert result.shape == torch.Size([1, 3136, 32])
def test_pcpvt():
    """Construction, forward shapes and arch/norm validation for PCPVT."""
    # test init
    path = 'PATH_THAT_DO_NOT_EXIST'
    # init_cfg loads pretrain from a non-existent file
    model = PCPVT('s', init_cfg=dict(type='Pretrained', checkpoint=path))
    assert model.init_cfg == dict(type='Pretrained', checkpoint=path)

    # Test loading a checkpoint from a non-existent file
    with pytest.raises(OSError):
        model.init_weights()

    # init_cfg=123, whose type is unsupported
    model = PCPVT('s', init_cfg=123)
    with pytest.raises(TypeError):
        model.init_weights()

    H, W = (64, 64)
    temp = torch.randn((1, 3, H, W))

    # test output last feat
    model = PCPVT('small')
    model.init_weights()
    outs = model(temp)
    assert len(outs) == 1
    assert outs[-1].shape == (1, 512, H // 32, W // 32)

    # test with multi outputs: one feature map per stage, strides 4/8/16/32
    model = PCPVT('small', out_indices=(0, 1, 2, 3))
    model.init_weights()
    outs = model(temp)
    assert len(outs) == 4
    assert outs[0].shape == (1, 64, H // 4, W // 4)
    assert outs[1].shape == (1, 128, H // 8, W // 8)
    assert outs[2].shape == (1, 320, H // 16, W // 16)
    assert outs[3].shape == (1, 512, H // 32, W // 32)

    # test with arch of dict
    arch = {
        'embed_dims': [64, 128, 320, 512],
        'depths': [3, 4, 18, 3],
        'num_heads': [1, 2, 5, 8],
        'patch_sizes': [4, 2, 2, 2],
        'strides': [4, 2, 2, 2],
        'mlp_ratios': [8, 8, 4, 4],
        'sr_ratios': [8, 4, 2, 1]
    }
    pcpvt_arch = copy.deepcopy(arch)
    model = PCPVT(pcpvt_arch, out_indices=(0, 1, 2, 3))
    model.init_weights()
    outs = model(temp)
    assert len(outs) == 4
    assert outs[0].shape == (1, 64, H // 4, W // 4)
    assert outs[1].shape == (1, 128, H // 8, W // 8)
    assert outs[2].shape == (1, 320, H // 16, W // 16)
    assert outs[3].shape == (1, 512, H // 32, W // 32)

    # assert when the lengths of arch values are not equal
    pcpvt_arch = copy.deepcopy(arch)
    pcpvt_arch['sr_ratios'] = [8, 4, 2]
    with pytest.raises(AssertionError):
        model = PCPVT(pcpvt_arch, out_indices=(0, 1, 2, 3))

    # assert when an essential arch key is missing
    pcpvt_arch = copy.deepcopy(arch)
    del pcpvt_arch['sr_ratios']
    with pytest.raises(AssertionError):
        model = PCPVT(pcpvt_arch, out_indices=(0, 1, 2, 3))

    # assert when an arch value is not a list
    pcpvt_arch = copy.deepcopy(arch)
    pcpvt_arch['sr_ratios'] = 1
    with pytest.raises(AssertionError):
        model = PCPVT(pcpvt_arch, out_indices=(0, 1, 2, 3))
    pcpvt_arch = copy.deepcopy(arch)
    pcpvt_arch['sr_ratios'] = '1, 2, 3, 4'
    with pytest.raises(AssertionError):
        model = PCPVT(pcpvt_arch, out_indices=(0, 1, 2, 3))

    # test norm_after_stage is bool True: every stage gets a LayerNorm
    model = PCPVT('small', norm_after_stage=True, norm_cfg=dict(type='LN'))
    for i in range(model.num_stage):
        assert hasattr(model, f'norm_after_stage{i}')
        assert isinstance(getattr(model, f'norm_after_stage{i}'), nn.LayerNorm)

    # test norm_after_stage is bool False: every stage gets an Identity
    model = PCPVT('small', norm_after_stage=False)
    for i in range(model.num_stage):
        assert hasattr(model, f'norm_after_stage{i}')
        assert isinstance(getattr(model, f'norm_after_stage{i}'), nn.Identity)

    # test norm_after_stage is a bool list (per-stage switches)
    norm_after_stage = [False, True, False, True]
    model = PCPVT('small', norm_after_stage=norm_after_stage)
    assert len(norm_after_stage) == model.num_stage
    for i in range(model.num_stage):
        assert hasattr(model, f'norm_after_stage{i}')
        norm_layer = getattr(model, f'norm_after_stage{i}')
        if norm_after_stage[i]:
            assert isinstance(norm_layer, nn.LayerNorm)
        else:
            assert isinstance(norm_layer, nn.Identity)

    # test norm_after_stage is not a pure bool list: must be rejected
    norm_after_stage = [False, 'True', False, True]
    with pytest.raises(AssertionError):
        model = PCPVT('small', norm_after_stage=norm_after_stage)
def test_svt():
    """Test the SVT (Twins-SVT) backbone.

    Covers: ``init_cfg`` handling (pretrained path / invalid type), single
    and multi-stage feature-map shapes, custom arch dicts (valid and
    malformed), and all accepted forms of ``norm_after_stage``.
    """
    # test init
    path = 'PATH_THAT_DO_NOT_EXIST'
    # init_cfg loads pretrain from a non-existent file
    model = SVT('s', init_cfg=dict(type='Pretrained', checkpoint=path))
    assert model.init_cfg == dict(type='Pretrained', checkpoint=path)
    # Test loading a checkpoint from a non-existent file
    with pytest.raises(OSError):
        model.init_weights()
    # init_cfg=123, whose type is unsupported
    model = SVT('s', init_cfg=123)
    with pytest.raises(TypeError):
        model.init_weights()
    # Test feature map output
    H, W = (64, 64)
    temp = torch.randn((1, 3, H, W))
    model = SVT('s')
    model.init_weights()
    outs = model(temp)
    assert len(outs) == 1
    assert outs[-1].shape == (1, 512, H // 32, W // 32)
    # test with multi outputs
    model = SVT('small', out_indices=(0, 1, 2, 3))
    model.init_weights()
    outs = model(temp)
    assert len(outs) == 4
    # Stage i downsamples by 4 * 2**i and doubles the channel width.
    assert outs[0].shape == (1, 64, H // 4, W // 4)
    assert outs[1].shape == (1, 128, H // 8, W // 8)
    assert outs[2].shape == (1, 256, H // 16, W // 16)
    assert outs[3].shape == (1, 512, H // 32, W // 32)
    # test with arch of dict
    arch = {
        'embed_dims': [96, 192, 384, 768],
        'depths': [2, 2, 18, 2],
        'num_heads': [3, 6, 12, 24],
        'patch_sizes': [4, 2, 2, 2],
        'strides': [4, 2, 2, 2],
        'mlp_ratios': [4, 4, 4, 4],
        'sr_ratios': [8, 4, 2, 1],
        'window_sizes': [7, 7, 7, 7]
    }
    model = SVT(arch, out_indices=(0, 1, 2, 3))
    model.init_weights()
    outs = model(temp)
    assert len(outs) == 4
    assert outs[0].shape == (1, 96, H // 4, W // 4)
    assert outs[1].shape == (1, 192, H // 8, W // 8)
    assert outs[2].shape == (1, 384, H // 16, W // 16)
    assert outs[3].shape == (1, 768, H // 32, W // 32)
    # assert when the length of an arch value does not match the stage count
    svt_arch = copy.deepcopy(arch)
    svt_arch['sr_ratios'] = [8, 4, 2]
    with pytest.raises(AssertionError):
        model = SVT(svt_arch, out_indices=(0, 1, 2, 3))
    # assert when the arch dict lacks an essential key
    svt_arch = copy.deepcopy(arch)
    del svt_arch['window_sizes']
    with pytest.raises(AssertionError):
        model = SVT(svt_arch, out_indices=(0, 1, 2, 3))
    # assert when an arch value is not a list
    svt_arch = copy.deepcopy(arch)
    svt_arch['sr_ratios'] = 1
    with pytest.raises(AssertionError):
        model = SVT(svt_arch, out_indices=(0, 1, 2, 3))
    svt_arch = copy.deepcopy(arch)
    svt_arch['sr_ratios'] = '1, 2, 3, 4'
    with pytest.raises(AssertionError):
        model = SVT(svt_arch, out_indices=(0, 1, 2, 3))
    # test norm_after_stage is bool True
    model = SVT('small', norm_after_stage=True, norm_cfg=dict(type='LN'))
    for i in range(model.num_stage):
        assert hasattr(model, f'norm_after_stage{i}')
        assert isinstance(getattr(model, f'norm_after_stage{i}'), nn.LayerNorm)
    # test norm_after_stage is bool False
    model = SVT('small', norm_after_stage=False)
    for i in range(model.num_stage):
        assert hasattr(model, f'norm_after_stage{i}')
        assert isinstance(getattr(model, f'norm_after_stage{i}'), nn.Identity)
    # test norm_after_stage is a bool list (per-stage on/off)
    norm_after_stage = [False, True, False, True]
    model = SVT('small', norm_after_stage=norm_after_stage)
    assert len(norm_after_stage) == model.num_stage
    for i in range(model.num_stage):
        assert hasattr(model, f'norm_after_stage{i}')
        norm_layer = getattr(model, f'norm_after_stage{i}')
        if norm_after_stage[i]:
            assert isinstance(norm_layer, nn.LayerNorm)
        else:
            assert isinstance(norm_layer, nn.Identity)
    # test norm_after_stage is not a bool list
    norm_after_stage = [False, 'True', False, True]
    with pytest.raises(AssertionError):
        model = SVT('small', norm_after_stage=norm_after_stage)
# Copyright (c) OpenMMLab. All rights reserved.
import math
from copy import deepcopy
from itertools import chain
from unittest import TestCase
import torch
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm
from torch import nn
from mmpretrain.models.backbones import VAN
def check_norm_state(modules, train_state):
    """Return True iff every batch-norm layer matches ``train_state``.

    Non-norm modules in ``modules`` are ignored; an iterable with no
    batch-norm layers trivially passes.
    """
    return all(mod.training == train_state for mod in modules
               if isinstance(mod, _BatchNorm))
class TestVAN(TestCase):
    """Unit tests for the VAN (Visual Attention Network) backbone."""

    def setUp(self):
        # Baseline config: tiny arch with stochastic depth enabled.
        self.cfg = dict(arch='t', drop_path_rate=0.1)

    def test_arch(self):
        """Test arch validation and custom arch dicts."""
        # Test invalid default arch
        with self.assertRaisesRegex(AssertionError, 'not in default archs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = 'unknown'
            VAN(**cfg)
        # Test invalid custom arch (missing the 'depths' essential key)
        with self.assertRaisesRegex(AssertionError, 'Custom arch needs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = {
                'embed_dims': [32, 64, 160, 256],
                'ffn_ratios': [8, 8, 4, 4],
            }
            VAN(**cfg)
        # Test custom arch
        cfg = deepcopy(self.cfg)
        embed_dims = [32, 64, 160, 256]
        depths = [3, 3, 5, 2]
        ffn_ratios = [8, 8, 4, 4]
        cfg['arch'] = {
            'embed_dims': embed_dims,
            'depths': depths,
            'ffn_ratios': ffn_ratios
        }
        model = VAN(**cfg)
        # Stage attributes are 1-indexed: blocks1..blocks4.
        for i in range(len(depths)):
            stage = getattr(model, f'blocks{i + 1}')
            self.assertEqual(stage[-1].out_channels, embed_dims[i])
            self.assertEqual(len(stage), depths[i])

    def test_init_weights(self):
        """Test that init_cfg-driven initialization changes the weights."""
        # test weight init cfg
        cfg = deepcopy(self.cfg)
        cfg['init_cfg'] = [
            dict(
                type='Kaiming',
                layer='Conv2d',
                mode='fan_in',
                nonlinearity='linear')
        ]
        model = VAN(**cfg)
        ori_weight = model.patch_embed1.projection.weight.clone().detach()
        model.init_weights()
        initialized_weight = model.patch_embed1.projection.weight
        # Kaiming init should have re-drawn the conv weights.
        self.assertFalse(torch.allclose(ori_weight, initialized_weight))

    def test_forward(self):
        """Test output shapes for default/custom patch sizes, multiple
        out_indices and dynamic input shapes."""
        imgs = torch.randn(3, 3, 224, 224)
        cfg = deepcopy(self.cfg)
        model = VAN(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        feat = outs[-1]
        self.assertEqual(feat.shape, (3, 256, 7, 7))
        # test with patch_sizes
        cfg = deepcopy(self.cfg)
        cfg['patch_sizes'] = [7, 5, 5, 5]
        model = VAN(**cfg)
        outs = model(torch.randn(3, 3, 224, 224))
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        feat = outs[-1]
        self.assertEqual(feat.shape, (3, 256, 3, 3))
        # test multiple output indices
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = (0, 1, 2, 3)
        model = VAN(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 4)
        # Stage outputs shrink by powers of two starting from 56x56.
        for emb_size, stride, out in zip([32, 64, 160, 256], [1, 2, 4, 8],
                                         outs):
            self.assertEqual(out.shape,
                             (3, emb_size, 56 // stride, 56 // stride))
        # test with dynamic input shape
        imgs1 = torch.randn(3, 3, 224, 224)
        imgs2 = torch.randn(3, 3, 256, 256)
        imgs3 = torch.randn(3, 3, 256, 309)
        cfg = deepcopy(self.cfg)
        model = VAN(**cfg)
        for imgs in [imgs1, imgs2, imgs3]:
            outs = model(imgs)
            self.assertIsInstance(outs, tuple)
            self.assertEqual(len(outs), 1)
            feat = outs[-1]
            # Overall stride is 32; odd sizes round up.
            expect_feat_shape = (math.ceil(imgs.shape[2] / 32),
                                 math.ceil(imgs.shape[3] / 32))
            self.assertEqual(feat.shape, (3, 256, *expect_feat_shape))

    def test_structure(self):
        """Test drop-path decay, norm_eval and frozen_stages behavior."""
        # test drop_path_rate decay
        cfg = deepcopy(self.cfg)
        cfg['drop_path_rate'] = 0.2
        model = VAN(**cfg)
        depths = model.arch_settings['depths']
        stages = [model.blocks1, model.blocks2, model.blocks3, model.blocks4]
        blocks = chain(*[stage for stage in stages])
        total_depth = sum(depths)
        # Expected drop-path probabilities increase linearly over all blocks.
        dpr = [
            x.item()
            for x in torch.linspace(0, cfg['drop_path_rate'], total_depth)
        ]
        for i, (block, expect_prob) in enumerate(zip(blocks, dpr)):
            if expect_prob == 0:
                assert isinstance(block.drop_path, nn.Identity)
            else:
                self.assertAlmostEqual(block.drop_path.drop_prob, expect_prob)
        # test VAN with norm_eval=True
        cfg = deepcopy(self.cfg)
        cfg['norm_eval'] = True
        cfg['norm_cfg'] = dict(type='BN')
        model = VAN(**cfg)
        model.init_weights()
        model.train()
        # norm_eval keeps all BN layers in eval mode even during training.
        self.assertTrue(check_norm_state(model.modules(), False))
        # test VAN with first stage frozen.
        cfg = deepcopy(self.cfg)
        frozen_stages = 0
        cfg['frozen_stages'] = frozen_stages
        cfg['out_indices'] = (0, 1, 2, 3)
        model = VAN(**cfg)
        model.init_weights()
        model.train()
        # the patch_embed and first stage should not require grad.
        self.assertFalse(model.patch_embed1.training)
        for param in model.patch_embed1.parameters():
            self.assertFalse(param.requires_grad)
        for i in range(frozen_stages + 1):
            patch = getattr(model, f'patch_embed{i+1}')
            for param in patch.parameters():
                self.assertFalse(param.requires_grad)
            blocks = getattr(model, f'blocks{i + 1}')
            for param in blocks.parameters():
                self.assertFalse(param.requires_grad)
            norm = getattr(model, f'norm{i + 1}')
            for param in norm.parameters():
                self.assertFalse(param.requires_grad)
        # the remaining stages should require grad.
        for i in range(frozen_stages + 1, 4):
            patch = getattr(model, f'patch_embed{i + 1}')
            for param in patch.parameters():
                self.assertTrue(param.requires_grad)
            blocks = getattr(model, f'blocks{i+1}')
            for param in blocks.parameters():
                self.assertTrue(param.requires_grad)
            norm = getattr(model, f'norm{i + 1}')
            for param in norm.parameters():
                self.assertTrue(param.requires_grad)
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm
from mmpretrain.models.backbones import VGG
def check_norm_state(modules, train_state):
    """Return True iff all batch-norm layers are in the given train state.

    Modules that are not batch-norm layers are skipped; with no norm
    layers present the check passes vacuously.
    """
    norm_layers = (m for m in modules if isinstance(m, _BatchNorm))
    return all(layer.training == train_state for layer in norm_layers)
def test_vgg():
    """Test VGG backbone.

    Covers: invalid depth/num_stages/dilations validation, norm_eval,
    forward shapes with and without BN, partial out_indices, and the
    classification-score output when ``num_classes`` is set.
    """
    with pytest.raises(KeyError):
        # VGG depth should be in [11, 13, 16, 19]
        VGG(18)
    with pytest.raises(AssertionError):
        # In VGG: 1 <= num_stages <= 5
        VGG(11, num_stages=0)
    with pytest.raises(AssertionError):
        # In VGG: 1 <= num_stages <= 5
        VGG(11, num_stages=6)
    with pytest.raises(AssertionError):
        # len(dilations) == num_stages
        VGG(11, dilations=(1, 1), num_stages=3)
    with pytest.raises(TypeError):
        # pretrained must be a string path
        model = VGG(11)
        model.init_weights(pretrained=0)
    # Test VGG11 norm_eval=True: BN layers stay in eval mode during train()
    model = VGG(11, norm_eval=True)
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), False)
    # Test VGG11 forward without classifiers
    model = VGG(11, out_indices=(0, 1, 2, 3, 4))
    model.init_weights()
    model.train()
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 5
    # Each stage halves the spatial size and (up to 512) doubles channels.
    assert feat[0].shape == (1, 64, 112, 112)
    assert feat[1].shape == (1, 128, 56, 56)
    assert feat[2].shape == (1, 256, 28, 28)
    assert feat[3].shape == (1, 512, 14, 14)
    assert feat[4].shape == (1, 512, 7, 7)
    # Test VGG11 forward with classifiers (index 5 is the class score)
    model = VGG(11, num_classes=10, out_indices=(0, 1, 2, 3, 4, 5))
    model.init_weights()
    model.train()
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 6
    assert feat[0].shape == (1, 64, 112, 112)
    assert feat[1].shape == (1, 128, 56, 56)
    assert feat[2].shape == (1, 256, 28, 28)
    assert feat[3].shape == (1, 512, 14, 14)
    assert feat[4].shape == (1, 512, 7, 7)
    assert feat[5].shape == (1, 10)
    # Test VGG11BN forward
    model = VGG(11, norm_cfg=dict(type='BN'), out_indices=(0, 1, 2, 3, 4))
    model.init_weights()
    model.train()
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 5
    assert feat[0].shape == (1, 64, 112, 112)
    assert feat[1].shape == (1, 128, 56, 56)
    assert feat[2].shape == (1, 256, 28, 28)
    assert feat[3].shape == (1, 512, 14, 14)
    assert feat[4].shape == (1, 512, 7, 7)
    # Test VGG11BN forward with classifiers
    model = VGG(
        11,
        num_classes=10,
        norm_cfg=dict(type='BN'),
        out_indices=(0, 1, 2, 3, 4, 5))
    model.init_weights()
    model.train()
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 6
    assert feat[0].shape == (1, 64, 112, 112)
    assert feat[1].shape == (1, 128, 56, 56)
    assert feat[2].shape == (1, 256, 28, 28)
    assert feat[3].shape == (1, 512, 14, 14)
    assert feat[4].shape == (1, 512, 7, 7)
    assert feat[5].shape == (1, 10)
    # Test VGG13 with layers 1, 2, 3 out forward
    model = VGG(13, out_indices=(0, 1, 2))
    model.init_weights()
    model.train()
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 3
    assert feat[0].shape == (1, 64, 112, 112)
    assert feat[1].shape == (1, 128, 56, 56)
    assert feat[2].shape == (1, 256, 28, 28)
    # Test VGG16 with top feature maps out forward (default out_indices)
    model = VGG(16)
    model.init_weights()
    model.train()
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == (1, 512, 7, 7)
    # Test VGG19 with classification score out forward
    model = VGG(19, num_classes=10)
    model.init_weights()
    model.train()
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == (1, 10)
# Copyright (c) OpenMMLab. All rights reserved.
import math
import os
import tempfile
from copy import deepcopy
from unittest import TestCase
import torch
from mmengine.runner import load_checkpoint, save_checkpoint
from mmpretrain.models.backbones import VisionTransformer
from .utils import timm_resize_pos_embed
class TestVisionTransformer(TestCase):
    """Unit tests for the VisionTransformer backbone."""

    def setUp(self):
        # Baseline config: ViT-Base, 224x224 input, 16x16 patches.
        self.cfg = dict(
            arch='b', img_size=224, patch_size=16, drop_path_rate=0.1)

    def test_structure(self):
        """Test arch validation, out_indices checks and layer structure."""
        # Test invalid default arch
        with self.assertRaisesRegex(AssertionError, 'not in default archs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = 'unknown'
            VisionTransformer(**cfg)
        # Test invalid custom arch (missing 'embed_dims')
        with self.assertRaisesRegex(AssertionError, 'Custom arch needs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = {
                'num_layers': 24,
                'num_heads': 16,
                'feedforward_channels': 4096
            }
            VisionTransformer(**cfg)
        # Test custom arch
        cfg = deepcopy(self.cfg)
        cfg['arch'] = {
            'embed_dims': 128,
            'num_layers': 24,
            'num_heads': 16,
            'feedforward_channels': 1024
        }
        model = VisionTransformer(**cfg)
        self.assertEqual(model.embed_dims, 128)
        self.assertEqual(model.num_layers, 24)
        for layer in model.layers:
            self.assertEqual(layer.attn.num_heads, 16)
            self.assertEqual(layer.ffn.feedforward_channels, 1024)
        # Test out_indices
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = {1: 1}
        with self.assertRaisesRegex(AssertionError, "get <class 'dict'>"):
            VisionTransformer(**cfg)
        cfg['out_indices'] = [0, 13]
        with self.assertRaisesRegex(AssertionError, 'Invalid out_indices 13'):
            VisionTransformer(**cfg)
        # Test model structure
        cfg = deepcopy(self.cfg)
        model = VisionTransformer(**cfg)
        self.assertEqual(len(model.layers), 12)
        # Drop-path rate grows linearly from 0 to drop_path_rate (0.1)
        # over the 12 layers.
        dpr_inc = 0.1 / (12 - 1)
        dpr = 0
        for layer in model.layers:
            self.assertEqual(layer.attn.embed_dims, 768)
            self.assertEqual(layer.attn.num_heads, 12)
            self.assertEqual(layer.ffn.feedforward_channels, 3072)
            self.assertAlmostEqual(layer.attn.out_drop.drop_prob, dpr)
            self.assertAlmostEqual(layer.ffn.dropout_layer.drop_prob, dpr)
            dpr += dpr_inc
        # Test model structure: prenorm
        cfg = deepcopy(self.cfg)
        cfg['pre_norm'] = True
        model = VisionTransformer(**cfg)
        self.assertNotEqual(model.pre_norm.__class__, torch.nn.Identity)

    def test_init_weights(self):
        """Test weight init and checkpoint loading (incl. pos_embed resize)."""
        # test weight init cfg
        cfg = deepcopy(self.cfg)
        cfg['init_cfg'] = [
            dict(
                type='Kaiming',
                layer='Conv2d',
                mode='fan_in',
                nonlinearity='linear')
        ]
        model = VisionTransformer(**cfg)
        ori_weight = model.patch_embed.projection.weight.clone().detach()
        # The pos_embed is all zero before initialize
        self.assertTrue(torch.allclose(model.pos_embed, torch.tensor(0.)))
        model.init_weights()
        initialized_weight = model.patch_embed.projection.weight
        self.assertFalse(torch.allclose(ori_weight, initialized_weight))
        self.assertFalse(torch.allclose(model.pos_embed, torch.tensor(0.)))
        # test load checkpoint
        pretrain_pos_embed = model.pos_embed.clone().detach()
        tmpdir = tempfile.gettempdir()
        checkpoint = os.path.join(tmpdir, 'test.pth')
        save_checkpoint(model.state_dict(), checkpoint)
        cfg = deepcopy(self.cfg)
        model = VisionTransformer(**cfg)
        load_checkpoint(model, checkpoint, strict=True)
        self.assertTrue(torch.allclose(model.pos_embed, pretrain_pos_embed))
        # test load checkpoint with different img_size: the pos_embed must
        # be interpolated to the new grid, matching timm's reference resize.
        cfg = deepcopy(self.cfg)
        cfg['img_size'] = 384
        model = VisionTransformer(**cfg)
        load_checkpoint(model, checkpoint, strict=True)
        resized_pos_embed = timm_resize_pos_embed(pretrain_pos_embed,
                                                  model.pos_embed)
        self.assertTrue(torch.allclose(model.pos_embed, resized_pos_embed))
        os.remove(checkpoint)

    def test_forward(self):
        """Test forward output types/shapes for the supported out_type
        settings, multiple out_indices and dynamic input sizes."""
        imgs = torch.randn(1, 3, 224, 224)
        # test with_cls_token=False
        cfg = deepcopy(self.cfg)
        cfg['with_cls_token'] = False
        cfg['out_type'] = 'cls_token'
        with self.assertRaisesRegex(ValueError, 'must be True'):
            VisionTransformer(**cfg)
        cfg = deepcopy(self.cfg)
        cfg['with_cls_token'] = False
        cfg['out_type'] = 'featmap'
        model = VisionTransformer(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        patch_token = outs[-1]
        self.assertEqual(patch_token.shape, (1, 768, 14, 14))
        # test with output cls_token
        cfg = deepcopy(self.cfg)
        model = VisionTransformer(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        cls_token = outs[-1]
        self.assertEqual(cls_token.shape, (1, 768))
        # Test forward with multi out indices (negative indices wrap around)
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = [-3, -2, -1]
        model = VisionTransformer(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 3)
        for out in outs:
            self.assertEqual(out.shape, (1, 768))
        # Test forward with dynamic input size
        imgs1 = torch.randn(1, 3, 224, 224)
        imgs2 = torch.randn(1, 3, 256, 256)
        imgs3 = torch.randn(1, 3, 256, 309)
        cfg = deepcopy(self.cfg)
        cfg['out_type'] = 'featmap'
        model = VisionTransformer(**cfg)
        for imgs in [imgs1, imgs2, imgs3]:
            outs = model(imgs)
            self.assertIsInstance(outs, tuple)
            self.assertEqual(len(outs), 1)
            patch_token = outs[-1]
            # Patch grid is ceil(H/16) x ceil(W/16).
            expect_feat_shape = (math.ceil(imgs.shape[2] / 16),
                                 math.ceil(imgs.shape[3] / 16))
            self.assertEqual(patch_token.shape, (1, 768, *expect_feat_shape))
# Copyright (c) OpenMMLab. All rights reserved.
# The basic forward/backward tests are in ../test_models.py
import torch
from mmpretrain.apis import get_model
def test_out_type():
    """Check XCiT backbone output shapes for every supported ``out_type``.

    Builds the same XCiT-nano model once per output type (neck and head
    disabled so the backbone output is returned directly) and verifies
    the resulting tensor shape.
    """
    inputs = torch.rand(1, 3, 224, 224)
    # out_type -> expected backbone output shape (197 = 196 patches + cls).
    expected_shapes = {
        'raw': (1, 197, 128),
        'featmap': (1, 128, 14, 14),
        'cls_token': (1, 128),
        'avg_featmap': (1, 128),
    }
    for out_type, shape in expected_shapes.items():
        model = get_model(
            'xcit-nano-12-p16_3rdparty_in1k',
            backbone=dict(out_type=out_type),
            neck=None,
            head=None)
        outputs = model(inputs)[0]
        assert outputs.shape == shape
# Copyright (c) OpenMMLab. All rights reserved.
import math
import torch
import torch.nn.functional as F
def timm_resize_pos_embed(posemb, posemb_new, num_tokens=1, gs_new=()):
    """Timm version pos embed resize function.

    copied from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py

    Splits off the first ``num_tokens`` entries (e.g. the cls token),
    bicubically interpolates the remaining square patch grid to the size
    implied by ``posemb_new`` (or to ``gs_new`` if given), and re-attaches
    the prefix tokens unchanged.
    """  # noqa:E501
    grid_len_new = posemb_new.shape[1]
    if num_tokens:
        # Keep prefix tokens (cls/dist) aside; they are not interpolated.
        prefix_tokens = posemb[:, :num_tokens]
        grid_tokens = posemb[0, num_tokens:]
        grid_len_new -= num_tokens
    else:
        prefix_tokens = posemb[:, :0]
        grid_tokens = posemb[0]
    # Old grid is assumed square.
    gs_old = int(math.sqrt(len(grid_tokens)))
    if not len(gs_new):  # backwards compatibility
        gs_new = [int(math.sqrt(grid_len_new))] * 2
    assert len(gs_new) >= 2
    # (N, C) -> (1, C, gs_old, gs_old) for spatial interpolation.
    grid = grid_tokens.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
    grid = F.interpolate(
        grid, size=gs_new, mode='bicubic', align_corners=False)
    # Back to token layout: (1, gs_new[0]*gs_new[1], C).
    grid = grid.permute(0, 2, 3, 1).reshape(1, gs_new[0] * gs_new[1], -1)
    return torch.cat([prefix_tokens, grid], dim=1)
# Copyright (c) OpenMMLab. All rights reserved.
import unittest
from unittest import TestCase
from unittest.mock import MagicMock
import torch
import torch.nn as nn
from mmengine import ConfigDict
from mmpretrain.models import ImageClassifier
from mmpretrain.registry import MODELS
from mmpretrain.structures import DataSample
def has_timm() -> bool:
    """Return whether the optional ``timm`` package is importable."""
    try:
        import timm  # noqa: F401
    except ImportError:
        return False
    return True
def has_huggingface() -> bool:
    """Return whether the optional ``transformers`` package is importable."""
    try:
        import transformers  # noqa: F401
    except ImportError:
        return False
    return True
class TestImageClassifier(TestCase):
    """Unit tests for ``ImageClassifier`` (ResNet-18 backbone, GAP neck,
    linear head)."""

    # Baseline classifier config reused (and tweaked) by every test.
    DEFAULT_ARGS = dict(
        type='ImageClassifier',
        backbone=dict(type='ResNet', depth=18),
        neck=dict(type='GlobalAveragePooling'),
        head=dict(
            type='LinearClsHead',
            num_classes=10,
            in_channels=512,
            loss=dict(type='CrossEntropyLoss')))

    def test_initialize(self):
        """Test construction: neck/head presence, pretrained shortcut and
        batch augments from ``train_cfg``."""
        model = MODELS.build(self.DEFAULT_ARGS)
        self.assertTrue(model.with_neck)
        self.assertTrue(model.with_head)
        # 'pretrained' is a shortcut that becomes a Pretrained init_cfg.
        cfg = {**self.DEFAULT_ARGS, 'pretrained': 'checkpoint'}
        model = MODELS.build(cfg)
        self.assertDictEqual(model.init_cfg,
                             dict(type='Pretrained', checkpoint='checkpoint'))
        cfg = ConfigDict(self.DEFAULT_ARGS)
        cfg.pop('neck')
        model = MODELS.build(cfg)
        self.assertFalse(model.with_neck)
        cfg = ConfigDict(self.DEFAULT_ARGS)
        cfg.pop('head')
        model = MODELS.build(cfg)
        self.assertFalse(model.with_head)
        # test set batch augmentation from train_cfg
        cfg = {
            **self.DEFAULT_ARGS, 'train_cfg':
            dict(augments=dict(type='Mixup', alpha=1.))
        }
        model: ImageClassifier = MODELS.build(cfg)
        self.assertIsNotNone(model.data_preprocessor.batch_augments)
        cfg = {**self.DEFAULT_ARGS, 'train_cfg': dict()}
        model: ImageClassifier = MODELS.build(cfg)
        self.assertIsNone(model.data_preprocessor.batch_augments)

    def test_extract_feat(self):
        """Test feature extraction at each stage: backbone, neck and
        pre_logits, plus the multi-input ``extract_feats`` variant."""
        inputs = torch.rand(1, 3, 224, 224)
        cfg = ConfigDict(self.DEFAULT_ARGS)
        cfg.backbone.out_indices = (0, 1, 2, 3)
        model: ImageClassifier = MODELS.build(cfg)
        # test backbone output
        feats = model.extract_feat(inputs, stage='backbone')
        self.assertEqual(len(feats), 4)
        self.assertEqual(feats[0].shape, (1, 64, 56, 56))
        self.assertEqual(feats[1].shape, (1, 128, 28, 28))
        self.assertEqual(feats[2].shape, (1, 256, 14, 14))
        self.assertEqual(feats[3].shape, (1, 512, 7, 7))
        # test neck output (GAP collapses the spatial dims)
        feats = model.extract_feat(inputs, stage='neck')
        self.assertEqual(len(feats), 4)
        self.assertEqual(feats[0].shape, (1, 64))
        self.assertEqual(feats[1].shape, (1, 128))
        self.assertEqual(feats[2].shape, (1, 256))
        self.assertEqual(feats[3].shape, (1, 512))
        # test pre_logits output
        feats = model.extract_feat(inputs, stage='pre_logits')
        self.assertEqual(feats.shape, (1, 512))
        # TODO: test transformer style feature extraction
        # test extract_feats
        multi_feats = model.extract_feats([inputs, inputs], stage='backbone')
        self.assertEqual(len(multi_feats), 2)
        for feats in multi_feats:
            self.assertEqual(len(feats), 4)
            self.assertEqual(feats[0].shape, (1, 64, 56, 56))
            self.assertEqual(feats[1].shape, (1, 128, 28, 28))
            self.assertEqual(feats[2].shape, (1, 256, 14, 14))
            self.assertEqual(feats[3].shape, (1, 512, 7, 7))
        # Without neck, stage='neck' falls back to the backbone output.
        cfg = ConfigDict(self.DEFAULT_ARGS)
        cfg.backbone.out_indices = (0, 1, 2, 3)
        cfg.pop('neck')
        model: ImageClassifier = MODELS.build(cfg)
        feats = model.extract_feat(inputs, stage='neck')
        self.assertEqual(len(feats), 4)
        self.assertEqual(feats[0].shape, (1, 64, 56, 56))
        self.assertEqual(feats[1].shape, (1, 128, 28, 28))
        self.assertEqual(feats[2].shape, (1, 256, 14, 14))
        self.assertEqual(feats[3].shape, (1, 512, 7, 7))
        # Without head, raise error
        cfg = ConfigDict(self.DEFAULT_ARGS)
        cfg.backbone.out_indices = (0, 1, 2, 3)
        cfg.pop('head')
        model: ImageClassifier = MODELS.build(cfg)
        with self.assertRaisesRegex(AssertionError, 'No head or the head'):
            model.extract_feat(inputs, stage='pre_logits')
        with self.assertRaisesRegex(AssertionError, 'use `extract_feat`'):
            model.extract_feats(inputs)

    def test_loss(self):
        """Test that ``loss`` returns a positive scalar loss."""
        inputs = torch.rand(1, 3, 224, 224)
        data_samples = [DataSample().set_gt_label(1)]
        model: ImageClassifier = MODELS.build(self.DEFAULT_ARGS)
        losses = model.loss(inputs, data_samples)
        self.assertGreater(losses['loss'].item(), 0)

    def test_predict(self):
        """Test ``predict`` with and without input data samples."""
        inputs = torch.rand(1, 3, 224, 224)
        data_samples = [DataSample().set_gt_label(1)]
        model: ImageClassifier = MODELS.build(self.DEFAULT_ARGS)
        predictions = model.predict(inputs)
        self.assertEqual(predictions[0].pred_score.shape, (10, ))
        # When data_samples are given, predictions are written back to them.
        predictions = model.predict(inputs, data_samples)
        self.assertEqual(predictions[0].pred_score.shape, (10, ))
        self.assertEqual(data_samples[0].pred_score.shape, (10, ))
        torch.testing.assert_allclose(data_samples[0].pred_score,
                                      predictions[0].pred_score)

    def test_forward(self):
        """Test the unified ``forward`` in tensor/loss/predict modes."""
        inputs = torch.rand(1, 3, 224, 224)
        data_samples = [DataSample().set_gt_label(1)]
        model: ImageClassifier = MODELS.build(self.DEFAULT_ARGS)
        # test pure forward
        outs = model(inputs)
        self.assertIsInstance(outs, torch.Tensor)
        # test forward train
        losses = model(inputs, data_samples, mode='loss')
        self.assertGreater(losses['loss'].item(), 0)
        # test forward test
        predictions = model(inputs, mode='predict')
        self.assertEqual(predictions[0].pred_score.shape, (10, ))
        predictions = model(inputs, data_samples, mode='predict')
        self.assertEqual(predictions[0].pred_score.shape, (10, ))
        self.assertEqual(data_samples[0].pred_score.shape, (10, ))
        torch.testing.assert_allclose(data_samples[0].pred_score,
                                      predictions[0].pred_score)
        # test forward with invalid mode
        with self.assertRaisesRegex(RuntimeError, 'Invalid mode "unknown"'):
            model(inputs, mode='unknown')

    def test_train_step(self):
        """Test one training step with a mocked optimizer wrapper."""
        cfg = {
            **self.DEFAULT_ARGS, 'data_preprocessor':
            dict(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5])
        }
        model: ImageClassifier = MODELS.build(cfg)
        data = {
            'inputs': torch.randint(0, 256, (1, 3, 224, 224)),
            'data_samples': [DataSample().set_gt_label(1)]
        }
        optim_wrapper = MagicMock()
        log_vars = model.train_step(data, optim_wrapper)
        self.assertIn('loss', log_vars)
        optim_wrapper.update_params.assert_called_once()

    def test_val_step(self):
        """Test one validation step returns predictions."""
        cfg = {
            **self.DEFAULT_ARGS, 'data_preprocessor':
            dict(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5])
        }
        model: ImageClassifier = MODELS.build(cfg)
        data = {
            'inputs': torch.randint(0, 256, (1, 3, 224, 224)),
            'data_samples': [DataSample().set_gt_label(1)]
        }
        predictions = model.val_step(data)
        self.assertEqual(predictions[0].pred_score.shape, (10, ))

    def test_test_step(self):
        """Test one test step returns predictions."""
        cfg = {
            **self.DEFAULT_ARGS, 'data_preprocessor':
            dict(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5])
        }
        model: ImageClassifier = MODELS.build(cfg)
        data = {
            'inputs': torch.randint(0, 256, (1, 3, 224, 224)),
            'data_samples': [DataSample().set_gt_label(1)]
        }
        predictions = model.test_step(data)
        self.assertEqual(predictions[0].pred_score.shape, (10, ))
@unittest.skipIf(not has_timm(), 'timm is not installed.')
class TestTimmClassifier(TestCase):
    """Unit tests for ``TimmClassifier`` wrapping a timm resnet18
    (1000 ImageNet classes). Skipped when timm is unavailable."""

    # Baseline config reused (and tweaked) by every test.
    DEFAULT_ARGS = dict(
        type='TimmClassifier',
        model_name='resnet18',
        loss=dict(type='CrossEntropyLoss'),
    )

    def test_initialize(self):
        """Test construction and batch augments from ``train_cfg``."""
        model = MODELS.build(self.DEFAULT_ARGS)
        assert isinstance(model.model, nn.Module)
        # test set batch augmentation from train_cfg
        cfg = {
            **self.DEFAULT_ARGS, 'train_cfg':
            dict(augments=dict(type='Mixup', alpha=1.))
        }
        model: ImageClassifier = MODELS.build(cfg)
        self.assertIsNotNone(model.data_preprocessor.batch_augments)
        cfg = {**self.DEFAULT_ARGS, 'train_cfg': dict()}
        model: ImageClassifier = MODELS.build(cfg)
        self.assertIsNone(model.data_preprocessor.batch_augments)

    def test_loss(self):
        """Test that ``loss`` returns a positive scalar loss."""
        inputs = torch.rand(1, 3, 224, 224)
        data_samples = [DataSample().set_gt_label(1)]
        model: ImageClassifier = MODELS.build(self.DEFAULT_ARGS)
        losses = model.loss(inputs, data_samples)
        self.assertGreater(losses['loss'].item(), 0)

    def test_predict(self):
        """Test ``predict`` with and without input data samples."""
        inputs = torch.rand(1, 3, 224, 224)
        data_samples = [DataSample().set_gt_label(1)]
        model: ImageClassifier = MODELS.build(self.DEFAULT_ARGS)
        predictions = model.predict(inputs)
        self.assertEqual(predictions[0].pred_score.shape, (1000, ))
        # When data_samples are given, predictions are written back to them.
        predictions = model.predict(inputs, data_samples)
        self.assertEqual(predictions[0].pred_score.shape, (1000, ))
        self.assertEqual(data_samples[0].pred_score.shape, (1000, ))
        torch.testing.assert_allclose(data_samples[0].pred_score,
                                      predictions[0].pred_score)

    def test_forward(self):
        """Test the unified ``forward`` in tensor/loss/predict modes."""
        inputs = torch.rand(1, 3, 224, 224)
        data_samples = [DataSample().set_gt_label(1)]
        model: ImageClassifier = MODELS.build(self.DEFAULT_ARGS)
        # test pure forward
        outs = model(inputs)
        self.assertIsInstance(outs, torch.Tensor)
        # test forward train
        losses = model(inputs, data_samples, mode='loss')
        self.assertGreater(losses['loss'].item(), 0)
        # test forward test
        predictions = model(inputs, mode='predict')
        self.assertEqual(predictions[0].pred_score.shape, (1000, ))
        predictions = model(inputs, data_samples, mode='predict')
        self.assertEqual(predictions[0].pred_score.shape, (1000, ))
        self.assertEqual(data_samples[0].pred_score.shape, (1000, ))
        torch.testing.assert_allclose(data_samples[0].pred_score,
                                      predictions[0].pred_score)
        # test forward with invalid mode
        with self.assertRaisesRegex(RuntimeError, 'Invalid mode "unknown"'):
            model(inputs, mode='unknown')

    def test_train_step(self):
        """Test one training step with a mocked optimizer wrapper."""
        cfg = {
            **self.DEFAULT_ARGS, 'data_preprocessor':
            dict(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5])
        }
        model: ImageClassifier = MODELS.build(cfg)
        data = {
            'inputs': torch.randint(0, 256, (1, 3, 224, 224)),
            'data_samples': [DataSample().set_gt_label(1)]
        }
        optim_wrapper = MagicMock()
        log_vars = model.train_step(data, optim_wrapper)
        self.assertIn('loss', log_vars)
        optim_wrapper.update_params.assert_called_once()

    def test_val_step(self):
        """Test one validation step returns predictions."""
        cfg = {
            **self.DEFAULT_ARGS, 'data_preprocessor':
            dict(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5])
        }
        model: ImageClassifier = MODELS.build(cfg)
        data = {
            'inputs': torch.randint(0, 256, (1, 3, 224, 224)),
            'data_samples': [DataSample().set_gt_label(1)]
        }
        predictions = model.val_step(data)
        self.assertEqual(predictions[0].pred_score.shape, (1000, ))

    def test_test_step(self):
        """Test one test step returns predictions."""
        cfg = {
            **self.DEFAULT_ARGS, 'data_preprocessor':
            dict(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5])
        }
        model: ImageClassifier = MODELS.build(cfg)
        data = {
            'inputs': torch.randint(0, 256, (1, 3, 224, 224)),
            'data_samples': [DataSample().set_gt_label(1)]
        }
        predictions = model.test_step(data)
        self.assertEqual(predictions[0].pred_score.shape, (1000, ))
@unittest.skipIf(not has_huggingface(), 'huggingface is not installed.')
class TestHuggingFaceClassifier(TestCase):
    """Unit tests for ``HuggingFaceClassifier`` wrapping microsoft/resnet-18
    (1000 ImageNet classes). Skipped when transformers is unavailable."""

    # Baseline config reused (and tweaked) by every test.
    DEFAULT_ARGS = dict(
        type='HuggingFaceClassifier',
        model_name='microsoft/resnet-18',
        loss=dict(type='CrossEntropyLoss'),
    )

    def test_initialize(self):
        """Test construction and batch augments from ``train_cfg``."""
        model = MODELS.build(self.DEFAULT_ARGS)
        assert isinstance(model.model, nn.Module)
        # test set batch augmentation from train_cfg
        cfg = {
            **self.DEFAULT_ARGS, 'train_cfg':
            dict(augments=dict(type='Mixup', alpha=1.))
        }
        model: ImageClassifier = MODELS.build(cfg)
        self.assertIsNotNone(model.data_preprocessor.batch_augments)
        cfg = {**self.DEFAULT_ARGS, 'train_cfg': dict()}
        model: ImageClassifier = MODELS.build(cfg)
        self.assertIsNone(model.data_preprocessor.batch_augments)

    def test_loss(self):
        """Test that ``loss`` returns a positive scalar loss."""
        inputs = torch.rand(1, 3, 224, 224)
        data_samples = [DataSample().set_gt_label(1)]
        model: ImageClassifier = MODELS.build(self.DEFAULT_ARGS)
        losses = model.loss(inputs, data_samples)
        self.assertGreater(losses['loss'].item(), 0)

    def test_predict(self):
        """Test ``predict`` with and without input data samples."""
        inputs = torch.rand(1, 3, 224, 224)
        data_samples = [DataSample().set_gt_label(1)]
        model: ImageClassifier = MODELS.build(self.DEFAULT_ARGS)
        predictions = model.predict(inputs)
        self.assertEqual(predictions[0].pred_score.shape, (1000, ))
        # When data_samples are given, predictions are written back to them.
        predictions = model.predict(inputs, data_samples)
        self.assertEqual(predictions[0].pred_score.shape, (1000, ))
        self.assertEqual(data_samples[0].pred_score.shape, (1000, ))
        torch.testing.assert_allclose(data_samples[0].pred_score,
                                      predictions[0].pred_score)

    def test_forward(self):
        """Test the unified ``forward`` in tensor/loss/predict modes."""
        inputs = torch.rand(1, 3, 224, 224)
        data_samples = [DataSample().set_gt_label(1)]
        model: ImageClassifier = MODELS.build(self.DEFAULT_ARGS)
        # test pure forward
        outs = model(inputs)
        self.assertIsInstance(outs, torch.Tensor)
        # test forward train
        losses = model(inputs, data_samples, mode='loss')
        self.assertGreater(losses['loss'].item(), 0)
        # test forward test
        predictions = model(inputs, mode='predict')
        self.assertEqual(predictions[0].pred_score.shape, (1000, ))
        predictions = model(inputs, data_samples, mode='predict')
        self.assertEqual(predictions[0].pred_score.shape, (1000, ))
        self.assertEqual(data_samples[0].pred_score.shape, (1000, ))
        torch.testing.assert_allclose(data_samples[0].pred_score,
                                      predictions[0].pred_score)
        # test forward with invalid mode
        with self.assertRaisesRegex(RuntimeError, 'Invalid mode "unknown"'):
            model(inputs, mode='unknown')

    def test_train_step(self):
        """Test one training step with a mocked optimizer wrapper."""
        cfg = {
            **self.DEFAULT_ARGS, 'data_preprocessor':
            dict(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5])
        }
        model: ImageClassifier = MODELS.build(cfg)
        data = {
            'inputs': torch.randint(0, 256, (1, 3, 224, 224)),
            'data_samples': [DataSample().set_gt_label(1)]
        }
        optim_wrapper = MagicMock()
        log_vars = model.train_step(data, optim_wrapper)
        self.assertIn('loss', log_vars)
        optim_wrapper.update_params.assert_called_once()

    def test_val_step(self):
        """Test one validation step returns predictions."""
        cfg = {
            **self.DEFAULT_ARGS, 'data_preprocessor':
            dict(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5])
        }
        model: ImageClassifier = MODELS.build(cfg)
        data = {
            'inputs': torch.randint(0, 256, (1, 3, 224, 224)),
            'data_samples': [DataSample().set_gt_label(1)]
        }
        predictions = model.val_step(data)
        self.assertEqual(predictions[0].pred_score.shape, (1000, ))

    def test_test_step(self):
        """Test one test step returns predictions."""
        cfg = {
            **self.DEFAULT_ARGS, 'data_preprocessor':
            dict(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5])
        }
        model: ImageClassifier = MODELS.build(cfg)
        data = {
            'inputs': torch.randint(0, 256, (1, 3, 224, 224)),
            'data_samples': [DataSample().set_gt_label(1)]
        }
        predictions = model.test_step(data)
        self.assertEqual(predictions[0].pred_score.shape, (1000, ))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment