Commit 1baf0566 authored by limm's avatar limm
Browse files

add tests part

parent 495d9ed9
Pipeline #2800 canceled with stages
# Copyright (c) OpenMMLab. All rights reserved.
from unittest import TestCase
import numpy as np
import torch
from mmpretrain.evaluation.metrics import (RetrievalAveragePrecision,
RetrievalRecall)
from mmpretrain.registry import METRICS
from mmpretrain.structures import DataSample
class TestRetrievalRecall(TestCase):
    """Unit tests for the ``RetrievalRecall`` metric."""

    def test_evaluate(self):
        """Test using the metric in the same way as Evaluator."""
        # Six samples with (possibly multi-label) ground truth; Recall@1
        # counts a hit when a top-1 scored class is among the gt labels.
        pred = [
            DataSample().set_pred_score(i).set_gt_label(k).to_dict()
            for i, k in zip([
                torch.tensor([0.7, 0.0, 0.3]),
                torch.tensor([0.5, 0.2, 0.3]),
                torch.tensor([0.4, 0.5, 0.1]),
                torch.tensor([0.0, 0.0, 1.0]),
                torch.tensor([0.0, 0.0, 1.0]),
                torch.tensor([0.0, 0.0, 1.0]),
            ], [[0], [0, 1], [1], [2], [1, 2], [0, 1]])
        ]

        # Test with score (use score instead of label if score exists)
        metric = METRICS.build(dict(type='RetrievalRecall', topk=1))
        metric.process(None, pred)
        recall = metric.evaluate(6)
        self.assertIsInstance(recall, dict)
        # 5 of the 6 samples rank a correct label first.
        self.assertAlmostEqual(
            recall['retrieval/Recall@1'], 5 / 6 * 100, places=4)

        # Test with invalid topk (larger than the number of classes)
        with self.assertRaisesRegex(RuntimeError, 'selected index k'):
            metric = METRICS.build(dict(type='RetrievalRecall', topk=10))
            metric.process(None, pred)
            metric.evaluate(6)

        with self.assertRaisesRegex(ValueError, '`topk` must be a'):
            METRICS.build(dict(type='RetrievalRecall', topk=-1))

        # Test initialization: an int topk is normalized to a tuple
        metric = METRICS.build(dict(type='RetrievalRecall', topk=5))
        self.assertEqual(metric.topk, (5, ))

        # Test initialization with a tuple topk
        metric = METRICS.build(dict(type='RetrievalRecall', topk=(1, 2, 5)))
        self.assertEqual(metric.topk, (1, 2, 5))

    def test_calculate(self):
        """Test using the metric from static method."""
        # seq-of-indices format: each element lists the relevant indices
        y_true = [[0, 2, 5, 8, 9], [1, 4, 6]]
        y_pred = [np.arange(10)] * 2

        # test with indices inputs for both pred and target
        recall_score = RetrievalRecall.calculate(
            y_pred, y_true, topk=1, pred_indices=True, target_indices=True)
        expect_recall = 50.
        self.assertEqual(recall_score[0].item(), expect_recall)

        # test with tensor input
        y_true = torch.Tensor([[1, 0, 1, 0, 0, 1, 0, 0, 1, 1],
                               [0, 1, 0, 0, 1, 0, 1, 0, 0, 0]])
        y_pred = np.array([np.linspace(0.95, 0.05, 10)] * 2)
        recall_score = RetrievalRecall.calculate(y_pred, y_true, topk=1)
        expect_recall = 50.
        self.assertEqual(recall_score[0].item(), expect_recall)

        # test with topk=2
        y_pred = np.array([np.linspace(0.95, 0.05, 10)] * 2)
        recall_score = RetrievalRecall.calculate(y_pred, y_true, topk=2)
        expect_recall = 100.
        self.assertEqual(recall_score[0].item(), expect_recall)

        # test with topk=(1, 5): one score per requested k
        y_pred = np.array([np.linspace(0.95, 0.05, 10)] * 2)
        recall_score = RetrievalRecall.calculate(y_pred, y_true, topk=(1, 5))
        expect_recalls = [50., 100.]
        self.assertEqual(len(recall_score), len(expect_recalls))
        for i in range(len(expect_recalls)):
            self.assertEqual(recall_score[i].item(), expect_recalls[i])

        # Test with invalid pred (not a sequence)
        y_pred = dict()
        y_true = [[0, 2, 5, 8, 9], [1, 4, 6]]
        with self.assertRaisesRegex(AssertionError, '`pred` must be Seq'):
            RetrievalRecall.calculate(y_pred, y_true, True, True)

        # Test with invalid target (not a sequence)
        y_true = dict()
        y_pred = [np.arange(10)] * 2
        with self.assertRaisesRegex(AssertionError, '`target` must be Seq'):
            RetrievalRecall.calculate(
                y_pred, y_true, topk=1, pred_indices=True, target_indices=True)

        # Test with different length `pred` with `target`
        y_true = [[0, 2, 5, 8, 9], [1, 4, 6]]
        y_pred = [np.arange(10)] * 3
        with self.assertRaisesRegex(AssertionError, 'Length of `pred`'):
            RetrievalRecall.calculate(
                y_pred, y_true, topk=1, pred_indices=True, target_indices=True)

        # Test with an invalid element inside target
        y_true = [[0, 2, 5, 8, 9], dict()]
        y_pred = [np.arange(10)] * 2
        with self.assertRaisesRegex(AssertionError, '`target` should be'):
            RetrievalRecall.calculate(
                y_pred, y_true, topk=1, pred_indices=True, target_indices=True)

        # Test with an invalid element inside pred
        y_true = [[0, 2, 5, 8, 9], [1, 4, 6]]
        y_pred = [np.arange(10), dict()]
        with self.assertRaisesRegex(AssertionError, '`pred` should be'):
            RetrievalRecall.calculate(
                y_pred, y_true, topk=1, pred_indices=True, target_indices=True)
class TestRetrievalAveragePrecision(TestCase):
    """Unit tests for the ``RetrievalAveragePrecision`` metric."""

    def test_evaluate(self):
        """Test using the metric in the same way as Evaluator."""
        y_true = torch.tensor([[1, 0, 1, 0, 0, 1, 0, 0, 1, 1],
                               [0, 1, 0, 0, 1, 0, 1, 0, 0, 0]])
        y_pred = torch.tensor([np.linspace(0.95, 0.05, 10)] * 2)
        pred = [
            DataSample().set_pred_score(i).set_gt_score(j)
            for i, j in zip(y_pred, y_true)
        ]

        # Test with default macro average
        metric = METRICS.build(dict(type='RetrievalAveragePrecision', topk=10))
        metric.process([], pred)
        res = metric.evaluate(len(pred))
        self.assertIsInstance(res, dict)
        self.assertAlmostEqual(
            res['retrieval/mAP@10'], 53.25396825396825, places=4)

        # Test with invalid topk
        with self.assertRaisesRegex(ValueError, '`topk` must be a'):
            METRICS.build(dict(type='RetrievalAveragePrecision', topk=-1))

        # Test with invalid mode
        with self.assertRaisesRegex(AssertionError, 'Invalid `mode` '):
            METRICS.build(
                dict(type='RetrievalAveragePrecision', topk=5, mode='m'))

    def test_calculate(self):
        """Test using the metric from static method."""
        # Test IR mode
        # example from https://zhuanlan.zhihu.com/p/35983818
        # or https://www.youtube.com/watch?v=pM6DJ0ZZee0
        # seq-of-indices format: each element lists the relevant indices
        y_true = [[0, 2, 5, 8, 9], [1, 4, 6]]
        y_pred = [np.arange(10)] * 2

        # test with indices inputs for both pred and target
        ap_score = RetrievalAveragePrecision.calculate(y_pred, y_true, 10,
                                                       True, True)
        expect_ap = 53.25396825396825
        self.assertEqual(ap_score.item(), expect_ap)

        # test with tensor input
        y_true = torch.Tensor([[1, 0, 1, 0, 0, 1, 0, 0, 1, 1],
                               [0, 1, 0, 0, 1, 0, 1, 0, 0, 0]])
        y_pred = np.array([np.linspace(0.95, 0.05, 10)] * 2)
        ap_score = RetrievalAveragePrecision.calculate(y_pred, y_true, 10)
        expect_ap = 53.25396825396825
        self.assertEqual(ap_score.item(), expect_ap)

        # test with topk=5
        y_pred = np.array([np.linspace(0.95, 0.05, 10)] * 2)
        ap_score = RetrievalAveragePrecision.calculate(y_pred, y_true, topk=5)
        expect_ap = 31.666666666666664
        self.assertEqual(ap_score.item(), expect_ap)

        # Test with invalid mode
        with self.assertRaisesRegex(AssertionError, 'Invalid `mode` '):
            RetrievalAveragePrecision.calculate(
                y_pred, y_true, True, True, mode='m')

        # Test with invalid pred (not a sequence)
        y_pred = dict()
        y_true = [[0, 2, 5, 8, 9], [1, 4, 6]]
        with self.assertRaisesRegex(AssertionError, '`pred` must be Seq'):
            RetrievalAveragePrecision.calculate(y_pred, y_true, 10, True, True)

        # Test with invalid target (not a sequence)
        y_true = dict()
        y_pred = [np.arange(10)] * 2
        with self.assertRaisesRegex(AssertionError, '`target` must be Seq'):
            RetrievalAveragePrecision.calculate(y_pred, y_true, 10, True, True)

        # Test with different length `pred` with `target`
        y_true = [[0, 2, 5, 8, 9], [1, 4, 6]]
        y_pred = [np.arange(10)] * 3
        with self.assertRaisesRegex(AssertionError, 'Length of `pred`'):
            RetrievalAveragePrecision.calculate(y_pred, y_true, 10, True, True)

        # Test with an invalid element inside target
        y_true = [[0, 2, 5, 8, 9], dict()]
        y_pred = [np.arange(10)] * 2
        with self.assertRaisesRegex(AssertionError, '`target` should be'):
            RetrievalAveragePrecision.calculate(y_pred, y_true, 10, True, True)

        # Test with an invalid element inside pred
        y_true = [[0, 2, 5, 8, 9], [1, 4, 6]]
        y_pred = [np.arange(10), dict()]
        with self.assertRaisesRegex(AssertionError, '`pred` should be'):
            RetrievalAveragePrecision.calculate(y_pred, y_true, 10, True, True)

        # Test with mode 'integrate'
        y_true = torch.Tensor([[1, 0, 1, 0, 0, 1, 0, 0, 1, 1],
                               [0, 1, 0, 0, 1, 0, 1, 0, 0, 0]])
        y_pred = np.array([np.linspace(0.95, 0.05, 10)] * 2)
        ap_score = RetrievalAveragePrecision.calculate(
            y_pred, y_true, topk=5, mode='integrate')
        expect_ap = 25.416666666666664
        self.assertEqual(ap_score.item(), expect_ap)
# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.evaluator import Evaluator
from mmpretrain.structures import DataSample
class TestScienceQAMetric:
    """Unit tests for ``ScienceQAMetric`` (pytest-style, no TestCase base)."""

    def test_evaluate(self):
        # Wrong prediction: pred_answer 'A' is choice index 0, gt_answer is 1,
        # so every duplicated sample is incorrect and all accuracies are 0.
        meta_info = {
            'choices': ['A', 'B', 'C', 'D'],
            'pred_answer': 'A',
            'grade': 'grade1',
            'subject': 'language science',
            'gt_answer': 1,
            'hint': 'hint',
            'has_image': True
        }
        data_sample = DataSample(metainfo=meta_info)
        data_samples = [data_sample for _ in range(10)]
        evaluator = Evaluator(dict(type='mmpretrain.ScienceQAMetric'))
        evaluator.process(data_samples)
        # NOTE(review): size=4 is passed although 10 samples were processed —
        # confirm this mismatch is intended.
        res = evaluator.evaluate(4)
        assert res['acc_grade_1_6'] == 0.0
        assert res['acc_language'] == 0.0
        assert res['all_acc'] == 0.0

        # Correct prediction: gt_answer 0 matches pred_answer 'A' (index 0),
        # so all accuracies are 1.
        meta_info = {
            'choices': ['A', 'B', 'C', 'D'],
            'pred_answer': 'A',
            'grade': 'grade1',
            'subject': 'language science',
            'gt_answer': 0,
            'hint': 'hint',
            'has_image': True
        }
        data_sample = DataSample(metainfo=meta_info)
        data_samples = [data_sample for _ in range(10)]
        evaluator = Evaluator(dict(type='mmpretrain.ScienceQAMetric'))
        evaluator.process(data_samples)
        res = evaluator.evaluate(4)
        assert res['acc_grade_1_6'] == 1.0
        assert res['acc_language'] == 1.0
        assert res['all_acc'] == 1.0
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmpretrain.evaluation import ShapeBiasMetric
def test_shape_bias_metric():
    """Smoke test: ``ShapeBiasMetric.process`` accepts a single prediction."""
    # One fake prediction over 1000 classes with matching pred/gt labels.
    sample = {
        'pred_score': torch.rand(1000, ),
        'pred_label': torch.tensor(1),
        'gt_label': torch.tensor(1),
        'img_path': 'tests/airplane/test.JPEG',
    }
    evaluator = ShapeBiasMetric(
        csv_dir='tests/data', dataset_name='test', model_name='test')
    evaluator.process(None, [sample])
# Copyright (c) OpenMMLab. All rights reserved.
import copy
from unittest import TestCase
import numpy as np
import torch
from mmpretrain.evaluation.metrics import (Accuracy, ConfusionMatrix,
SingleLabelMetric)
from mmpretrain.registry import METRICS
from mmpretrain.structures import DataSample
class TestAccuracy(TestCase):
    """Unit tests for the ``Accuracy`` metric."""

    def test_evaluate(self):
        """Test using the metric in the same way as Evaluator."""
        # Six samples carrying scores, predicted labels and gt labels.
        pred = [
            DataSample().set_pred_score(i).set_pred_label(j).set_gt_label(
                k).to_dict() for i, j, k in zip([
                    torch.tensor([0.7, 0.0, 0.3]),
                    torch.tensor([0.5, 0.2, 0.3]),
                    torch.tensor([0.4, 0.5, 0.1]),
                    torch.tensor([0.0, 0.0, 1.0]),
                    torch.tensor([0.0, 0.0, 1.0]),
                    torch.tensor([0.0, 0.0, 1.0]),
                ], [0, 0, 1, 2, 2, 2], [0, 0, 1, 2, 1, 0])
        ]

        # Test with score (use score instead of label if score exists)
        metric = METRICS.build(dict(type='Accuracy', thrs=0.6))
        metric.process(None, pred)
        acc = metric.evaluate(6)
        self.assertIsInstance(acc, dict)
        self.assertAlmostEqual(acc['accuracy/top1'], 2 / 6 * 100, places=4)

        # Test with multiple thrs: one result key per threshold
        metric = METRICS.build(dict(type='Accuracy', thrs=(0., 0.6, None)))
        metric.process(None, pred)
        acc = metric.evaluate(6)
        self.assertSetEqual(
            set(acc.keys()), {
                'accuracy/top1_thr-0.00', 'accuracy/top1_thr-0.60',
                'accuracy/top1_no-thr'
            })

        # Test with invalid topk (top-5 impossible with only 3 classes)
        with self.assertRaisesRegex(ValueError, 'check the `val_evaluator`'):
            metric = METRICS.build(dict(type='Accuracy', topk=(1, 5)))
            metric.process(None, pred)
            metric.evaluate(6)

        # Test with label
        for sample in pred:
            del sample['pred_score']
        metric = METRICS.build(dict(type='Accuracy', thrs=(0., 0.6, None)))
        metric.process(None, pred)
        acc = metric.evaluate(6)
        self.assertIsInstance(acc, dict)
        self.assertAlmostEqual(acc['accuracy/top1'], 4 / 6 * 100, places=4)

        # Test initialization: scalars and lists are normalized to tuples
        metric = METRICS.build(dict(type='Accuracy', thrs=0.6))
        self.assertTupleEqual(metric.thrs, (0.6, ))
        metric = METRICS.build(dict(type='Accuracy', thrs=[0.6]))
        self.assertTupleEqual(metric.thrs, (0.6, ))
        metric = METRICS.build(dict(type='Accuracy', topk=5))
        self.assertTupleEqual(metric.topk, (5, ))
        metric = METRICS.build(dict(type='Accuracy', topk=[5]))
        self.assertTupleEqual(metric.topk, (5, ))

    def test_calculate(self):
        """Test using the metric from static method."""
        y_true = np.array([0, 0, 1, 2, 1, 0])
        y_label = torch.tensor([0, 0, 1, 2, 2, 2])
        y_score = [
            [0.7, 0.0, 0.3],
            [0.5, 0.2, 0.3],
            [0.4, 0.5, 0.1],
            [0.0, 0.0, 1.0],
            [0.0, 0.0, 1.0],
            [0.0, 0.0, 1.0],
        ]

        # Test with score
        acc = Accuracy.calculate(y_score, y_true, thrs=(0.6, ))
        self.assertIsInstance(acc, list)
        self.assertIsInstance(acc[0], list)
        self.assertIsInstance(acc[0][0], torch.Tensor)
        self.assertTensorEqual(acc[0][0], 2 / 6 * 100)

        # Test with label
        acc = Accuracy.calculate(y_label, y_true, thrs=(0.6, ))
        self.assertIsInstance(acc, torch.Tensor)
        # the thrs will be ignored
        self.assertTensorEqual(acc, 4 / 6 * 100)

        # Test with invalid inputs
        with self.assertRaisesRegex(TypeError, "<class 'str'> is not"):
            Accuracy.calculate(y_label, 'hi')

        # Test with invalid topk
        with self.assertRaisesRegex(ValueError, 'Top-5 accuracy .* is 3'):
            Accuracy.calculate(y_score, y_true, topk=(1, 5))

    def assertTensorEqual(self,
                          tensor: torch.Tensor,
                          value: float,
                          msg=None,
                          **kwarg):
        """Assert ``tensor`` is element-wise close to the scalar ``value``."""
        tensor = tensor.to(torch.float32)
        value = torch.FloatTensor([value])
        try:
            # NOTE(review): torch.testing.assert_allclose is deprecated in
            # recent torch in favor of assert_close — confirm pinned version.
            torch.testing.assert_allclose(tensor, value, **kwarg)
        except AssertionError as e:
            self.fail(self._formatMessage(msg, str(e)))
class TestSingleLabel(TestCase):
    """Unit tests for the ``SingleLabelMetric`` metric."""

    def test_evaluate(self):
        """Test using the metric in the same way as Evaluator."""
        pred = [
            DataSample().set_pred_score(i).set_pred_label(j).set_gt_label(
                k).to_dict() for i, j, k in zip([
                    torch.tensor([0.7, 0.0, 0.3]),
                    torch.tensor([0.5, 0.2, 0.3]),
                    torch.tensor([0.4, 0.5, 0.1]),
                    torch.tensor([0.0, 0.0, 1.0]),
                    torch.tensor([0.0, 0.0, 1.0]),
                    torch.tensor([0.0, 0.0, 1.0]),
                ], [0, 0, 1, 2, 2, 2], [0, 0, 1, 2, 1, 0])
        ]

        # Test with score (use score instead of label if score exists)
        metric = METRICS.build(
            dict(
                type='SingleLabelMetric',
                thrs=0.6,
                items=('precision', 'recall', 'f1-score', 'support')))
        metric.process(None, pred)
        res = metric.evaluate(6)
        self.assertIsInstance(res, dict)
        # Expected values are the class-wise averages (macro).
        self.assertAlmostEqual(
            res['single-label/precision'], (1 + 0 + 1 / 3) / 3 * 100, places=4)
        self.assertAlmostEqual(
            res['single-label/recall'], (1 / 3 + 0 + 1) / 3 * 100, places=4)
        self.assertAlmostEqual(
            res['single-label/f1-score'], (1 / 2 + 0 + 1 / 2) / 3 * 100,
            places=4)
        self.assertEqual(res['single-label/support'], 6)

        # Test with multiple thrs: one key per item per threshold
        metric = METRICS.build(
            dict(type='SingleLabelMetric', thrs=(0., 0.6, None)))
        metric.process(None, pred)
        res = metric.evaluate(6)
        self.assertSetEqual(
            set(res.keys()), {
                'single-label/precision_thr-0.00',
                'single-label/recall_thr-0.00',
                'single-label/f1-score_thr-0.00',
                'single-label/precision_thr-0.60',
                'single-label/recall_thr-0.60',
                'single-label/f1-score_thr-0.60',
                'single-label/precision_no-thr', 'single-label/recall_no-thr',
                'single-label/f1-score_no-thr'
            })

        # Test with average mode "micro"
        metric = METRICS.build(
            dict(
                type='SingleLabelMetric',
                average='micro',
                items=('precision', 'recall', 'f1-score', 'support')))
        metric.process(None, pred)
        res = metric.evaluate(6)
        self.assertIsInstance(res, dict)
        self.assertAlmostEqual(
            res['single-label/precision_micro'], 66.666, places=2)
        self.assertAlmostEqual(
            res['single-label/recall_micro'], 66.666, places=2)
        self.assertAlmostEqual(
            res['single-label/f1-score_micro'], 66.666, places=2)
        self.assertEqual(res['single-label/support_micro'], 6)

        # Test with average mode None: per-class results
        metric = METRICS.build(
            dict(
                type='SingleLabelMetric',
                average=None,
                items=('precision', 'recall', 'f1-score', 'support')))
        metric.process(None, pred)
        res = metric.evaluate(6)
        self.assertIsInstance(res, dict)
        precision = res['single-label/precision_classwise']
        self.assertAlmostEqual(precision[0], 100., places=4)
        self.assertAlmostEqual(precision[1], 100., places=4)
        self.assertAlmostEqual(precision[2], 1 / 3 * 100, places=4)
        recall = res['single-label/recall_classwise']
        self.assertAlmostEqual(recall[0], 2 / 3 * 100, places=4)
        self.assertAlmostEqual(recall[1], 50., places=4)
        self.assertAlmostEqual(recall[2], 100., places=4)
        f1_score = res['single-label/f1-score_classwise']
        self.assertAlmostEqual(f1_score[0], 80., places=4)
        self.assertAlmostEqual(f1_score[1], 2 / 3 * 100, places=4)
        self.assertAlmostEqual(f1_score[2], 50., places=4)
        self.assertEqual(res['single-label/support_classwise'], [3, 2, 1])

        # Test with label, the thrs will be ignored
        pred_no_score = copy.deepcopy(pred)
        for sample in pred_no_score:
            del sample['pred_score']
            del sample['num_classes']
        metric = METRICS.build(
            dict(type='SingleLabelMetric', thrs=(0., 0.6), num_classes=3))
        metric.process(None, pred_no_score)
        res = metric.evaluate(6)
        self.assertIsInstance(res, dict)
        # Expected values come from sklearn
        self.assertAlmostEqual(res['single-label/precision'], 77.777, places=2)
        self.assertAlmostEqual(res['single-label/recall'], 72.222, places=2)
        self.assertAlmostEqual(res['single-label/f1-score'], 65.555, places=2)

        # Without scores, `num_classes` must be specified
        metric = METRICS.build(dict(type='SingleLabelMetric', thrs=(0., 0.6)))
        with self.assertRaisesRegex(AssertionError, 'must be specified'):
            metric.process(None, pred_no_score)

        # Test with empty items: nothing is reported
        metric = METRICS.build(
            dict(type='SingleLabelMetric', items=tuple(), num_classes=3))
        metric.process(None, pred)
        res = metric.evaluate(6)
        self.assertIsInstance(res, dict)
        self.assertEqual(len(res), 0)
        metric.process(None, pred_no_score)
        res = metric.evaluate(6)
        self.assertIsInstance(res, dict)
        self.assertEqual(len(res), 0)

        # Test initialization: scalars and lists are normalized to tuples
        metric = METRICS.build(dict(type='SingleLabelMetric', thrs=0.6))
        self.assertTupleEqual(metric.thrs, (0.6, ))
        metric = METRICS.build(dict(type='SingleLabelMetric', thrs=[0.6]))
        self.assertTupleEqual(metric.thrs, (0.6, ))

    def test_calculate(self):
        """Test using the metric from static method."""
        y_true = np.array([0, 0, 1, 2, 1, 0])
        y_label = torch.tensor([0, 0, 1, 2, 2, 2])
        y_score = [
            [0.7, 0.0, 0.3],
            [0.5, 0.2, 0.3],
            [0.4, 0.5, 0.1],
            [0.0, 0.0, 1.0],
            [0.0, 0.0, 1.0],
            [0.0, 0.0, 1.0],
        ]

        # Test with score: one (p, r, f1, support) tuple per threshold
        res = SingleLabelMetric.calculate(y_score, y_true, thrs=(0.6, ))
        self.assertIsInstance(res, list)
        self.assertIsInstance(res[0], tuple)
        precision, recall, f1_score, support = res[0]
        self.assertTensorEqual(precision, (1 + 0 + 1 / 3) / 3 * 100)
        self.assertTensorEqual(recall, (1 / 3 + 0 + 1) / 3 * 100)
        self.assertTensorEqual(f1_score, (1 / 2 + 0 + 1 / 2) / 3 * 100)
        self.assertTensorEqual(support, 6)

        # Test with label: a single (p, r, f1, support) tuple
        res = SingleLabelMetric.calculate(y_label, y_true, num_classes=3)
        self.assertIsInstance(res, tuple)
        precision, recall, f1_score, support = res
        # Expected values come from sklearn
        self.assertTensorEqual(precision, 77.7777)
        self.assertTensorEqual(recall, 72.2222)
        self.assertTensorEqual(f1_score, 65.5555)
        self.assertTensorEqual(support, 6)

        # Test with invalid inputs
        with self.assertRaisesRegex(TypeError, "<class 'str'> is not"):
            SingleLabelMetric.calculate(y_label, 'hi')

    def assertTensorEqual(self,
                          tensor: torch.Tensor,
                          value: float,
                          msg=None,
                          **kwarg):
        """Assert ``tensor`` is element-wise close to the scalar ``value``."""
        tensor = tensor.to(torch.float32)
        value = torch.tensor(value).float()
        try:
            # NOTE(review): torch.testing.assert_allclose is deprecated in
            # recent torch in favor of assert_close — confirm pinned version.
            torch.testing.assert_allclose(tensor, value, **kwarg)
        except AssertionError as e:
            self.fail(self._formatMessage(msg, str(e)))
class TestConfusionMatrix(TestCase):
    """Unit tests for the ``ConfusionMatrix`` metric."""

    def test_evaluate(self):
        """Test using the metric in the same way as Evaluator."""
        pred = [
            DataSample().set_pred_score(i).set_pred_label(j).set_gt_label(
                k).to_dict() for i, j, k in zip([
                    torch.tensor([0.7, 0.0, 0.3]),
                    torch.tensor([0.5, 0.2, 0.3]),
                    torch.tensor([0.4, 0.5, 0.1]),
                    torch.tensor([0.0, 0.0, 1.0]),
                    torch.tensor([0.0, 0.0, 1.0]),
                    torch.tensor([0.0, 0.0, 1.0]),
                ], [0, 0, 1, 2, 2, 2], [0, 0, 1, 2, 1, 0])
        ]

        # Test with score (use score instead of label if score exists)
        metric = METRICS.build(dict(type='ConfusionMatrix'))
        metric.process(None, pred)
        res = metric.evaluate(6)
        self.assertIsInstance(res, dict)
        self.assertTensorEqual(
            res['confusion_matrix/result'],
            torch.tensor([
                [2, 0, 1],
                [0, 1, 1],
                [0, 0, 1],
            ]))

        # Test with label
        for sample in pred:
            del sample['pred_score']
        metric = METRICS.build(dict(type='ConfusionMatrix'))
        metric.process(None, pred)
        # Without scores, `num_classes` is required.
        with self.assertRaisesRegex(AssertionError,
                                    'Please specify the `num_classes`'):
            metric.evaluate(6)
        metric = METRICS.build(dict(type='ConfusionMatrix', num_classes=3))
        metric.process(None, pred)
        # BUGFIX: previously asserted on the stale `res` from the score-based
        # run above without re-evaluating the label-based metric.
        res = metric.evaluate(6)
        self.assertIsInstance(res, dict)
        self.assertTensorEqual(
            res['confusion_matrix/result'],
            torch.tensor([
                [2, 0, 1],
                [0, 1, 1],
                [0, 0, 1],
            ]))

    def test_calculate(self):
        """Test the ``ConfusionMatrix.calculate`` static method."""
        y_true = np.array([0, 0, 1, 2, 1, 0])
        y_label = torch.tensor([0, 0, 1, 2, 2, 2])
        y_score = [
            [0.7, 0.0, 0.3],
            [0.5, 0.2, 0.3],
            [0.4, 0.5, 0.1],
            [0.0, 0.0, 1.0],
            [0.0, 0.0, 1.0],
            [0.0, 0.0, 1.0],
        ]

        # Test with score
        cm = ConfusionMatrix.calculate(y_score, y_true)
        self.assertIsInstance(cm, torch.Tensor)
        self.assertTensorEqual(
            cm, torch.tensor([
                [2, 0, 1],
                [0, 1, 1],
                [0, 0, 1],
            ]))

        # Test with label: `num_classes` is required
        with self.assertRaisesRegex(AssertionError,
                                    'Please specify the `num_classes`'):
            ConfusionMatrix.calculate(y_label, y_true)
        cm = ConfusionMatrix.calculate(y_label, y_true, num_classes=3)
        self.assertIsInstance(cm, torch.Tensor)
        self.assertTensorEqual(
            cm, torch.tensor([
                [2, 0, 1],
                [0, 1, 1],
                [0, 0, 1],
            ]))

        # Test with invalid inputs
        with self.assertRaisesRegex(TypeError, "<class 'str'> is not"):
            ConfusionMatrix.calculate(y_label, 'hi')

    def test_plot(self):
        """Confusion-matrix plotting returns a matplotlib Figure."""
        import matplotlib.pyplot as plt

        cm = torch.tensor([[2, 0, 1], [0, 1, 1], [0, 0, 1]])
        fig = ConfusionMatrix.plot(cm, include_values=True, show=False)
        self.assertIsInstance(fig, plt.Figure)

    def assertTensorEqual(self,
                          tensor: torch.Tensor,
                          value: float,
                          msg=None,
                          **kwarg):
        """Assert ``tensor`` is element-wise close to ``value``."""
        tensor = tensor.to(torch.float32)
        value = torch.tensor(value).float()
        try:
            # NOTE(review): torch.testing.assert_allclose is deprecated in
            # recent torch in favor of assert_close — confirm pinned version.
            torch.testing.assert_allclose(tensor, value, **kwarg)
        except AssertionError as e:
            self.fail(self._formatMessage(msg, str(e)))
# Copyright (c) OpenMMLab. All rights reserved.
from unittest import TestCase
import numpy as np
import sklearn.metrics
import torch
from mmengine.evaluator import Evaluator
from mmengine.registry import init_default_scope
from mmpretrain.structures import DataSample
# Register mmpretrain as the default registry scope so bare `type` strings
# (e.g. 'VOCMultiLabelMetric') resolve inside mmpretrain's registries.
init_default_scope('mmpretrain')
class TestVOCMultiLabel(TestCase):
    """Unit tests for the ``VOCMultiLabelMetric`` metric."""

    def test_evaluate(self):
        # prepare input data
        y_true_label = [[0], [1, 3], [0, 1, 2], [3]]
        # Per-sample lists of classes annotated as "difficult".
        y_true_difficult = [[0], [2], [1], []]
        y_pred_score = torch.tensor([
            [0.8, 0, 0, 0.6],
            [0.2, 0, 0.6, 0],
            [0, 0.9, 0.6, 0],
            [0, 0, 0.2, 0.3],
        ])

        # generate data samples
        pred = [
            DataSample(num_classes=4).set_pred_score(i).set_gt_label(j)
            for i, j in zip(y_pred_score, y_true_label)
        ]
        for sample, difficult_label in zip(pred, y_true_difficult):
            sample.set_metainfo({'gt_label_difficult': difficult_label})

        # 1. Test with default argument
        evaluator = Evaluator(dict(type='VOCMultiLabelMetric'))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)

        # generate sklearn input; -1 marks a difficult (ignored) entry
        y_true = np.array([
            [1, 0, 0, 0],
            [0, 1, -1, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        ignored_index = y_true == -1
        y_true[ignored_index] = 0
        # hard predictions from scores (entries >= 0.5 become 1)
        thr05_y_pred = np.array([
            [1, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 0],
        ])
        thr05_y_pred[ignored_index] = 0
        expect_precision = sklearn.metrics.precision_score(
            y_true, thr05_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true, thr05_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true, thr05_y_pred, average='macro') * 100
        self.assertEqual(res['multi-label/precision'], expect_precision)
        self.assertEqual(res['multi-label/recall'], expect_recall)
        # precision is different between torch and sklearn
        self.assertAlmostEqual(res['multi-label/f1-score'], expect_f1, 5)

        # 2. Test with `difficult_as_positive`=False argument
        evaluator = Evaluator(
            dict(type='VOCMultiLabelMetric', difficult_as_positive=False))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)

        # generate sklearn input; difficult entries counted as negatives
        y_true = np.array([
            [1, 0, 0, 0],
            [0, 1, 0, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        thr05_y_pred = np.array([
            [1, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 0],
        ])
        expect_precision = sklearn.metrics.precision_score(
            y_true, thr05_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true, thr05_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true, thr05_y_pred, average='macro') * 100
        self.assertEqual(res['multi-label/precision'], expect_precision)
        self.assertEqual(res['multi-label/recall'], expect_recall)
        # precision is different between torch and sklearn
        self.assertAlmostEqual(res['multi-label/f1-score'], expect_f1, 5)

        # 3. Test with `difficult_as_positive`=True argument
        evaluator = Evaluator(
            dict(type='VOCMultiLabelMetric', difficult_as_positive=True))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)

        # generate sklearn input; difficult entries counted as positives
        y_true = np.array([
            [1, 0, 0, 0],
            [0, 1, 1, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        thr05_y_pred = np.array([
            [1, 0, 0, 1],
            [0, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 0, 0],
        ])
        expect_precision = sklearn.metrics.precision_score(
            y_true, thr05_y_pred, average='macro') * 100
        expect_recall = sklearn.metrics.recall_score(
            y_true, thr05_y_pred, average='macro') * 100
        expect_f1 = sklearn.metrics.f1_score(
            y_true, thr05_y_pred, average='macro') * 100
        self.assertEqual(res['multi-label/precision'], expect_precision)
        self.assertEqual(res['multi-label/recall'], expect_recall)
        # precision is different between torch and sklearn
        self.assertAlmostEqual(res['multi-label/f1-score'], expect_f1, 5)
class TestVOCAveragePrecision(TestCase):
    """Unit tests for the ``VOCAveragePrecision`` metric."""

    def test_evaluate(self):
        """Test using the metric in the same way as Evaluator."""
        # prepare input data
        y_pred_score = torch.tensor([
            [0.8, 0.1, 0, 0.6],
            [0.2, 0.2, 0.7, 0],
            [0.1, 0.9, 0.6, 0.1],
            [0, 0, 0.2, 0.3],
        ])
        y_true_label = [[0], [1, 3], [0, 1, 2], [3]]
        y_true = torch.tensor([
            [1, 0, 0, 0],
            [0, 1, 0, 1],
            [1, 1, 1, 0],
            [0, 0, 0, 1],
        ])
        # Per-sample lists of classes annotated as "difficult".
        # (Fix: this was previously assigned twice with the same value.)
        y_true_difficult = [[0], [2], [1], []]

        # generate data samples
        pred = [
            DataSample(num_classes=4).set_pred_score(i).set_gt_score(
                j).set_gt_label(k)
            for i, j, k in zip(y_pred_score, y_true, y_true_label)
        ]
        for sample, difficult_label in zip(pred, y_true_difficult):
            sample.set_metainfo({'gt_label_difficult': difficult_label})

        # 1. Test with default
        evaluator = Evaluator(dict(type='VOCAveragePrecision'))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)

        # prepare inputs for sklearn for this case: scores and targets are
        # arranged per class, with difficult entries dropped.
        y_pred_score = [[0.8, 0.2, 0.1, 0], [0.1, 0.2, 0.9, 0], [0, 0.6, 0.2],
                        [0.6, 0, 0.1, 0.3]]
        y_true = [[1, 0, 1, 0], [0, 1, 1, 0], [0, 1, 0], [0, 1, 0, 1]]
        expected_res = []
        for pred_per_class, gt_per_class in zip(y_pred_score, y_true):
            expected_res.append(
                sklearn.metrics.average_precision_score(
                    gt_per_class, pred_per_class))
        self.assertAlmostEqual(
            res['multi-label/mAP'],
            sum(expected_res) * 100 / len(expected_res),
            places=4)

        # 2. Test with `difficult_as_positive`=False argument
        evaluator = Evaluator(
            dict(type='VOCAveragePrecision', difficult_as_positive=False))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)

        # prepare inputs for sklearn; difficult entries count as negatives
        y_pred_score = [[0.8, 0.2, 0.1, 0], [0.1, 0.2, 0.9, 0],
                        [0, 0.7, 0.6, 0.2], [0.6, 0, 0.1, 0.3]]
        y_true = [[1, 0, 1, 0], [0, 1, 1, 0], [0, 0, 1, 0], [0, 1, 0, 1]]
        expected_res = []
        for pred_per_class, gt_per_class in zip(y_pred_score, y_true):
            expected_res.append(
                sklearn.metrics.average_precision_score(
                    gt_per_class, pred_per_class))
        self.assertAlmostEqual(
            res['multi-label/mAP'],
            sum(expected_res) * 100 / len(expected_res),
            places=4)

        # 3. Test with `difficult_as_positive`=True argument
        evaluator = Evaluator(
            dict(type='VOCAveragePrecision', difficult_as_positive=True))
        evaluator.process(pred)
        res = evaluator.evaluate(4)
        self.assertIsInstance(res, dict)

        # prepare inputs for sklearn; difficult entries count as positives
        y_pred_score = [[0.8, 0.2, 0.1, 0], [0.1, 0.2, 0.9, 0],
                        [0, 0.7, 0.6, 0.2], [0.6, 0, 0.1, 0.3]]
        y_true = [[1, 0, 1, 0], [0, 1, 1, 0], [0, 1, 1, 0], [0, 1, 0, 1]]
        expected_res = []
        for pred_per_class, gt_per_class in zip(y_pred_score, y_true):
            expected_res.append(
                sklearn.metrics.average_precision_score(
                    gt_per_class, pred_per_class))
        self.assertAlmostEqual(
            res['multi-label/mAP'],
            sum(expected_res) * 100 / len(expected_res),
            places=4)
# Copyright (c) OpenMMLab. All rights reserved.
# Copyright (c) OpenMMLab. All rights reserved.
from copy import deepcopy
from unittest import TestCase
import torch
from mmpretrain.models.backbones import BEiTViT
class TestBEiT(TestCase):
    """Unit tests for the ``BEiTViT`` backbone."""

    def setUp(self):
        # Base config shared by all tests; always deep-copied before mutation.
        self.cfg = dict(
            arch='b', img_size=224, patch_size=16, drop_path_rate=0.1)

    def test_structure(self):
        # Test invalid default arch
        with self.assertRaisesRegex(AssertionError, 'not in default archs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = 'unknown'
            BEiTViT(**cfg)

        # Test invalid custom arch (dict missing the 'embed_dims' key)
        with self.assertRaisesRegex(AssertionError, 'Custom arch needs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = {
                'num_layers': 24,
                'num_heads': 16,
                'feedforward_channels': 4096
            }
            BEiTViT(**cfg)

        # Test custom arch
        cfg = deepcopy(self.cfg)
        cfg['arch'] = {
            'embed_dims': 128,
            'num_layers': 24,
            'num_heads': 16,
            'feedforward_channels': 1024
        }
        model = BEiTViT(**cfg)
        self.assertEqual(model.embed_dims, 128)
        self.assertEqual(model.num_layers, 24)
        # No absolute position embedding / shared rel_pos_bias by default.
        self.assertIsNone(model.pos_embed)
        self.assertIsNone(model.rel_pos_bias)
        for layer in model.layers:
            self.assertEqual(layer.attn.num_heads, 16)
            self.assertEqual(layer.ffn.feedforward_channels, 1024)

        # Test out_indices: must be int or sequence, within num_layers
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = {1: 1}
        with self.assertRaisesRegex(AssertionError, "get <class 'dict'>"):
            BEiTViT(**cfg)
        cfg['out_indices'] = [0, 13]
        with self.assertRaisesRegex(AssertionError, 'Invalid out_indices 13'):
            BEiTViT(**cfg)

        # Test pos_embed
        cfg = deepcopy(self.cfg)
        cfg['use_abs_pos_emb'] = True
        model = BEiTViT(**cfg)
        # 197 = (224 / 16)^2 patches + 1 cls token
        self.assertEqual(model.pos_embed.shape, (1, 197, 768))

        # Test model structure
        cfg = deepcopy(self.cfg)
        cfg['drop_path_rate'] = 0.1
        model = BEiTViT(**cfg)
        self.assertEqual(len(model.layers), 12)
        # Stochastic-depth drop prob increases linearly across layers.
        dpr_inc = 0.1 / (12 - 1)
        dpr = 0
        for layer in model.layers:
            self.assertEqual(layer.gamma_1.shape, (768, ))
            self.assertEqual(layer.gamma_2.shape, (768, ))
            self.assertEqual(layer.attn.embed_dims, 768)
            self.assertEqual(layer.attn.num_heads, 12)
            self.assertEqual(layer.ffn.feedforward_channels, 3072)
            self.assertFalse(layer.ffn.add_identity)
            self.assertAlmostEqual(layer.ffn.dropout_layer.drop_prob, dpr)
            dpr += dpr_inc

    def test_forward(self):
        imgs = torch.randn(1, 3, 224, 224)

        # cls_token output: a single (1, 768) tensor
        cfg = deepcopy(self.cfg)
        cfg['out_type'] = 'cls_token'
        model = BEiTViT(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        cls_token = outs[-1]
        self.assertEqual(cls_token.shape, (1, 768))

        # test without output cls_token (default out_type)
        cfg = deepcopy(self.cfg)
        model = BEiTViT(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        patch_token = outs[-1]
        self.assertEqual(patch_token.shape, (1, 768))

        # test without average: spatial feature map of 14x14 patches
        cfg = deepcopy(self.cfg)
        cfg['out_type'] = 'featmap'
        model = BEiTViT(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        patch_token = outs[-1]
        self.assertEqual(patch_token.shape, (1, 768, 14, 14))

        # Test forward with multi out indices
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = [-3, -2, -1]
        model = BEiTViT(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 3)
        for out in outs:
            patch_token = out
            self.assertEqual(patch_token.shape, (1, 768))
# Copyright (c) OpenMMLab. All rights reserved.
from copy import deepcopy
import pytest
import torch
from torch.nn.modules import GroupNorm
from torch.nn.modules.batchnorm import _BatchNorm
from mmpretrain.models.backbones import Conformer
def is_norm(modules):
    """Check if ``modules`` is a normalization layer.

    Args:
        modules: The module to inspect.

    Returns:
        bool: True if the module is a ``GroupNorm`` or any ``_BatchNorm``
        subclass, False otherwise.
    """
    # Return the boolean directly instead of `if ...: return True/False`.
    return isinstance(modules, (GroupNorm, _BatchNorm))
def check_norm_state(modules, train_state):
    """Check if norm layer is in correct train state.

    Only ``_BatchNorm`` layers are inspected; every other module is ignored.
    Returns True when all batch-norm layers have ``training == train_state``.
    """
    return all(layer.training == train_state for layer in modules
               if isinstance(layer, _BatchNorm))
@torch.no_grad()  # To save memory
def test_conformer_backbone():
    """Smoke-test the Conformer backbone: arch validation, forward shapes
    with regular/irregular inputs, custom arch and multi-stage outputs."""
    cfg_ori = dict(
        arch='T',
        drop_path_rate=0.1,
    )

    with pytest.raises(AssertionError):
        # test invalid arch alias
        cfg = deepcopy(cfg_ori)
        cfg['arch'] = 'unknown'
        Conformer(**cfg)

    with pytest.raises(AssertionError):
        # test arch dict without essential keys (e.g. missing 'depths')
        cfg = deepcopy(cfg_ori)
        cfg['arch'] = {'embed_dims': 24, 'channel_ratio': 6, 'num_heads': 9}
        Conformer(**cfg)

    # Test Conformer small model with patch size of 16
    model = Conformer(**cfg_ori)
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), True)

    imgs = torch.randn(1, 3, 224, 224)
    # each output element is a (conv branch, transformer branch) pair
    conv_feature, transformer_feature = model(imgs)[-1]
    assert conv_feature.shape == (1, 64 * 1 * 4
                                  )  # base_channels * channel_ratio * 4
    assert transformer_feature.shape == (1, 384)

    # Test Conformer with irregular input size.
    model = Conformer(**cfg_ori)
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), True)

    imgs = torch.randn(1, 3, 241, 241)
    conv_feature, transformer_feature = model(imgs)[-1]
    assert conv_feature.shape == (1, 64 * 1 * 4
                                  )  # base_channels * channel_ratio * 4
    assert transformer_feature.shape == (1, 384)

    imgs = torch.randn(1, 3, 321, 221)
    conv_feature, transformer_feature = model(imgs)[-1]
    assert conv_feature.shape == (1, 64 * 1 * 4
                                  )  # base_channels * channel_ratio * 4
    assert transformer_feature.shape == (1, 384)

    # Test custom arch Conformer without output cls token
    # (note: reuses the 321x221 irregular `imgs` from above)
    cfg = deepcopy(cfg_ori)
    cfg['arch'] = {
        'embed_dims': 128,
        'depths': 15,
        'num_heads': 16,
        'channel_ratio': 3,
    }
    cfg['with_cls_token'] = False
    cfg['base_channels'] = 32
    model = Conformer(**cfg)
    conv_feature, transformer_feature = model(imgs)[-1]
    assert conv_feature.shape == (1, 32 * 3 * 4)
    assert transformer_feature.shape == (1, 128)

    # Test Conformer with multi out indices; conv channels double per stage
    cfg = deepcopy(cfg_ori)
    cfg['out_indices'] = [4, 8, 12]
    model = Conformer(**cfg)
    outs = model(imgs)
    assert len(outs) == 3
    # stage 1
    conv_feature, transformer_feature = outs[0]
    assert conv_feature.shape == (1, 64 * 1)
    assert transformer_feature.shape == (1, 384)
    # stage 2
    conv_feature, transformer_feature = outs[1]
    assert conv_feature.shape == (1, 64 * 1 * 2)
    assert transformer_feature.shape == (1, 384)
    # stage 3
    conv_feature, transformer_feature = outs[2]
    assert conv_feature.shape == (1, 64 * 1 * 4)
    assert transformer_feature.shape == (1, 384)
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmpretrain.models.backbones import ConvMixer
def test_assertion():
    """ConvMixer should reject bad ``arch`` and ``out_indices`` values."""
    bad_inits = [
        dict(arch='unknown'),  # unknown arch alias
        dict(arch=dict(channels=[2, 3, 4, 5])),  # arch missing essential keys
        dict(out_indices=-100),  # out_indices beyond valid depth
    ]
    for kwargs in bad_inits:
        with pytest.raises(AssertionError):
            ConvMixer(**kwargs)
@torch.no_grad()  # To save memory
def test_convmixer():
    """Forward ConvMixer variants and check output feature shapes, plus
    multi-output, custom arch, even kernel size and frozen stages."""
    # Test forward
    model = ConvMixer(arch='768/32')
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size([1, 768, 32, 32])

    # Test forward with multiple outputs (one per depth stage)
    model = ConvMixer(arch='768/32', out_indices=range(32))

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 32
    for f in feat:
        assert f.shape == torch.Size([1, 768, 32, 32])

    # Test with custom arch
    model = ConvMixer(
        arch={
            'embed_dims': 99,
            'depth': 5,
            'patch_size': 5,
            'kernel_size': 9
        },
        out_indices=range(5))
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 5
    for f in feat:
        # spatial size 44 = floor(224 / patch_size 5)
        assert f.shape == torch.Size([1, 99, 44, 44])

    # Test with even kernel size arch
    model = ConvMixer(arch={
        'embed_dims': 99,
        'depth': 5,
        'patch_size': 5,
        'kernel_size': 8
    })
    model.init_weights()
    model.train()
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size([1, 99, 44, 44])

    # Test frozen_stages: first 10 stages frozen, the rest trainable
    model = ConvMixer(arch='768/32', frozen_stages=10)
    model.init_weights()
    model.train()
    for i in range(10):
        assert not model.stages[i].training
    for i in range(10, 32):
        assert model.stages[i].training
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmpretrain.models.backbones import ConvNeXt
def test_assertion():
    """ConvNeXt should reject unknown or incomplete ``arch`` settings."""
    bad_archs = [
        'unknown',  # not a predefined arch alias
        dict(channels=[2, 3, 4, 5]),  # arch dict missing 'depths'
        dict(depths=[2, 3, 4], channels=[2, 3, 4, 5]),  # mismatched lengths
    ]
    for arch in bad_archs:
        with pytest.raises(AssertionError):
            ConvNeXt(arch=arch)
def test_convnext():
    """Forward ConvNeXt variants and check output feature shapes for
    single/multi outputs, custom arch, no-GAP mode, frozen stages and
    activation checkpointing."""
    # Test forward
    model = ConvNeXt(arch='tiny', out_indices=-1)
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size([1, 768])

    # Test forward with multiple outputs
    model = ConvNeXt(arch='small', out_indices=(0, 1, 2, 3))

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 4
    # channels double at each of the four stages
    assert feat[0].shape == torch.Size([1, 96])
    assert feat[1].shape == torch.Size([1, 192])
    assert feat[2].shape == torch.Size([1, 384])
    assert feat[3].shape == torch.Size([1, 768])

    # Test with custom arch (five stages)
    model = ConvNeXt(
        arch={
            'depths': [2, 3, 4, 5, 6],
            'channels': [16, 32, 64, 128, 256]
        },
        out_indices=(0, 1, 2, 3, 4))
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 5
    assert feat[0].shape == torch.Size([1, 16])
    assert feat[1].shape == torch.Size([1, 32])
    assert feat[2].shape == torch.Size([1, 64])
    assert feat[3].shape == torch.Size([1, 128])
    assert feat[4].shape == torch.Size([1, 256])

    # Test without gap before final norm: outputs keep spatial dims
    model = ConvNeXt(
        arch='small', out_indices=(0, 1, 2, 3), gap_before_final_norm=False)
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 4
    assert feat[0].shape == torch.Size([1, 96, 56, 56])
    assert feat[1].shape == torch.Size([1, 192, 28, 28])
    assert feat[2].shape == torch.Size([1, 384, 14, 14])
    assert feat[3].shape == torch.Size([1, 768, 7, 7])

    # Test frozen_stages: downsample layer and stage both frozen
    model = ConvNeXt(arch='small', out_indices=(0, 1, 2, 3), frozen_stages=2)
    model.init_weights()
    model.train()

    for i in range(2):
        assert not model.downsample_layers[i].training
        assert not model.stages[i].training
    for i in range(2, 4):
        assert model.downsample_layers[i].training
        assert model.stages[i].training

    # Test Activation Checkpointing
    model = ConvNeXt(arch='tiny', out_indices=-1, with_cp=True)
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size([1, 768])

    # Test linear_pw_conv=False (conv-based pointwise layers)
    model = ConvNeXt(arch='tiny', out_indices=-1, linear_pw_conv=False)
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size([1, 768])
# Copyright (c) OpenMMLab. All rights reserved.
from copy import deepcopy
from functools import partial
from unittest import TestCase
import torch
from mmcv.cnn import ConvModule
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm
from mmpretrain.models.backbones import CSPDarkNet, CSPResNet, CSPResNeXt
from mmpretrain.models.backbones.cspnet import (CSPNet, DarknetBottleneck,
ResNetBottleneck,
ResNeXtBottleneck)
class TestCSPNet(TestCase):
    """Structural checks for the generic CSPNet builder."""

    def setUp(self):
        # A three-stage arch mixing all three bottleneck implementations.
        self.arch = dict(
            block_fn=(DarknetBottleneck, ResNetBottleneck, ResNeXtBottleneck),
            in_channels=(32, 64, 128),
            out_channels=(64, 128, 256),
            num_blocks=(1, 2, 8),
            expand_ratio=(2, 1, 1),
            bottle_ratio=(3, 1, 1),
            has_downsampler=True,
            down_growth=True,
            block_args=({}, {}, dict(base_channels=32)))
        self.stem_fn = partial(torch.nn.Conv2d, out_channels=32, kernel_size=3)

    def test_structure(self):
        """Each stage should be built with its configured block type."""
        model = CSPNet(arch=self.arch, stem_fn=self.stem_fn, out_indices=[-1])
        expected_blocks = (DarknetBottleneck, ResNetBottleneck,
                           ResNeXtBottleneck)
        self.assertEqual(len(model.stages), 3)
        for stage, block_cls in zip(model.stages, expected_blocks):
            self.assertEqual(type(stage.blocks[0]), block_cls)
class TestCSPDarkNet(TestCase):
    """Tests for CSPDarkNet; also the shared base class for the CSPResNet
    and CSPResNeXt test-cases below, which only override ``setUp``."""

    def setUp(self):
        # Attributes read by the shared test methods; subclasses override.
        self.class_name = CSPDarkNet
        self.cfg = dict(depth=53)
        self.out_channels = [64, 128, 256, 512, 1024]
        self.all_out_indices = [0, 1, 2, 3, 4]
        self.frozen_stages = 2
        self.stem_down = (1, 1)  # stem downsampling factor (w, h)
        self.num_stages = 5

    def test_structure(self):
        """Invalid configs must raise; a valid config builds all stages."""
        # Test invalid default depths
        with self.assertRaisesRegex(AssertionError, 'depth must be one of'):
            cfg = deepcopy(self.cfg)
            cfg['depth'] = 'unknown'
            self.class_name(**cfg)

        # Test out_indices: wrong type and out-of-range index
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = {1: 1}
        with self.assertRaisesRegex(AssertionError, "get <class 'dict'>"):
            self.class_name(**cfg)
        cfg['out_indices'] = [0, 13]
        with self.assertRaisesRegex(AssertionError, 'Invalid out_indices 13'):
            self.class_name(**cfg)

        # Test model structure
        cfg = deepcopy(self.cfg)
        model = self.class_name(**cfg)
        self.assertEqual(len(model.stages), self.num_stages)

    def test_forward(self):
        """Check output shapes for default and all-stage out_indices, and
        that frozen stages stop training/grad."""
        imgs = torch.randn(3, 3, 224, 224)

        cfg = deepcopy(self.cfg)
        model = self.class_name(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        self.assertEqual(outs[-1].size(), (3, self.out_channels[-1], 7, 7))

        # Test forward with multi out indices
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = self.all_out_indices
        model = self.class_name(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), len(self.all_out_indices))
        # each stage halves the spatial size relative to the stem output
        w, h = 224 / self.stem_down[0], 224 / self.stem_down[1]
        for i, out in enumerate(outs):
            self.assertEqual(
                out.size(),
                (3, self.out_channels[i], w // 2**(i + 1), h // 2**(i + 1)))

        # Test frozen stages: stem plus stages[0..frozen_stages] must have
        # eval-mode BN and no trainable parameters.
        cfg = deepcopy(self.cfg)
        cfg['frozen_stages'] = self.frozen_stages
        model = self.class_name(**cfg)
        model.init_weights()
        model.train()
        assert model.stem.training is False
        for param in model.stem.parameters():
            assert param.requires_grad is False
        for i in range(self.frozen_stages + 1):
            stage = model.stages[i]
            for mod in stage.modules():
                if isinstance(mod, _BatchNorm):
                    assert mod.training is False, i
            for param in stage.parameters():
                assert param.requires_grad is False
class TestCSPResNet(TestCSPDarkNet):
    """Run the shared CSP backbone checks against CSPResNet."""

    def setUp(self):
        self.class_name = CSPResNet
        self.cfg = dict(depth=50)
        self.num_stages = 4
        self.stem_down = (2, 2)
        self.frozen_stages = 2
        self.out_channels = [128, 256, 512, 1024]
        self.all_out_indices = [0, 1, 2, 3]

    def test_deep_stem(self):
        """With ``deep_stem`` the stem should be three stacked ConvModules."""
        cfg = deepcopy(self.cfg)
        cfg['deep_stem'] = True
        model = self.class_name(**cfg)
        self.assertEqual(len(model.stem), 3)
        for stem_conv in model.stem:
            self.assertEqual(type(stem_conv), ConvModule)
class TestCSPResNeXt(TestCSPDarkNet):
    """Run the shared CSP backbone checks against CSPResNeXt."""

    def setUp(self):
        self.class_name = CSPResNeXt
        self.cfg = dict(depth=50)
        self.num_stages = 4
        self.stem_down = (2, 2)
        self.frozen_stages = 2
        self.out_channels = [256, 512, 1024, 2048]
        self.all_out_indices = [0, 1, 2, 3]
if __name__ == '__main__':
    # Allow invoking this test file directly with the unittest runner
    # (outside of pytest).
    import unittest
    unittest.main()
# Copyright (c) OpenMMLab. All rights reserved.
from copy import deepcopy
from unittest import TestCase
import torch
from mmpretrain.models.backbones import DaViT
from mmpretrain.models.backbones.davit import SpatialBlock
class TestDaViT(TestCase):
    """Tests for the DaViT backbone: arch validation, weight init and
    forward output shapes."""

    def setUp(self):
        self.cfg = dict(arch='t', patch_size=4, drop_path_rate=0.1)

    def test_structure(self):
        """Invalid archs must raise; custom arch builds matching stages."""
        # Test invalid default arch
        with self.assertRaisesRegex(AssertionError, 'not in default archs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = 'unknown'
            DaViT(**cfg)

        # Test invalid custom arch (missing required keys)
        with self.assertRaisesRegex(AssertionError, 'Custom arch needs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = {
                'num_layers': 24,
                'num_heads': 16,
                'feedforward_channels': 4096
            }
            DaViT(**cfg)

        # Test custom arch
        cfg = deepcopy(self.cfg)
        cfg['arch'] = {
            'embed_dims': 64,
            'num_heads': [3, 3, 3, 3],
            'depths': [1, 1, 2, 1]
        }
        model = DaViT(**cfg)
        self.assertEqual(model.embed_dims, 64)
        self.assertEqual(model.num_layers, 4)
        for layer in model.stages:
            self.assertEqual(
                layer.blocks[0].spatial_block.attn.w_msa.num_heads, 3)

    def test_init_weights(self):
        """Custom init_cfg should actually change patch-embed weights."""
        # test weight init cfg
        cfg = deepcopy(self.cfg)
        cfg['init_cfg'] = [
            dict(
                type='Kaiming',
                layer='Conv2d',
                mode='fan_in',
                nonlinearity='linear')
        ]
        model = DaViT(**cfg)
        ori_weight = model.patch_embed.projection.weight.clone().detach()

        model.init_weights()
        initialized_weight = model.patch_embed.projection.weight
        self.assertFalse(torch.allclose(ori_weight, initialized_weight))

    def test_forward(self):
        """Check output shapes for default/multi out_indices, checkpointing
        and dynamic input sizes."""
        imgs = torch.randn(1, 3, 224, 224)

        cfg = deepcopy(self.cfg)
        model = DaViT(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        self.assertEqual(outs[0].shape, (1, 768, 7, 7))

        # Test forward with multi out indices
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = [2, 3]
        model = DaViT(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 2)
        self.assertEqual(outs[0].shape, (1, 384, 14, 14))
        self.assertEqual(outs[1].shape, (1, 768, 7, 7))

        # test with checkpoint forward (with_cp propagates to SpatialBlock)
        cfg = deepcopy(self.cfg)
        cfg['with_cp'] = True
        model = DaViT(**cfg)
        for m in model.modules():
            if isinstance(m, SpatialBlock):
                self.assertTrue(m.with_cp)
        model.init_weights()
        model.train()

        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        self.assertEqual(outs[0].shape, (1, 768, 7, 7))

        # Test forward with dynamic input size (overall stride is 32)
        imgs1 = torch.randn(1, 3, 224, 224)
        imgs2 = torch.randn(1, 3, 256, 256)
        imgs3 = torch.randn(1, 3, 256, 309)
        cfg = deepcopy(self.cfg)
        model = DaViT(**cfg)
        for imgs in [imgs1, imgs2, imgs3]:
            outs = model(imgs)
            self.assertIsInstance(outs, tuple)
            self.assertEqual(len(outs), 1)
            expect_feat_shape = (imgs.shape[2] // 32, imgs.shape[3] // 32)
            self.assertEqual(outs[0].shape, (1, 768, *expect_feat_shape))
# Copyright (c) OpenMMLab. All rights reserved.
import math
import os
import tempfile
from copy import deepcopy
from unittest import TestCase
import torch
from mmengine.runner import load_checkpoint, save_checkpoint
from mmpretrain.models.backbones import DistilledVisionTransformer
from .utils import timm_resize_pos_embed
class TestDeiT(TestCase):
    """Tests for DistilledVisionTransformer (DeiT): weight init, checkpoint
    round-trips with pos_embed resizing, and forward output shapes."""

    def setUp(self):
        self.cfg = dict(
            arch='deit-tiny', img_size=224, patch_size=16, drop_rate=0.1)

    def test_init_weights(self):
        """Init must fill dist_token/weights; checkpoints must reload, with
        pos_embed resized when img_size differs."""
        # test weight init cfg
        cfg = deepcopy(self.cfg)
        cfg['init_cfg'] = [
            dict(
                type='Kaiming',
                layer='Conv2d',
                mode='fan_in',
                nonlinearity='linear')
        ]
        model = DistilledVisionTransformer(**cfg)
        ori_weight = model.patch_embed.projection.weight.clone().detach()
        # The dist_token is all zero before initialization
        self.assertTrue(torch.allclose(model.dist_token, torch.tensor(0.)))

        model.init_weights()
        initialized_weight = model.patch_embed.projection.weight
        self.assertFalse(torch.allclose(ori_weight, initialized_weight))
        self.assertFalse(torch.allclose(model.dist_token, torch.tensor(0.)))

        # test load checkpoint
        pretrain_pos_embed = model.pos_embed.clone().detach()
        tmpdir = tempfile.gettempdir()
        checkpoint = os.path.join(tmpdir, 'test.pth')
        save_checkpoint(model.state_dict(), checkpoint)
        cfg = deepcopy(self.cfg)
        model = DistilledVisionTransformer(**cfg)
        load_checkpoint(model, checkpoint, strict=True)
        self.assertTrue(torch.allclose(model.pos_embed, pretrain_pos_embed))

        # test load checkpoint with different img_size: the loaded
        # pos_embed must match timm-style interpolation (2 extra tokens:
        # cls + dist)
        cfg = deepcopy(self.cfg)
        cfg['img_size'] = 384
        model = DistilledVisionTransformer(**cfg)
        load_checkpoint(model, checkpoint, strict=True)
        resized_pos_embed = timm_resize_pos_embed(
            pretrain_pos_embed, model.pos_embed, num_tokens=2)
        self.assertTrue(torch.allclose(model.pos_embed, resized_pos_embed))

        os.remove(checkpoint)

    def test_forward(self):
        """Check (cls_token, dist_token) / featmap output shapes, including
        multi out_indices and dynamic input sizes."""
        imgs = torch.randn(1, 3, 224, 224)

        # test with output cls_token: each output is a (cls, dist) pair
        cfg = deepcopy(self.cfg)
        model = DistilledVisionTransformer(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        cls_token, dist_token = outs[-1]
        self.assertEqual(cls_token.shape, (1, 192))
        self.assertEqual(dist_token.shape, (1, 192))

        # test without output cls_token (spatial feature map instead)
        cfg = deepcopy(self.cfg)
        cfg['out_type'] = 'featmap'
        model = DistilledVisionTransformer(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        patch_token = outs[-1]
        self.assertEqual(patch_token.shape, (1, 192, 14, 14))

        # Test forward with multi out indices
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = [-3, -2, -1]
        model = DistilledVisionTransformer(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 3)
        for out in outs:
            cls_token, dist_token = out
            self.assertEqual(cls_token.shape, (1, 192))
            self.assertEqual(dist_token.shape, (1, 192))

        # Test forward with dynamic input size (patch stride 16)
        imgs1 = torch.randn(1, 3, 224, 224)
        imgs2 = torch.randn(1, 3, 256, 256)
        imgs3 = torch.randn(1, 3, 256, 309)
        cfg = deepcopy(self.cfg)
        cfg['out_type'] = 'featmap'
        model = DistilledVisionTransformer(**cfg)
        for imgs in [imgs1, imgs2, imgs3]:
            outs = model(imgs)
            self.assertIsInstance(outs, tuple)
            self.assertEqual(len(outs), 1)
            featmap = outs[-1]
            expect_feat_shape = (math.ceil(imgs.shape[2] / 16),
                                 math.ceil(imgs.shape[3] / 16))
            self.assertEqual(featmap.shape, (1, 192, *expect_feat_shape))
# Copyright (c) OpenMMLab. All rights reserved.
import math
import os
import tempfile
from copy import deepcopy
from unittest import TestCase
import torch
from mmengine.runner import load_checkpoint, save_checkpoint
from mmpretrain.models.backbones import DeiT3
class TestDeiT3(TestCase):
    """Tests for DeiT3: arch/structure validation, weight init with
    checkpoint round-trips, and forward output shapes."""

    def setUp(self):
        self.cfg = dict(
            arch='b', img_size=224, patch_size=16, drop_path_rate=0.1)

    def test_structure(self):
        """Invalid archs/out_indices must raise; default arch builds 12
        layers with linearly increasing drop-path rate."""
        # Test invalid default arch
        with self.assertRaisesRegex(AssertionError, 'not in default archs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = 'unknown'
            DeiT3(**cfg)

        # Test invalid custom arch (missing 'embed_dims')
        with self.assertRaisesRegex(AssertionError, 'Custom arch needs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = {
                'num_layers': 24,
                'num_heads': 16,
                'feedforward_channels': 4096
            }
            DeiT3(**cfg)

        # Test custom arch
        cfg = deepcopy(self.cfg)
        cfg['arch'] = {
            'embed_dims': 128,
            'num_layers': 24,
            'num_heads': 16,
            'feedforward_channels': 1024
        }
        model = DeiT3(**cfg)
        self.assertEqual(model.embed_dims, 128)
        self.assertEqual(model.num_layers, 24)
        for layer in model.layers:
            self.assertEqual(layer.attn.num_heads, 16)
            self.assertEqual(layer.ffn.feedforward_channels, 1024)

        # Test out_indices: wrong type and out-of-range index
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = {1: 1}
        with self.assertRaisesRegex(AssertionError, "get <class 'dict'>"):
            DeiT3(**cfg)
        cfg['out_indices'] = [0, 13]
        with self.assertRaisesRegex(AssertionError, 'Invalid out_indices 13'):
            DeiT3(**cfg)

        # Test model structure: stochastic depth rate grows linearly from 0
        # to drop_path_rate across the 12 layers.
        cfg = deepcopy(self.cfg)
        model = DeiT3(**cfg)
        self.assertEqual(len(model.layers), 12)
        dpr_inc = 0.1 / (12 - 1)
        dpr = 0
        for layer in model.layers:
            self.assertEqual(layer.attn.embed_dims, 768)
            self.assertEqual(layer.attn.num_heads, 12)
            self.assertEqual(layer.ffn.feedforward_channels, 3072)
            self.assertAlmostEqual(layer.attn.out_drop.drop_prob, dpr)
            self.assertAlmostEqual(layer.ffn.dropout_layer.drop_prob, dpr)
            dpr += dpr_inc

    def test_init_weights(self):
        """Init must fill pos_embed/weights; checkpoints must reload, also
        with a different img_size."""
        # test weight init cfg
        cfg = deepcopy(self.cfg)
        cfg['init_cfg'] = [
            dict(
                type='Kaiming',
                layer='Conv2d',
                mode='fan_in',
                nonlinearity='linear')
        ]
        model = DeiT3(**cfg)
        ori_weight = model.patch_embed.projection.weight.clone().detach()
        # The pos_embed is all zero before initialize
        self.assertTrue(torch.allclose(model.pos_embed, torch.tensor(0.)))

        model.init_weights()
        initialized_weight = model.patch_embed.projection.weight
        self.assertFalse(torch.allclose(ori_weight, initialized_weight))
        self.assertFalse(torch.allclose(model.pos_embed, torch.tensor(0.)))

        # test load checkpoint
        pretrain_pos_embed = model.pos_embed.clone().detach()
        tmpdir = tempfile.gettempdir()
        checkpoint = os.path.join(tmpdir, 'test.pth')
        save_checkpoint(model.state_dict(), checkpoint)
        cfg = deepcopy(self.cfg)
        model = DeiT3(**cfg)
        load_checkpoint(model, checkpoint, strict=True)
        self.assertTrue(torch.allclose(model.pos_embed, pretrain_pos_embed))

        # test load checkpoint with different img_size (should not raise)
        cfg = deepcopy(self.cfg)
        cfg['img_size'] = 384
        model = DeiT3(**cfg)
        load_checkpoint(model, checkpoint, strict=True)

        os.remove(checkpoint)

    def test_forward(self):
        """Check output shapes for cls_token/featmap out_types, multi
        out_indices and dynamic input sizes."""
        imgs = torch.randn(1, 3, 224, 224)

        # test with_cls_token=False: cls_token out_type is rejected
        cfg = deepcopy(self.cfg)
        cfg['with_cls_token'] = False
        cfg['out_type'] = 'cls_token'
        with self.assertRaisesRegex(ValueError, 'must be True'):
            DeiT3(**cfg)

        cfg = deepcopy(self.cfg)
        cfg['with_cls_token'] = False
        cfg['out_type'] = 'featmap'
        model = DeiT3(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        patch_token = outs[-1]
        self.assertEqual(patch_token.shape, (1, 768, 14, 14))

        # test with output cls_token
        cfg = deepcopy(self.cfg)
        model = DeiT3(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        cls_token = outs[-1]
        self.assertEqual(cls_token.shape, (1, 768))

        # Test forward with multi out indices
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = [-3, -2, -1]
        model = DeiT3(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 3)
        for out in outs:
            cls_token = out
            self.assertEqual(cls_token.shape, (1, 768))

        # Test forward with dynamic input size (patch stride 16)
        imgs1 = torch.randn(1, 3, 224, 224)
        imgs2 = torch.randn(1, 3, 256, 256)
        imgs3 = torch.randn(1, 3, 256, 309)
        cfg = deepcopy(self.cfg)
        cfg['out_type'] = 'featmap'
        model = DeiT3(**cfg)
        for imgs in [imgs1, imgs2, imgs3]:
            outs = model(imgs)
            self.assertIsInstance(outs, tuple)
            self.assertEqual(len(outs), 1)
            featmap = outs[-1]
            expect_feat_shape = (math.ceil(imgs.shape[2] / 16),
                                 math.ceil(imgs.shape[3] / 16))
            self.assertEqual(featmap.shape, (1, 768, *expect_feat_shape))
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmpretrain.models.backbones import DenseNet
def test_assertion():
    """DenseNet should reject bad ``arch`` and ``out_indices`` values."""
    bad_inits = [
        dict(arch='unknown'),  # unknown arch alias
        dict(arch=dict(channels=[2, 3, 4, 5])),  # arch missing essential keys
        dict(out_indices=-100),  # out_indices beyond valid depth
    ]
    for kwargs in bad_inits:
        with pytest.raises(AssertionError):
            DenseNet(**kwargs)
def test_DenseNet():
    """Forward DenseNet variants and check feature shapes: default arch,
    memory-efficient mode, drop rate, multi outputs, custom arch and
    frozen stages."""
    # Test forward
    model = DenseNet(arch='121')
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size([1, 1024, 7, 7])

    # Test memory efficient option (same output as default)
    model = DenseNet(arch='121', memory_efficient=True)
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size([1, 1024, 7, 7])

    # Test drop rate (same output shape)
    model = DenseNet(arch='121', drop_rate=0.05)
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size([1, 1024, 7, 7])

    # Test forward with multiple outputs
    model = DenseNet(arch='121', out_indices=(0, 1, 2, 3))

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 4
    assert feat[0].shape == torch.Size([1, 128, 28, 28])
    assert feat[1].shape == torch.Size([1, 256, 14, 14])
    assert feat[2].shape == torch.Size([1, 512, 7, 7])
    assert feat[3].shape == torch.Size([1, 1024, 7, 7])

    # Test with custom arch (five dense blocks)
    model = DenseNet(
        arch={
            'growth_rate': 20,
            'depths': [4, 8, 12, 16, 20],
            'init_channels': 40,
        },
        out_indices=(0, 1, 2, 3, 4))
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 5
    assert feat[0].shape == torch.Size([1, 60, 28, 28])
    assert feat[1].shape == torch.Size([1, 110, 14, 14])
    assert feat[2].shape == torch.Size([1, 175, 7, 7])
    assert feat[3].shape == torch.Size([1, 247, 3, 3])
    assert feat[4].shape == torch.Size([1, 647, 3, 3])

    # Test frozen_stages: stage and matching transition both frozen
    model = DenseNet(arch='121', out_indices=(0, 1, 2, 3), frozen_stages=2)
    model.init_weights()
    model.train()

    for i in range(2):
        assert not model.stages[i].training
        assert not model.transitions[i].training

    for i in range(2, 4):
        assert model.stages[i].training
        assert model.transitions[i].training
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmpretrain.models.backbones import EdgeNeXt
def test_assertion():
    """EdgeNeXt should reject unknown or incomplete ``arch`` settings."""
    bad_archs = [
        'unknown',  # not a predefined arch alias
        dict(channels=[24, 48, 88, 168]),  # arch dict missing 'depths'
        dict(depths=[2, 2, 6, 2], channels=[24, 48, 88, 168]),  # incomplete
    ]
    for arch in bad_archs:
        with pytest.raises(AssertionError):
            EdgeNeXt(arch=arch)
def test_edgenext():
    """Forward EdgeNeXt variants and check output feature shapes for
    single/multi outputs, custom arch, no-GAP mode and frozen stages."""
    # Test forward
    model = EdgeNeXt(arch='xxsmall', out_indices=-1)
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 1
    assert feat[0].shape == torch.Size([1, 168])

    # Test forward with multiple outputs
    # (note: reuses `imgs` from the previous section)
    model = EdgeNeXt(arch='xxsmall', out_indices=(0, 1, 2, 3))

    feat = model(imgs)
    assert len(feat) == 4
    assert feat[0].shape == torch.Size([1, 24])
    assert feat[1].shape == torch.Size([1, 48])
    assert feat[2].shape == torch.Size([1, 88])
    assert feat[3].shape == torch.Size([1, 168])

    # Test with custom arch
    model = EdgeNeXt(
        arch={
            'depths': [2, 3, 4, 5],
            'channels': [20, 40, 80, 160],
            'num_heads': [4, 4, 4, 4]
        },
        out_indices=(0, 1, 2, 3))
    model.init_weights()
    model.train()

    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 4
    assert feat[0].shape == torch.Size([1, 20])
    assert feat[1].shape == torch.Size([1, 40])
    assert feat[2].shape == torch.Size([1, 80])
    assert feat[3].shape == torch.Size([1, 160])

    # Test without gap before final norm: outputs keep spatial dims
    model = EdgeNeXt(
        arch='small', out_indices=(0, 1, 2, 3), gap_before_final_norm=False)
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 4
    assert feat[0].shape == torch.Size([1, 48, 56, 56])
    assert feat[1].shape == torch.Size([1, 96, 28, 28])
    assert feat[2].shape == torch.Size([1, 160, 14, 14])
    assert feat[3].shape == torch.Size([1, 304, 7, 7])

    # Test frozen_stages: downsample layer and stage both frozen
    model = EdgeNeXt(arch='small', out_indices=(0, 1, 2, 3), frozen_stages=2)
    model.init_weights()
    model.train()

    for i in range(2):
        assert not model.downsample_layers[i].training
        assert not model.stages[i].training

    for i in range(2, 4):
        assert model.downsample_layers[i].training
        assert model.stages[i].training
# Copyright (c) OpenMMLab. All rights reserved.
from copy import deepcopy
from unittest import TestCase
import torch
from mmcv.cnn import ConvModule
from torch import nn
from mmpretrain.models.backbones import EfficientFormer
from mmpretrain.models.backbones.efficientformer import (AttentionWithBias,
Flat, Meta3D, Meta4D)
from mmpretrain.models.backbones.poolformer import Pooling
class TestEfficientFormer(TestCase):
    """Tests for EfficientFormer: arch validation, stage composition,
    weight init, forward output shapes and frozen stages."""

    def setUp(self):
        self.cfg = dict(arch='l1', drop_path_rate=0.1)
        self.arch = EfficientFormer.arch_settings['l1']
        # Custom 4-stage arch whose last 2 blocks are ViT-style (Meta3D).
        self.custom_arch = {
            'layers': [1, 1, 1, 4],
            'embed_dims': [48, 96, 224, 448],
            'downsamples': [False, True, True, True],
            'vit_num': 2,
        }
        self.custom_cfg = dict(arch=self.custom_arch)

    def test_arch(self):
        """Invalid archs must raise; a valid custom arch composes stages of
        downsampler + Meta4D blocks, plus Meta3D blocks after Flat."""
        # Test invalid default arch
        with self.assertRaisesRegex(AssertionError, 'Unavailable arch'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = 'unknown'
            EfficientFormer(**cfg)

        # Test invalid custom arch (missing 'layers')
        with self.assertRaisesRegex(AssertionError, 'must have'):
            cfg = deepcopy(self.custom_cfg)
            cfg['arch'].pop('layers')
            EfficientFormer(**cfg)

        # Test vit_num < 0
        with self.assertRaisesRegex(AssertionError, "'vit_num' must"):
            cfg = deepcopy(self.custom_cfg)
            cfg['arch']['vit_num'] = -1
            EfficientFormer(**cfg)

        # Test vit_num > last stage layers
        with self.assertRaisesRegex(AssertionError, "'vit_num' must"):
            cfg = deepcopy(self.custom_cfg)
            cfg['arch']['vit_num'] = 10
            EfficientFormer(**cfg)

        # Test out_ind with an invalid type
        with self.assertRaisesRegex(AssertionError, '"out_indices" must'):
            cfg = deepcopy(self.custom_cfg)
            cfg['out_indices'] = dict
            EfficientFormer(**cfg)

        # Test custom arch
        cfg = deepcopy(self.custom_cfg)
        model = EfficientFormer(**cfg)
        self.assertEqual(len(model.patch_embed), 2)
        layers = self.custom_arch['layers']
        downsamples = self.custom_arch['downsamples']
        vit_num = self.custom_arch['vit_num']

        for i, stage in enumerate(model.network):
            if downsamples[i]:
                # stride-2 ConvModule at the head of a downsampling stage
                self.assertIsInstance(stage[0], ConvModule)
                self.assertEqual(stage[0].conv.stride, (2, 2))
                self.assertTrue(hasattr(stage[0].conv, 'bias'))
                self.assertTrue(isinstance(stage[0].bn, nn.BatchNorm2d))

            if i < len(model.network) - 1:
                # non-last stages end with a pooling-mixer Meta4D block;
                # downsamples[i] (bool) counts as 0/1 extra module
                self.assertIsInstance(stage[-1], Meta4D)
                self.assertIsInstance(stage[-1].token_mixer, Pooling)
                self.assertEqual(len(stage) - downsamples[i], layers[i])
            elif vit_num > 0:
                # last stage ends with attention-mixer Meta3D blocks,
                # preceded by a Flat layer (hence the extra -1)
                self.assertIsInstance(stage[-1], Meta3D)
                self.assertIsInstance(stage[-1].token_mixer, AttentionWithBias)
                self.assertEqual(len(stage) - downsamples[i] - 1, layers[i])
                flat_layer_idx = len(stage) - vit_num - downsamples[i]
                self.assertIsInstance(stage[flat_layer_idx], Flat)
                count = 0
                for layer in stage:
                    if isinstance(layer, Meta3D):
                        count += 1
                self.assertEqual(count, vit_num)

    def test_init_weights(self):
        """Custom init_cfg should change both conv and LayerScale weights."""
        # test weight init cfg
        cfg = deepcopy(self.cfg)
        cfg['init_cfg'] = [
            dict(
                type='Kaiming',
                layer='Conv2d',
                mode='fan_in',
                nonlinearity='linear'),
            dict(type='Constant', layer=['LayerScale'], val=1e-4)
        ]
        model = EfficientFormer(**cfg)
        ori_weight = model.patch_embed[0].conv.weight.clone().detach()
        ori_ls_weight = model.network[0][-1].ls1.weight.clone().detach()

        model.init_weights()
        initialized_weight = model.patch_embed[0].conv.weight
        initialized_ls_weight = model.network[0][-1].ls1.weight
        self.assertFalse(torch.allclose(ori_weight, initialized_weight))
        self.assertFalse(torch.allclose(ori_ls_weight, initialized_ls_weight))

    def test_forward(self):
        """Check output shapes and per-stage norm layers for single and
        multiple out_indices, and with vit_num == 0."""
        imgs = torch.randn(1, 3, 224, 224)

        # test last stage output
        cfg = deepcopy(self.cfg)
        model = EfficientFormer(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        feat = outs[-1]
        self.assertEqual(feat.shape, (1, 448, 49))
        assert hasattr(model, 'norm3')
        assert isinstance(getattr(model, 'norm3'), nn.LayerNorm)

        # test multiple output indices
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = (0, 1, 2, 3)
        cfg['reshape_last_feat'] = True
        model = EfficientFormer(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 4)
        # Test out features shape
        for dim, stride, out in zip(self.arch['embed_dims'], [1, 2, 4, 8],
                                    outs):
            self.assertEqual(out.shape, (1, dim, 56 // stride, 56 // stride))

        # Test norm layer (GroupNorm with a single group per stage)
        for i in range(4):
            assert hasattr(model, f'norm{i}')
            stage_norm = getattr(model, f'norm{i}')
            assert isinstance(stage_norm, nn.GroupNorm)
            assert stage_norm.num_groups == 1

        # Test vit_num == 0 (no Meta3D blocks; all stages norm with GN)
        cfg = deepcopy(self.custom_cfg)
        cfg['arch']['vit_num'] = 0
        cfg['out_indices'] = (0, 1, 2, 3)
        model = EfficientFormer(**cfg)
        for i in range(4):
            assert hasattr(model, f'norm{i}')
            stage_norm = getattr(model, f'norm{i}')
            assert isinstance(stage_norm, nn.GroupNorm)
            assert stage_norm.num_groups == 1

    def test_structure(self):
        """Check drop-path decay across blocks and frozen-stage behavior."""
        # test drop_path_rate decay
        cfg = deepcopy(self.cfg)
        cfg['drop_path_rate'] = 0.2
        model = EfficientFormer(**cfg)
        layers = self.arch['layers']
        for i, block in enumerate(model.network):
            expect_prob = 0.2 / (sum(layers) - 1) * i
            if hasattr(block, 'drop_path'):
                if expect_prob == 0:
                    self.assertIsInstance(block.drop_path, torch.nn.Identity)
                else:
                    self.assertAlmostEqual(block.drop_path.drop_prob,
                                           expect_prob)

        # test with first stage frozen.
        cfg = deepcopy(self.cfg)
        frozen_stages = 1
        cfg['frozen_stages'] = frozen_stages
        cfg['out_indices'] = (0, 1, 2, 3)
        model = EfficientFormer(**cfg)
        model.init_weights()
        model.train()

        # the patch_embed and first stage should not require grad.
        self.assertFalse(model.patch_embed.training)
        for param in model.patch_embed.parameters():
            self.assertFalse(param.requires_grad)
        for i in range(frozen_stages):
            module = model.network[i]
            for param in module.parameters():
                self.assertFalse(param.requires_grad)
        for param in model.norm0.parameters():
            self.assertFalse(param.requires_grad)

        # the second stage should require grad.
        # NOTE(review): this loop starts at frozen_stages + 1 = 2, so stage
        # index 1 is never checked — possibly an off-by-one; confirm against
        # EfficientFormer._freeze_stages before changing.
        for i in range(frozen_stages + 1, 4):
            module = model.network[i]
            for param in module.parameters():
                self.assertTrue(param.requires_grad)
            if hasattr(model, f'norm{i}'):
                norm = getattr(model, f'norm{i}')
                for param in norm.parameters():
                    self.assertTrue(param.requires_grad)
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from torch.nn.modules import GroupNorm
from torch.nn.modules.batchnorm import _BatchNorm
from mmpretrain.models.backbones import EfficientNet
def is_norm(modules):
    """Return True if ``modules`` is a GroupNorm or batch-norm layer."""
    return isinstance(modules, (GroupNorm, _BatchNorm))
def check_norm_state(modules, train_state):
    """Check if norm layer is in correct train state."""
    # Vacuously True when there is no batch-norm module at all.
    return all(mod.training == train_state
               for mod in modules
               if isinstance(mod, _BatchNorm))
def test_efficientnet_backbone():
    """Smoke-test EfficientNet: argument validation, stage freezing,
    norm_eval behavior and forward output shapes for 'b0' / 'es'."""
    archs = ['b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b7', 'b8', 'es', 'em', 'el']
    with pytest.raises(TypeError):
        # pretrained must be a string path
        model = EfficientNet()
        model.init_weights(pretrained=0)
    with pytest.raises(AssertionError):
        # arch must in arc_settings
        EfficientNet(arch='others')
    # frozen_stages must less than 7, for every supported arch
    for arch in archs:
        with pytest.raises(ValueError):
            EfficientNet(arch=arch, frozen_stages=12)
    # Default construction works and can switch to train mode.
    model = EfficientNet()
    model.init_weights()
    model.train()
    # Freezing all 7 stages: BN stays in eval and grads are off.
    frozen_stages = 7
    model = EfficientNet(arch='b0', frozen_stages=frozen_stages)
    model.init_weights()
    model.train()
    for stage_idx in range(frozen_stages):
        layer = model.layers[stage_idx]
        for mod in layer.modules():
            if isinstance(mod, _BatchNorm):
                assert mod.training is False
        for param in layer.parameters():
            assert param.requires_grad is False
    # norm_eval=True keeps every BN in eval mode even after .train().
    model = EfficientNet(norm_eval=True)
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), False)
    # Forward cases: (arch, per-stage out channels, use GroupNorm?).
    # Spatial sizes of the 7 outputs are identical for both archs at 224px.
    spatial = [112, 112, 56, 28, 14, 7, 7]
    forward_cases = [
        ('b0', [32, 16, 24, 40, 112, 320, 1280], False),
        ('b0', [32, 16, 24, 40, 112, 320, 1280], True),
        ('es', [32, 24, 32, 48, 144, 192, 1280], False),
        ('es', [32, 24, 32, 48, 144, 192, 1280], True),
    ]
    for arch, out_channels, use_gn in forward_cases:
        kwargs = dict(arch=arch, out_indices=(0, 1, 2, 3, 4, 5, 6))
        if use_gn:
            kwargs['norm_cfg'] = dict(
                type='GN', num_groups=2, requires_grad=True)
        model = EfficientNet(**kwargs)
        if use_gn:
            # Every norm layer must have been replaced by GroupNorm.
            for m in model.modules():
                if is_norm(m):
                    assert isinstance(m, GroupNorm)
        model.init_weights()
        model.train()
        feat = model(torch.randn(1, 3, 224, 224))
        assert len(feat) == 7
        for out, channels, size in zip(feat, out_channels, spatial):
            assert out.shape == torch.Size([1, channels, size, size])
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from torch.nn.modules import GroupNorm
from torch.nn.modules.batchnorm import _BatchNorm
from mmpretrain.models.backbones import EfficientNetV2
def is_norm(modules):
    """Tell whether ``modules`` is a normalization layer (GroupNorm / BN)."""
    norm_types = (GroupNorm, _BatchNorm)
    return isinstance(modules, norm_types)
def check_norm_state(modules, train_state):
    """Return True iff every batch-norm module matches ``train_state``."""
    for mod in modules:
        # Only batch-norm layers are inspected; everything else is ignored.
        if not isinstance(mod, _BatchNorm):
            continue
        if mod.training != train_state:
            return False
    return True
def test_efficientnet_v2_backbone():
    """Smoke-test EfficientNetV2: argument validation, stage freezing,
    norm_eval behavior and forward output shapes for 'b0' / 'm'.

    Fixes a copy-paste defect in the 'b0' 224px forward checks: the
    feat[6] assertion was duplicated, leaving feat[7] untested.
    """
    with pytest.raises(TypeError):
        # pretrained must be a string path
        model = EfficientNetV2()
        model.init_weights(pretrained=0)
    with pytest.raises(AssertionError):
        # arch must in arc_settings
        EfficientNetV2(arch='others')
    with pytest.raises(ValueError):
        # frozen_stages must less than 8
        EfficientNetV2(arch='b1', frozen_stages=12)
    # Test EfficientNetV2 default construction and a forward pass.
    model = EfficientNetV2()
    model.init_weights()
    model.train()
    x = torch.rand((1, 3, 224, 224))
    model(x)
    # Test EfficientNetV2 with first stages frozen: BN in eval, grads off.
    frozen_stages = 7
    model = EfficientNetV2(arch='b0', frozen_stages=frozen_stages)
    model.init_weights()
    model.train()
    for i in range(frozen_stages):
        layer = model.layers[i]
        for mod in layer.modules():
            if isinstance(mod, _BatchNorm):
                assert mod.training is False
        for param in layer.parameters():
            assert param.requires_grad is False
    # Test EfficientNetV2 with norm eval: BN stays eval after .train().
    model = EfficientNetV2(norm_eval=True)
    model.init_weights()
    model.train()
    assert check_norm_state(model.modules(), False)
    # Test EfficientNetV2 forward with 'b0' arch
    out_channels = [32, 16, 32, 48, 96, 112, 192, 1280]
    model = EfficientNetV2(arch='b0', out_indices=(0, 1, 2, 3, 4, 5, 6, 7))
    model.init_weights()
    model.train()
    imgs = torch.randn(1, 3, 224, 224)
    feat = model(imgs)
    assert len(feat) == 8
    assert feat[0].shape == torch.Size([1, out_channels[0], 112, 112])
    assert feat[1].shape == torch.Size([1, out_channels[1], 112, 112])
    assert feat[2].shape == torch.Size([1, out_channels[2], 56, 56])
    assert feat[3].shape == torch.Size([1, out_channels[3], 28, 28])
    assert feat[4].shape == torch.Size([1, out_channels[4], 14, 14])
    assert feat[5].shape == torch.Size([1, out_channels[5], 14, 14])
    assert feat[6].shape == torch.Size([1, out_channels[6], 7, 7])
    # Fixed: previously duplicated the feat[6] check; feat[7] is the final
    # 1280-channel stage output and must be verified too.
    assert feat[7].shape == torch.Size([1, out_channels[7], 7, 7])
    # Test EfficientNetV2 forward with 'b0' arch and GroupNorm
    out_channels = [32, 16, 32, 48, 96, 112, 192, 1280]
    model = EfficientNetV2(
        arch='b0',
        out_indices=(0, 1, 2, 3, 4, 5, 6, 7),
        norm_cfg=dict(type='GN', num_groups=2, requires_grad=True))
    for m in model.modules():
        if is_norm(m):
            assert isinstance(m, GroupNorm)
    model.init_weights()
    model.train()
    imgs = torch.randn(1, 3, 64, 64)
    feat = model(imgs)
    assert len(feat) == 8
    assert feat[0].shape == torch.Size([1, out_channels[0], 32, 32])
    assert feat[1].shape == torch.Size([1, out_channels[1], 32, 32])
    assert feat[2].shape == torch.Size([1, out_channels[2], 16, 16])
    assert feat[3].shape == torch.Size([1, out_channels[3], 8, 8])
    assert feat[4].shape == torch.Size([1, out_channels[4], 4, 4])
    assert feat[5].shape == torch.Size([1, out_channels[5], 4, 4])
    assert feat[6].shape == torch.Size([1, out_channels[6], 2, 2])
    assert feat[7].shape == torch.Size([1, out_channels[7], 2, 2])
    # Test EfficientNetV2 forward with 'm' arch
    out_channels = [24, 24, 48, 80, 160, 176, 304, 512, 1280]
    model = EfficientNetV2(arch='m', out_indices=(0, 1, 2, 3, 4, 5, 6, 7, 8))
    model.init_weights()
    model.train()
    imgs = torch.randn(1, 3, 64, 64)
    feat = model(imgs)
    assert len(feat) == 9
    assert feat[0].shape == torch.Size([1, out_channels[0], 32, 32])
    assert feat[1].shape == torch.Size([1, out_channels[1], 32, 32])
    assert feat[2].shape == torch.Size([1, out_channels[2], 16, 16])
    assert feat[3].shape == torch.Size([1, out_channels[3], 8, 8])
    assert feat[4].shape == torch.Size([1, out_channels[4], 4, 4])
    assert feat[5].shape == torch.Size([1, out_channels[5], 4, 4])
    assert feat[6].shape == torch.Size([1, out_channels[6], 2, 2])
    assert feat[7].shape == torch.Size([1, out_channels[7], 2, 2])
    assert feat[8].shape == torch.Size([1, out_channels[8], 2, 2])
    # Test EfficientNetV2 forward with 'm' arch and GroupNorm
    out_channels = [24, 24, 48, 80, 160, 176, 304, 512, 1280]
    model = EfficientNetV2(
        arch='m',
        out_indices=(0, 1, 2, 3, 4, 5, 6, 7, 8),
        norm_cfg=dict(type='GN', num_groups=2, requires_grad=True))
    for m in model.modules():
        if is_norm(m):
            assert isinstance(m, GroupNorm)
    model.init_weights()
    model.train()
    imgs = torch.randn(1, 3, 64, 64)
    feat = model(imgs)
    assert len(feat) == 9
    assert feat[0].shape == torch.Size([1, out_channels[0], 32, 32])
    assert feat[1].shape == torch.Size([1, out_channels[1], 32, 32])
    assert feat[2].shape == torch.Size([1, out_channels[2], 16, 16])
    assert feat[3].shape == torch.Size([1, out_channels[3], 8, 8])
    assert feat[4].shape == torch.Size([1, out_channels[4], 4, 4])
    assert feat[5].shape == torch.Size([1, out_channels[5], 4, 4])
    assert feat[6].shape == torch.Size([1, out_channels[6], 2, 2])
    assert feat[7].shape == torch.Size([1, out_channels[7], 2, 2])
    assert feat[8].shape == torch.Size([1, out_channels[8], 2, 2])
# Copyright (c) OpenMMLab. All rights reserved.
from copy import deepcopy
from unittest import TestCase
import torch
from mmpretrain.models.backbones import ViTEVA02
class TestEVA02(TestCase):
    """Tests for the ViTEVA02 backbone: arch validation, structure, forward."""
    def setUp(self):
        # Base config: tiny arch, 336px input, 14px patches -> 24x24 grid.
        self.cfg = dict(
            arch='t',
            img_size=336,
            patch_size=14,
            drop_path_rate=0.1,
            drop_rate=0.1,
            attn_drop_rate=0.2,
            proj_drop_rate=0.3,
        )
    def test_structure(self):
        """Check arch parsing, out_indices validation and layer layout."""
        # Test invalid default arch
        with self.assertRaisesRegex(AssertionError, 'not in default archs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = 'unknown'
            ViTEVA02(**cfg)
        # Test invalid custom arch (no 'embed_dims' key supplied)
        with self.assertRaisesRegex(AssertionError, 'Custom arch needs'):
            cfg = deepcopy(self.cfg)
            cfg['arch'] = {
                'num_layers': 24,
                'num_heads': 16,
                'feedforward_channels': int(24 * 4 * 2 / 3)
            }
            ViTEVA02(**cfg)
        # Test custom arch
        cfg = deepcopy(self.cfg)
        cfg['arch'] = {
            'embed_dims': 128,
            'num_layers': 6,
            'num_heads': 16,
            'feedforward_channels': int(128 * 4 * 2 / 3)
        }
        model = ViTEVA02(**cfg)
        self.assertEqual(model.embed_dims, 128)
        self.assertEqual(model.num_layers, 6)
        for layer in model.layers:
            self.assertEqual(layer.attn.num_heads, 16)
        # Test out_indices: must be a sequence of valid layer indices
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = {1: 1}
        with self.assertRaisesRegex(AssertionError, "get <class 'dict'>"):
            ViTEVA02(**cfg)
        cfg['out_indices'] = [0, 13]
        with self.assertRaisesRegex(AssertionError, 'Invalid out_indices 13'):
            ViTEVA02(**cfg)
        # Test model structure
        cfg = deepcopy(self.cfg)
        model = ViTEVA02(**cfg)
        self.assertEqual(len(model.layers), 12)
        self.assertEqual(model.cls_token.shape, (1, 1, 192))
        # 577 = 24 * 24 patch tokens + 1 cls token
        self.assertEqual(model.pos_embed.shape, (1, 577, 192))
        # drop_path prob should increase linearly from 0 to 0.1 over layers
        dpr_inc = 0.1 / (12 - 1)
        dpr = 0
        for layer in model.layers:
            self.assertEqual(layer.attn.embed_dims, 192)
            self.assertEqual(layer.attn.num_heads, 3)
            self.assertAlmostEqual(layer.drop_path.drop_prob, dpr)
            self.assertAlmostEqual(layer.mlp.dropout_layer.p, 0.1)
            self.assertAlmostEqual(layer.attn.attn_drop.p, 0.2)
            self.assertAlmostEqual(layer.attn.proj_drop.p, 0.3)
            dpr += dpr_inc
        # Test model structure: final_norm replaces the Identity placeholder
        cfg = deepcopy(self.cfg)
        cfg['final_norm'] = True
        model = ViTEVA02(**cfg)
        self.assertNotEqual(model.norm1.__class__, torch.nn.Identity)
    def test_forward(self):
        """Check output shapes for each out_type and multi-index outputs."""
        imgs = torch.randn(1, 3, 336, 336)
        # test with_cls_token=False: requesting cls_token output must fail
        cfg = deepcopy(self.cfg)
        cfg['with_cls_token'] = False
        cfg['out_type'] = 'cls_token'
        with self.assertRaisesRegex(ValueError, 'must be True'):
            ViTEVA02(**cfg)
        cfg = deepcopy(self.cfg)
        cfg['with_cls_token'] = False
        cfg['out_type'] = 'raw'
        model = ViTEVA02(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        patch_token = outs[-1]
        # 'raw': one 192-dim token per 14x14 patch of the 336px image
        self.assertEqual(patch_token.shape, (1, 24 * 24, 192))
        cfg = deepcopy(self.cfg)
        cfg['with_cls_token'] = False
        cfg['out_type'] = 'featmap'
        model = ViTEVA02(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        patch_token = outs[-1]
        # 'featmap': channels-first 2D map of the patch tokens
        self.assertEqual(patch_token.shape, (1, 192, 24, 24))
        cfg = deepcopy(self.cfg)
        cfg['with_cls_token'] = False
        cfg['out_type'] = 'avg_featmap'
        model = ViTEVA02(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        patch_token = outs[-1]
        # 'avg_featmap': a single 192-dim vector per image
        self.assertEqual(patch_token.shape, (1, 192))
        # test with output cls_token (the default out_type)
        cfg = deepcopy(self.cfg)
        model = ViTEVA02(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 1)
        cls_token = outs[-1]
        self.assertEqual(cls_token.shape, (1, 192))
        # Test forward with multi out indices (negative indices allowed)
        cfg = deepcopy(self.cfg)
        cfg['out_indices'] = [-3, -2, -1]
        model = ViTEVA02(**cfg)
        outs = model(imgs)
        self.assertIsInstance(outs, tuple)
        self.assertEqual(len(outs), 3)
        for out in outs:
            self.assertEqual(out.shape, (1, 192))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment