Commit cbc25585 authored by limm

add mmpretrain/ part

parent 1baf0566
# Copyright (c) OpenMMLab. All rights reserved.
import math
import os.path as osp
from typing import Optional, Sequence
from mmengine.fileio import join_path
from mmengine.hooks import Hook
from mmengine.runner import EpochBasedTrainLoop, Runner
from mmengine.visualization import Visualizer
from mmpretrain.registry import HOOKS
from mmpretrain.structures import DataSample
@HOOKS.register_module()
class VisualizationHook(Hook):
"""Classification Visualization Hook. Used to visualize validation and
testing prediction results.
    - If ``out_dir`` is specified, all storage backends are ignored
      and the images are saved to ``out_dir``.
    - If ``show`` is True, the result image is plotted in a window; please
      confirm you are able to access the graphical interface.
Args:
enable (bool): Whether to enable this hook. Defaults to False.
interval (int): The interval of samples to visualize. Defaults to 5000.
show (bool): Whether to display the drawn image. Defaults to False.
out_dir (str, optional): directory where painted images will be saved
in the testing process. If None, handle with the backends of the
visualizer. Defaults to None.
**kwargs: other keyword arguments of
:meth:`mmpretrain.visualization.UniversalVisualizer.visualize_cls`.
"""
def __init__(self,
enable=False,
interval: int = 5000,
show: bool = False,
out_dir: Optional[str] = None,
**kwargs):
self._visualizer: Visualizer = Visualizer.get_current_instance()
self.enable = enable
self.interval = interval
self.show = show
self.out_dir = out_dir
self.draw_args = {**kwargs, 'show': show}
def _draw_samples(self,
batch_idx: int,
data_batch: dict,
data_samples: Sequence[DataSample],
step: int = 0) -> None:
"""Visualize every ``self.interval`` samples from a data batch.
Args:
batch_idx (int): The index of the current batch in the val loop.
data_batch (dict): Data from dataloader.
            data_samples (Sequence[:obj:`DataSample`]): Outputs from model.
step (int): Global step value to record. Defaults to 0.
"""
if self.enable is False:
return
batch_size = len(data_samples)
images = data_batch['inputs']
start_idx = batch_size * batch_idx
end_idx = start_idx + batch_size
# The first index divisible by the interval, after the start index
first_sample_id = math.ceil(start_idx / self.interval) * self.interval
for sample_id in range(first_sample_id, end_idx, self.interval):
image = images[sample_id - start_idx]
image = image.permute(1, 2, 0).cpu().numpy().astype('uint8')
data_sample = data_samples[sample_id - start_idx]
if 'img_path' in data_sample:
# osp.basename works on different platforms even file clients.
sample_name = osp.basename(data_sample.get('img_path'))
else:
sample_name = str(sample_id)
            draw_args = self.draw_args.copy()
if self.out_dir is not None:
draw_args['out_file'] = join_path(self.out_dir,
f'{sample_name}_{step}.png')
self._visualizer.visualize_cls(
image=image,
data_sample=data_sample,
step=step,
name=sample_name,
                **draw_args,
)
def after_val_iter(self, runner: Runner, batch_idx: int, data_batch: dict,
outputs: Sequence[DataSample]) -> None:
"""Visualize every ``self.interval`` samples during validation.
Args:
runner (:obj:`Runner`): The runner of the validation process.
batch_idx (int): The index of the current batch in the val loop.
data_batch (dict): Data from dataloader.
outputs (Sequence[:obj:`DataSample`]): Outputs from model.
"""
if isinstance(runner.train_loop, EpochBasedTrainLoop):
step = runner.epoch
else:
step = runner.iter
self._draw_samples(batch_idx, data_batch, outputs, step=step)
def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict,
outputs: Sequence[DataSample]) -> None:
"""Visualize every ``self.interval`` samples during test.
Args:
runner (:obj:`Runner`): The runner of the testing process.
batch_idx (int): The index of the current batch in the test loop.
data_batch (dict): Data from dataloader.
            outputs (Sequence[:obj:`DataSample`]): Outputs from model.
"""
self._draw_samples(batch_idx, data_batch, outputs, step=0)
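# --- Config sketch (illustrative, not part of the original file) -------------
# The hook is registered in HOOKS and is disabled by default. In mmengine-style
# configs it is usually switched on through the visualization entry of
# ``default_hooks``; the values below are placeholders.
default_hooks = dict(
    visualization=dict(
        type='VisualizationHook',
        enable=True,
        # visualize every 1000th val/test sample
        interval=1000,
        # hypothetical directory; omit it to fall back to the visualizer's
        # own storage backends
        out_dir='vis_results'))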
# Copyright (c) OpenMMLab. All rights reserved.
import operator as op
from typing import Any, Optional, Union
from mmengine.hooks import Hook
from mmpretrain.registry import HOOKS
from mmpretrain.utils import get_ori_model
@HOOKS.register_module()
class WarmupParamHook(Hook):
"""This is a hook used for changing the parameters other than optimizations
that need to warmup inside the module.
This hook can extend with more detailed warmup rule if necessary.
Args:
param_name (str): The parameter name that needs to be altered.
module_name (str): Module name that belongs to the model. Such as
`head`, `head.loss`, etc.
warmup_epochs (int): The warmup epochs for this parameter.
"""
def __init__(
self,
param_name: str,
module_name: str,
warmup_epochs: int,
) -> None:
self.param_name = param_name
self.warmup_epochs = warmup_epochs
# getter for module which saves the changed parameter
self.module_getter = op.attrgetter(module_name)
def get_param(self, runner) -> Any:
"""Get the parameter."""
try:
module = self.module_getter(get_ori_model(runner.model))
return getattr(module, self.param_name)
except AttributeError as e:
raise AttributeError(f'{e}. Please check hook settings.')
def set_param(self, runner, value) -> None:
"""Set the parameter."""
try:
module = self.module_getter(get_ori_model(runner.model))
setattr(module, self.param_name, value)
except AttributeError as e:
raise AttributeError(f'{e}. Please check hook settings.')
def before_train(self, runner) -> None:
"""Get the original value before train."""
self.ori_val = self.get_param(runner)
def before_train_iter(
self,
runner,
batch_idx: int,
data_batch: Optional[Union[dict, tuple, list]] = None) -> None:
"""Set the warmup value before each train iter."""
cur_iter = runner.iter
iters_per_epoch = runner.max_iters / runner.max_epochs
new_val = self.ori_val * min(
1, cur_iter / (self.warmup_epochs * iters_per_epoch))
self.set_param(runner, new_val)
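if __name__ == '__main__':
    # Worked example (illustrative, not part of the original file) of the
    # warmup rule applied in ``before_train_iter`` above: the tracked value
    # grows linearly from 0 back to its original value over ``warmup_epochs``
    # epochs. All numbers below are placeholders.
    ori_val, warmup_epochs, iters_per_epoch = 0.996, 5, 100
    for cur_iter in (0, 250, 500, 1000):
        new_val = ori_val * min(
            1, cur_iter / (warmup_epochs * iters_per_epoch))
        print(cur_iter, new_val)  # 0 -> 0.0, 250 -> 0.498, 500+ -> 0.996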
# Copyright (c) OpenMMLab. All rights reserved.
from .adan_t import Adan
from .lamb import Lamb
from .lars import LARS
from .layer_decay_optim_wrapper_constructor import \
LearningRateDecayOptimWrapperConstructor
__all__ = ['Lamb', 'Adan', 'LARS', 'LearningRateDecayOptimWrapperConstructor']
# Copyright 2022 Garena Online Private Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List
import torch
from torch import Tensor
from torch.optim.optimizer import Optimizer
from mmpretrain.registry import OPTIMIZERS
@OPTIMIZERS.register_module()
class Adan(Optimizer):
"""Implements a pytorch variant of Adan.
Adan was proposed in
    Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models. # noqa
https://arxiv.org/abs/2208.06677
Arguments:
params (iterable): iterable of parameters to optimize
or dicts defining parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float, float], optional): coefficients used
for computing running averages of gradient.
(default: (0.98, 0.92, 0.99))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): decoupled weight decay
(L2 penalty) (default: 0)
max_grad_norm (float, optional): value used to clip
global grad norm (default: 0.0 no clip)
no_prox (bool): how to perform the decoupled weight decay
(default: False)
        foreach (bool): if True, use the torch._foreach implementation.
            It's faster but uses slightly more memory.
"""
def __init__(self,
params,
lr=1e-3,
betas=(0.98, 0.92, 0.99),
eps=1e-8,
weight_decay=0.0,
max_grad_norm=0.0,
no_prox=False,
foreach: bool = True):
if not 0.0 <= max_grad_norm:
raise ValueError('Invalid Max grad norm: {}'.format(max_grad_norm))
if not 0.0 <= lr:
raise ValueError('Invalid learning rate: {}'.format(lr))
if not 0.0 <= eps:
raise ValueError('Invalid epsilon value: {}'.format(eps))
if not 0.0 <= betas[0] < 1.0:
raise ValueError('Invalid beta parameter at index 0: {}'.format(
betas[0]))
if not 0.0 <= betas[1] < 1.0:
raise ValueError('Invalid beta parameter at index 1: {}'.format(
betas[1]))
if not 0.0 <= betas[2] < 1.0:
raise ValueError('Invalid beta parameter at index 2: {}'.format(
betas[2]))
defaults = dict(
lr=lr,
betas=betas,
eps=eps,
weight_decay=weight_decay,
max_grad_norm=max_grad_norm,
no_prox=no_prox,
foreach=foreach)
super().__init__(params, defaults)
def __setstate__(self, state):
super(Adan, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('no_prox', False)
@torch.no_grad()
def restart_opt(self):
for group in self.param_groups:
group['step'] = 0
for p in group['params']:
if p.requires_grad:
state = self.state[p]
# State initialization
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p)
# Exponential moving average of gradient difference
state['exp_avg_diff'] = torch.zeros_like(p)
@torch.no_grad()
def step(self):
"""Performs a single optimization step."""
if self.defaults['max_grad_norm'] > 0:
device = self.param_groups[0]['params'][0].device
global_grad_norm = torch.zeros(1, device=device)
max_grad_norm = torch.tensor(
self.defaults['max_grad_norm'], device=device)
for group in self.param_groups:
for p in group['params']:
if p.grad is not None:
grad = p.grad
global_grad_norm.add_(grad.pow(2).sum())
global_grad_norm = torch.sqrt(global_grad_norm) + group['eps']
clip_global_grad_norm = \
torch.clamp(max_grad_norm / global_grad_norm, max=1.0)
else:
clip_global_grad_norm = 1.0
for group in self.param_groups:
params_with_grad = []
grads = []
exp_avgs = []
exp_avg_sqs = []
exp_avg_diffs = []
pre_grads = []
beta1, beta2, beta3 = group['betas']
            # Assume the same step across the group for now to simplify things.
            # A per-parameter step could easily be supported by making it a
            # tensor or by passing a list into the kernel.
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
bias_correction1 = 1.0 - beta1**group['step']
bias_correction2 = 1.0 - beta2**group['step']
bias_correction3 = 1.0 - beta3**group['step']
for p in group['params']:
if p.grad is None:
continue
params_with_grad.append(p)
grads.append(p.grad)
state = self.state[p]
if len(state) == 0:
state['exp_avg'] = torch.zeros_like(p)
state['exp_avg_sq'] = torch.zeros_like(p)
state['exp_avg_diff'] = torch.zeros_like(p)
if 'pre_grad' not in state or group['step'] == 1:
# at first step grad wouldn't be clipped
# by `clip_global_grad_norm`
# this is only to simplify implementation
state['pre_grad'] = p.grad
exp_avgs.append(state['exp_avg'])
exp_avg_sqs.append(state['exp_avg_sq'])
exp_avg_diffs.append(state['exp_avg_diff'])
pre_grads.append(state['pre_grad'])
kwargs = dict(
params=params_with_grad,
grads=grads,
exp_avgs=exp_avgs,
exp_avg_sqs=exp_avg_sqs,
exp_avg_diffs=exp_avg_diffs,
pre_grads=pre_grads,
beta1=beta1,
beta2=beta2,
beta3=beta3,
bias_correction1=bias_correction1,
bias_correction2=bias_correction2,
bias_correction3_sqrt=math.sqrt(bias_correction3),
lr=group['lr'],
weight_decay=group['weight_decay'],
eps=group['eps'],
no_prox=group['no_prox'],
clip_global_grad_norm=clip_global_grad_norm,
)
if group['foreach']:
copy_grads = _multi_tensor_adan(**kwargs)
else:
copy_grads = _single_tensor_adan(**kwargs)
for p, copy_grad in zip(params_with_grad, copy_grads):
self.state[p]['pre_grad'] = copy_grad
def _single_tensor_adan(
params: List[Tensor],
grads: List[Tensor],
exp_avgs: List[Tensor],
exp_avg_sqs: List[Tensor],
exp_avg_diffs: List[Tensor],
pre_grads: List[Tensor],
*,
beta1: float,
beta2: float,
beta3: float,
bias_correction1: float,
bias_correction2: float,
bias_correction3_sqrt: float,
lr: float,
weight_decay: float,
eps: float,
no_prox: bool,
clip_global_grad_norm: Tensor,
):
copy_grads = []
for i, param in enumerate(params):
grad = grads[i]
exp_avg = exp_avgs[i]
exp_avg_sq = exp_avg_sqs[i]
exp_avg_diff = exp_avg_diffs[i]
pre_grad = pre_grads[i]
grad = grad.mul_(clip_global_grad_norm)
copy_grads.append(grad.clone())
diff = grad - pre_grad
update = grad + beta2 * diff
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t
exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t
exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t
denom = (exp_avg_sq.sqrt() / bias_correction3_sqrt).add_(eps)
update = exp_avg / bias_correction1
update.add_(beta2 * exp_avg_diff / bias_correction2).div_(denom)
if no_prox:
param.mul_(1 - lr * weight_decay)
param.add_(update, alpha=-lr)
else:
param.add_(update, alpha=-lr)
param.div_(1 + lr * weight_decay)
return copy_grads
def _multi_tensor_adan(
params: List[Tensor],
grads: List[Tensor],
exp_avgs: List[Tensor],
exp_avg_sqs: List[Tensor],
exp_avg_diffs: List[Tensor],
pre_grads: List[Tensor],
*,
beta1: float,
beta2: float,
beta3: float,
bias_correction1: float,
bias_correction2: float,
bias_correction3_sqrt: float,
lr: float,
weight_decay: float,
eps: float,
no_prox: bool,
clip_global_grad_norm: Tensor,
):
if clip_global_grad_norm < 1.0:
torch._foreach_mul_(grads, clip_global_grad_norm.item())
copy_grads = [g.clone() for g in grads]
diff = torch._foreach_sub(grads, pre_grads)
# NOTE: line below while looking identical gives different result,
# due to float precision errors.
# using mul+add produces identical results to single-tensor,
# using add+alpha doesn't
# update = torch._foreach_add(grads, torch._foreach_mul(diff, beta2))
update = torch._foreach_add(grads, diff, alpha=beta2)
torch._foreach_mul_(exp_avgs, beta1)
torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1) # m_t
torch._foreach_mul_(exp_avg_diffs, beta2)
torch._foreach_add_(exp_avg_diffs, diff, alpha=1 - beta2) # diff_t
torch._foreach_mul_(exp_avg_sqs, beta3)
torch._foreach_addcmul_(
exp_avg_sqs, update, update, value=1 - beta3) # n_t
denom = torch._foreach_sqrt(exp_avg_sqs)
torch._foreach_div_(denom, bias_correction3_sqrt)
torch._foreach_add_(denom, eps)
update = torch._foreach_div(exp_avgs, bias_correction1)
# NOTE: same issue as above.
# beta2 * diff / bias_correction2 != diff * (beta2 / bias_correction2) # noqa
# using faster version by default. uncomment for tests to pass
# torch._foreach_add_(update, torch._foreach_div(torch._foreach_mul(exp_avg_diffs, beta2), bias_correction2)) # noqa
torch._foreach_add_(
update, torch._foreach_mul(exp_avg_diffs, beta2 / bias_correction2))
torch._foreach_div_(update, denom)
    if no_prox:
        torch._foreach_mul_(params, 1 - lr * weight_decay)
        torch._foreach_add_(params, update, alpha=-lr)
    else:
        torch._foreach_add_(params, update, alpha=-lr)
        torch._foreach_div_(params, 1 + lr * weight_decay)
return copy_grads
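if __name__ == '__main__':
    # Minimal smoke test (illustrative, not part of the original file); the
    # toy model and hyper-parameters are placeholders.
    import torch.nn as nn
    model = nn.Linear(16, 4)
    optimizer = Adan(model.parameters(), lr=1e-3, weight_decay=0.02)
    loss = model(torch.randn(8, 16)).sum()
    loss.backward()
    optimizer.step()
    # Config-style use goes through the OPTIMIZERS registry, e.g.
    #   optim_wrapper = dict(optimizer=dict(type='Adan', lr=1e-3))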
"""PyTorch Lamb optimizer w/ behaviour similar to NVIDIA FusedLamb.
This optimizer code was adapted from the following (starting with latest)
* https://github.com/HabanaAI/Model-References/blob/
2b435114fe8e31f159b1d3063b8280ae37af7423/PyTorch/nlp/bert/pretraining/lamb.py
* https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/
LanguageModeling/Transformer-XL/pytorch/lamb.py
* https://github.com/cybertronai/pytorch-lamb
Use FusedLamb if you can (GPU). The reason for including this variant of Lamb
is to have a version that is
similar in behaviour to APEX FusedLamb if you aren't using NVIDIA GPUs or
cannot install/use APEX.
In addition to some cleanup, this Lamb impl has been modified to support
PyTorch XLA and has been tested on TPU.
Original copyrights for above sources are below.
Modifications Copyright 2021 Ross Wightman
"""
# Copyright (c) 2021, Habana Labs Ltd. All rights reserved.
# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# MIT License
#
# Copyright (c) 2019 cybertronai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import math
import torch
from torch.optim import Optimizer
from mmpretrain.registry import OPTIMIZERS
@OPTIMIZERS.register_module()
class Lamb(Optimizer):
"""A pure pytorch variant of FuseLAMB (NvLamb variant) optimizer.
This class is copied from `timm`_. The LAMB was proposed in `Large Batch
Optimization for Deep Learning - Training BERT in 76 minutes`_.
.. _timm:
https://github.com/rwightman/pytorch-image-models/blob/master/timm/optim/lamb.py
.. _Large Batch Optimization for Deep Learning - Training BERT in 76 minutes:
https://arxiv.org/abs/1904.00962
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        bias_correction (bool, optional): whether to apply bias correction to
            the running averages. (default: True)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its norm. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-6)
        weight_decay (float, optional): weight decay (L2 penalty)
            (default: 0.01)
        grad_averaging (bool, optional): whether to apply (1 - beta1) to grad
            when calculating running averages of gradient. (default: True)
        max_grad_norm (float, optional): value used to clip global grad norm
            (default: 1.0)
        trust_clip (bool): enable LAMBC trust ratio clipping (default: False)
        always_adapt (bool, optional): Apply adaptive learning rate to 0.0
            weight decay parameter (default: False)
""" # noqa: E501
def __init__(self,
params,
lr=1e-3,
bias_correction=True,
betas=(0.9, 0.999),
eps=1e-6,
weight_decay=0.01,
grad_averaging=True,
max_grad_norm=1.0,
trust_clip=False,
always_adapt=False):
defaults = dict(
lr=lr,
bias_correction=bias_correction,
betas=betas,
eps=eps,
weight_decay=weight_decay,
grad_averaging=grad_averaging,
max_grad_norm=max_grad_norm,
trust_clip=trust_clip,
always_adapt=always_adapt)
super().__init__(params, defaults)
@torch.no_grad()
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()
device = self.param_groups[0]['params'][0].device
one_tensor = torch.tensor(
1.0, device=device
) # because torch.where doesn't handle scalars correctly
global_grad_norm = torch.zeros(1, device=device)
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad
if grad.is_sparse:
raise RuntimeError(
'Lamb does not support sparse gradients, consider '
'SparseAdam instead.')
global_grad_norm.add_(grad.pow(2).sum())
global_grad_norm = torch.sqrt(global_grad_norm)
# FIXME it'd be nice to remove explicit tensor conversion of scalars
# when torch.where promotes
# scalar types properly https://github.com/pytorch/pytorch/issues/9190
max_grad_norm = torch.tensor(
self.defaults['max_grad_norm'], device=device)
clip_global_grad_norm = torch.where(global_grad_norm > max_grad_norm,
global_grad_norm / max_grad_norm,
one_tensor)
for group in self.param_groups:
bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
grad_averaging = 1 if group['grad_averaging'] else 0
beta3 = 1 - beta1 if grad_averaging else 1.0
            # Assume the same step across the group for now to simplify things.
            # A per-parameter step could easily be supported by making it a
            # tensor or by passing a list into the kernel.
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
if bias_correction:
bias_correction1 = 1 - beta1**group['step']
bias_correction2 = 1 - beta2**group['step']
else:
bias_correction1, bias_correction2 = 1.0, 1.0
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.div_(clip_global_grad_norm)
state = self.state[p]
# State initialization
if len(state) == 0:
                    # Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
# Decay the first and second moment running average coefficient
exp_avg.mul_(beta1).add_(grad, alpha=beta3) # m_t
exp_avg_sq.mul_(beta2).addcmul_(
grad, grad, value=1 - beta2) # v_t
denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(
group['eps'])
update = (exp_avg / bias_correction1).div_(denom)
weight_decay = group['weight_decay']
if weight_decay != 0:
update.add_(p, alpha=weight_decay)
if weight_decay != 0 or group['always_adapt']:
# Layer-wise LR adaptation. By default, skip adaptation on
# parameters that are
# excluded from weight decay, unless always_adapt == True,
# then always enabled.
w_norm = p.norm(2.0)
g_norm = update.norm(2.0)
# FIXME nested where required since logical and/or not
# working in PT XLA
trust_ratio = torch.where(
w_norm > 0,
torch.where(g_norm > 0, w_norm / g_norm, one_tensor),
one_tensor,
)
if group['trust_clip']:
# LAMBC trust clipping, upper bound fixed at one
trust_ratio = torch.minimum(trust_ratio, one_tensor)
update.mul_(trust_ratio)
p.add_(update, alpha=-group['lr'])
return loss
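if __name__ == '__main__':
    # Minimal smoke test (illustrative, not part of the original file); the
    # toy model and hyper-parameters are placeholders.
    import torch.nn as nn
    model = nn.Linear(16, 4)
    optimizer = Lamb(model.parameters(), lr=1e-3, weight_decay=0.01)
    loss = model(torch.randn(8, 16)).sum()
    loss.backward()
    optimizer.step()
    # Config-style use goes through the OPTIMIZERS registry, e.g.
    #   optim_wrapper = dict(optimizer=dict(type='Lamb', lr=1e-3))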
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Iterable
import torch
from torch.optim.optimizer import Optimizer
from mmpretrain.registry import OPTIMIZERS
@OPTIMIZERS.register_module()
class LARS(Optimizer):
"""Implements layer-wise adaptive rate scaling for SGD.
Based on Algorithm 1 of the following paper by You, Gitman, and Ginsburg.
`Large Batch Training of Convolutional Networks:
<https://arxiv.org/abs/1708.03888>`_.
Args:
params (Iterable): Iterable of parameters to optimize or dicts defining
parameter groups.
lr (float): Base learning rate.
momentum (float): Momentum factor. Defaults to 0.
weight_decay (float): Weight decay (L2 penalty). Defaults to 0.
dampening (float): Dampening for momentum. Defaults to 0.
eta (float): LARS coefficient. Defaults to 0.001.
nesterov (bool): Enables Nesterov momentum. Defaults to False.
        eps (float): A small number to avoid dividing by zero.
            Defaults to 1e-8.
Example:
>>> optimizer = LARS(model.parameters(), lr=0.1, momentum=0.9,
>>> weight_decay=1e-4, eta=1e-3)
>>> optimizer.zero_grad()
>>> loss_fn(model(input), target).backward()
>>> optimizer.step()
"""
def __init__(self,
params: Iterable,
lr: float,
momentum: float = 0,
weight_decay: float = 0,
dampening: float = 0,
eta: float = 0.001,
nesterov: bool = False,
eps: float = 1e-8) -> None:
        if not isinstance(lr, float) or lr < 0.0:
raise ValueError(f'Invalid learning rate: {lr}')
if momentum < 0.0:
raise ValueError(f'Invalid momentum value: {momentum}')
if weight_decay < 0.0:
raise ValueError(f'Invalid weight_decay value: {weight_decay}')
if eta < 0.0:
raise ValueError(f'Invalid LARS coefficient value: {eta}')
defaults = dict(
lr=lr,
momentum=momentum,
dampening=dampening,
weight_decay=weight_decay,
nesterov=nesterov,
eta=eta)
if nesterov and (momentum <= 0 or dampening != 0):
raise ValueError(
'Nesterov momentum requires a momentum and zero dampening')
self.eps = eps
super().__init__(params, defaults)
def __setstate__(self, state) -> None:
super().__setstate__(state)
for group in self.param_groups:
group.setdefault('nesterov', False)
@torch.no_grad()
def step(self, closure=None) -> torch.Tensor:
"""Performs a single optimization step.
Args:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()
for group in self.param_groups:
weight_decay = group['weight_decay']
momentum = group['momentum']
dampening = group['dampening']
eta = group['eta']
nesterov = group['nesterov']
lr = group['lr']
lars_exclude = group.get('lars_exclude', False)
for p in group['params']:
if p.grad is None:
continue
d_p = p.grad
if lars_exclude:
local_lr = 1.
else:
weight_norm = torch.norm(p).item()
grad_norm = torch.norm(d_p).item()
if weight_norm != 0 and grad_norm != 0:
# Compute local learning rate for this layer
local_lr = eta * weight_norm / \
(grad_norm + weight_decay * weight_norm + self.eps)
else:
local_lr = 1.
actual_lr = local_lr * lr
d_p = d_p.add(p, alpha=weight_decay).mul(actual_lr)
if momentum != 0:
param_state = self.state[p]
if 'momentum_buffer' not in param_state:
buf = param_state['momentum_buffer'] = \
torch.clone(d_p).detach()
else:
buf = param_state['momentum_buffer']
buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
if nesterov:
d_p = d_p.add(buf, alpha=momentum)
else:
d_p = buf
p.add_(-d_p)
return loss
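if __name__ == '__main__':
    # Minimal sketch (illustrative, not part of the original file) of the
    # ``lars_exclude`` param-group flag read in ``step`` above: parameters in
    # a group with ``lars_exclude=True`` skip the layer-wise learning-rate
    # adaptation. The toy model and values are placeholders.
    import torch.nn as nn
    model = nn.Sequential(nn.Linear(8, 8), nn.BatchNorm1d(8))
    param_groups = [
        dict(params=model[0].parameters()),
        dict(params=model[1].parameters(), lars_exclude=True),  # norm layer
    ]
    optimizer = LARS(param_groups, lr=0.1, momentum=0.9, weight_decay=1e-4)
    loss = model(torch.randn(4, 8)).sum()
    loss.backward()
    optimizer.step()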
# Copyright (c) OpenMMLab. All rights reserved.
from collections import defaultdict
from typing import Callable, List, Optional
from mmengine.logging import MMLogger
from mmengine.optim import DefaultOptimWrapperConstructor
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm
from torch import nn
from torch.nn import GroupNorm, LayerNorm
from mmpretrain.registry import OPTIM_WRAPPER_CONSTRUCTORS
@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
class LearningRateDecayOptimWrapperConstructor(DefaultOptimWrapperConstructor):
"""Different learning rates are set for different layers of backbone.
    By default, each parameter shares the same optimizer settings, and we
provide an argument ``paramwise_cfg`` to specify parameter-wise settings.
It is a dict and may contain the following fields:
    - ``layer_decay_rate`` (float): The learning rate of a parameter will
      be multiplied by this rate repeatedly according to the layer depth of
      the parameter. Usually it's less than 1, so that the earlier layers
      will have a lower learning rate. Defaults to 1.
- ``bias_decay_mult`` (float): It will be multiplied to the weight
decay for all bias parameters (except for those in normalization layers).
- ``norm_decay_mult`` (float): It will be multiplied to the weight
decay for all weight and bias parameters of normalization layers.
- ``flat_decay_mult`` (float): It will be multiplied to the weight
decay for all one-dimensional parameters
- ``custom_keys`` (dict): Specified parameters-wise settings by keys. If
one of the keys in ``custom_keys`` is a substring of the name of one
parameter, then the setting of the parameter will be specified by
``custom_keys[key]`` and other setting like ``bias_decay_mult`` will be
ignored. It should be a dict and may contain fields ``decay_mult``.
(The ``lr_mult`` is disabled in this constructor).
Example:
In the config file, you can use this constructor as below:
.. code:: python
optim_wrapper = dict(
optimizer=dict(
type='AdamW',
lr=4e-3,
weight_decay=0.05,
eps=1e-8,
betas=(0.9, 0.999)),
constructor='LearningRateDecayOptimWrapperConstructor',
paramwise_cfg=dict(
layer_decay_rate=0.75, # layer-wise lr decay factor
norm_decay_mult=0.,
flat_decay_mult=0.,
custom_keys={
'.cls_token': dict(decay_mult=0.0),
'.pos_embed': dict(decay_mult=0.0)
}))
"""
def add_params(self,
params: List[dict],
module: nn.Module,
prefix: str = '',
get_layer_depth: Optional[Callable] = None,
**kwargs) -> None:
"""Add all parameters of module to the params list.
The parameters of the given module will be added to the list of param
groups, with specific rules defined by paramwise_cfg.
Args:
params (List[dict]): A list of param groups, it will be modified
in place.
module (nn.Module): The module to be added.
            prefix (str): The prefix of the module.
            get_layer_depth (Callable, optional): A function that maps a
                parameter name to ``(layer_id, max_layer_id)``. Defaults to
                the ``get_layer_depth`` method of the outermost module.
"""
# get param-wise options
custom_keys = self.paramwise_cfg.get('custom_keys', {})
# first sort with alphabet order and then sort with reversed len of str
sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True)
logger = MMLogger.get_current_instance()
# The model should have `get_layer_depth` method
if get_layer_depth is None and not hasattr(module, 'get_layer_depth'):
raise NotImplementedError('The layer-wise learning rate decay need'
f' the model {type(module)} has'
' `get_layer_depth` method.')
else:
get_layer_depth = get_layer_depth or module.get_layer_depth
bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', None)
norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', None)
flat_decay_mult = self.paramwise_cfg.get('flat_decay_mult', None)
decay_rate = self.paramwise_cfg.get('layer_decay_rate', 1.0)
# special rules for norm layers and depth-wise conv layers
is_norm = isinstance(module,
(_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm))
for name, param in module.named_parameters(recurse=False):
param_group = {'params': [param]}
param_name = prefix + name
if not param.requires_grad:
continue
if self.base_wd is not None:
base_wd = self.base_wd
custom_key = next(
filter(lambda k: k in param_name, sorted_keys), None)
# custom parameters decay
if custom_key is not None:
custom_cfg = custom_keys[custom_key].copy()
decay_mult = custom_cfg.pop('decay_mult', 1.)
param_group['weight_decay'] = base_wd * decay_mult
# add custom settings to param_group
param_group.update(custom_cfg)
# norm decay
elif is_norm and norm_decay_mult is not None:
param_group['weight_decay'] = base_wd * norm_decay_mult
# bias decay
elif name == 'bias' and bias_decay_mult is not None:
param_group['weight_decay'] = base_wd * bias_decay_mult
# flatten parameters decay
elif param.ndim == 1 and flat_decay_mult is not None:
param_group['weight_decay'] = base_wd * flat_decay_mult
else:
param_group['weight_decay'] = base_wd
layer_id, max_id = get_layer_depth(param_name)
scale = decay_rate**(max_id - layer_id - 1)
param_group['lr'] = self.base_lr * scale
param_group['lr_scale'] = scale
param_group['layer_id'] = layer_id
param_group['param_name'] = param_name
params.append(param_group)
for child_name, child_mod in module.named_children():
child_prefix = f'{prefix}{child_name}.'
self.add_params(
params,
child_mod,
prefix=child_prefix,
get_layer_depth=get_layer_depth,
)
if prefix == '':
layer_params = defaultdict(list)
for param in params:
layer_params[param['layer_id']].append(param)
for layer_id, layer_params in layer_params.items():
lr_scale = layer_params[0]['lr_scale']
lr = layer_params[0]['lr']
msg = [
f'layer {layer_id} params '
f'(lr={lr:.3g}, lr_scale={lr_scale:.3g}):'
]
for param in layer_params:
msg.append(f'\t{param["param_name"]}: '
f'weight_decay={param["weight_decay"]:.3g}')
logger.debug('\n'.join(msg))
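if __name__ == '__main__':
    # Minimal sketch (illustrative, not part of the original file) of the
    # ``get_layer_depth`` interface that ``add_params`` relies on above: the
    # model (or a callable passed in) maps a parameter name to
    # ``(layer_id, max_layer_id)``, and the learning rate of that parameter
    # is scaled by ``layer_decay_rate ** (max_layer_id - layer_id - 1)``.
    # The layering rule below is hypothetical.
    class ToyModel(nn.Module):

        def __init__(self):
            super().__init__()
            self.patch_embed = nn.Linear(8, 8)
            self.blocks = nn.ModuleList(nn.Linear(8, 8) for _ in range(2))
            self.head = nn.Linear(8, 2)

        def get_layer_depth(self, param_name):
            num_ids = len(self.blocks) + 2
            if param_name.startswith('patch_embed'):
                return 0, num_ids
            if param_name.startswith('blocks'):
                return int(param_name.split('.')[1]) + 1, num_ids
            return num_ids - 1, num_ids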
# Copyright (c) OpenMMLab. All rights reserved.
from .retrieval_loop import RetrievalTestLoop, RetrievalValLoop
__all__ = ['RetrievalTestLoop', 'RetrievalValLoop']
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmengine.model import is_model_wrapper
from mmengine.runner import TestLoop, ValLoop, autocast
from mmpretrain.registry import LOOPS
@LOOPS.register_module()
class RetrievalValLoop(ValLoop):
"""Loop for multimodal retrieval val.
Args:
runner (Runner): A reference of runner.
dataloader (Dataloader or dict): A dataloader object or a dict to
build a dataloader.
evaluator (Evaluator or dict or list): Used for computing metrics.
        fp16 (bool): Whether to enable fp16 validation. Defaults to
            False.
"""
def run(self) -> dict:
"""Launch val."""
self.runner.call_hook('before_val')
self.runner.call_hook('before_val_epoch')
self.runner.model.eval()
feats_local = []
data_samples_local = []
for idx, data_batch in enumerate(self.dataloader):
with torch.no_grad():
self.runner.call_hook(
'before_val_iter', batch_idx=idx, data_batch=data_batch)
# predictions should be sequence of BaseDataElement
with autocast(enabled=self.fp16):
if is_model_wrapper(self.runner.model):
data_preprocessor = self.runner.model.module.data_preprocessor # noqa: E501
else:
data_preprocessor = self.runner.model.data_preprocessor
# get features for retrieval instead of data samples
data_batch = data_preprocessor(data_batch, False)
feats = self.runner.model._run_forward(
data_batch, mode='tensor')
feats_local.append(feats)
data_samples_local.extend(data_batch['data_samples'])
self.runner.call_hook(
'after_val_iter',
batch_idx=idx,
data_batch=data_batch,
outputs=feats)
# concatenate different features
feats_local = {
k: torch.cat([dic[k] for dic in feats_local])
for k in feats_local[0]
}
# get predictions
if is_model_wrapper(self.runner.model):
predict_all_fn = self.runner.model.module.predict_all
else:
predict_all_fn = self.runner.model.predict_all
img_size = self.dataloader.dataset.img_size
text_size = self.dataloader.dataset.text_size
with torch.no_grad():
i2t_data_samples, t2i_data_samples = predict_all_fn(
feats_local,
data_samples_local,
num_images=img_size,
num_texts=text_size,
)
# process in evaluator and compute metrics
self.evaluator.process(i2t_data_samples, None)
i2t_metrics = self.evaluator.evaluate(img_size)
i2t_metrics = {f'i2t/{k}': v for k, v in i2t_metrics.items()}
self.evaluator.process(t2i_data_samples, None)
t2i_metrics = self.evaluator.evaluate(text_size)
t2i_metrics = {f't2i/{k}': v for k, v in t2i_metrics.items()}
metrics = {**i2t_metrics, **t2i_metrics}
self.runner.call_hook('after_val_epoch', metrics=metrics)
self.runner.call_hook('after_val')
return metrics
@LOOPS.register_module()
class RetrievalTestLoop(TestLoop):
"""Loop for multimodal retrieval test.
Args:
runner (Runner): A reference of runner.
dataloader (Dataloader or dict): A dataloader object or a dict to
build a dataloader.
evaluator (Evaluator or dict or list): Used for computing metrics.
fp16 (bool): Whether to enable fp16 testing. Defaults to
False.
"""
def run(self) -> dict:
"""Launch test."""
self.runner.call_hook('before_test')
self.runner.call_hook('before_test_epoch')
self.runner.model.eval()
feats_local = []
data_samples_local = []
for idx, data_batch in enumerate(self.dataloader):
with torch.no_grad():
self.runner.call_hook(
'before_test_iter', batch_idx=idx, data_batch=data_batch)
# predictions should be sequence of BaseDataElement
with autocast(enabled=self.fp16):
if is_model_wrapper(self.runner.model):
data_preprocessor = self.runner.model.module.data_preprocessor # noqa: E501
else:
data_preprocessor = self.runner.model.data_preprocessor
# get features for retrieval instead of data samples
data_batch = data_preprocessor(data_batch, False)
feats = self.runner.model._run_forward(
data_batch, mode='tensor')
feats_local.append(feats)
data_samples_local.extend(data_batch['data_samples'])
self.runner.call_hook(
'after_test_iter',
batch_idx=idx,
data_batch=data_batch,
outputs=feats)
# concatenate different features
feats_local = {
k: torch.cat([dic[k] for dic in feats_local])
for k in feats_local[0]
}
# get predictions
if is_model_wrapper(self.runner.model):
predict_all_fn = self.runner.model.module.predict_all
else:
predict_all_fn = self.runner.model.predict_all
img_size = self.dataloader.dataset.img_size
text_size = self.dataloader.dataset.text_size
with torch.no_grad():
i2t_data_samples, t2i_data_samples = predict_all_fn(
feats_local,
data_samples_local,
num_images=img_size,
num_texts=text_size,
)
# process in evaluator and compute metrics
self.evaluator.process(i2t_data_samples, None)
i2t_metrics = self.evaluator.evaluate(img_size)
i2t_metrics = {f'i2t/{k}': v for k, v in i2t_metrics.items()}
self.evaluator.process(t2i_data_samples, None)
t2i_metrics = self.evaluator.evaluate(text_size)
t2i_metrics = {f't2i/{k}': v for k, v in t2i_metrics.items()}
metrics = {**i2t_metrics, **t2i_metrics}
self.runner.call_hook('after_test_epoch', metrics=metrics)
self.runner.call_hook('after_test')
return metrics
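# --- Config sketch (illustrative, not part of the original file) -------------
# Both loops are registered in LOOPS, so a retrieval config can point the
# standard mmengine ``val_cfg``/``test_cfg`` fields at them. The dataset behind
# the dataloader is expected to expose ``img_size`` and ``text_size``, and the
# model must implement ``predict_all``, as used in the loops above.
val_cfg = dict(type='RetrievalValLoop')
test_cfg = dict(type='RetrievalTestLoop', fp16=True)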
# Copyright (c) OpenMMLab. All rights reserved.
from .weight_decay_scheduler import CosineAnnealingWeightDecay
__all__ = ['CosineAnnealingWeightDecay']
# Copyright (c) OpenMMLab. All rights reserved.
import math
from mmengine.optim.scheduler import CosineAnnealingParamScheduler
from mmpretrain.registry import PARAM_SCHEDULERS
class WeightDecaySchedulerMixin:
"""A mixin class for learning rate schedulers."""
def __init__(self, optimizer, *args, **kwargs):
super().__init__(optimizer, 'weight_decay', *args, **kwargs)
@PARAM_SCHEDULERS.register_module()
class CosineAnnealingWeightDecay(WeightDecaySchedulerMixin,
CosineAnnealingParamScheduler):
"""Set the weight decay value of each parameter group using a cosine
annealing schedule.
If the weight decay was set to be 0 initially, the weight decay value will
be 0 constantly during the training.
"""
def _get_value(self) -> list:
"""Compute value using chainable form of the scheduler."""
def _get_eta_min(base_value):
if self.eta_min_ratio is None:
return self.eta_min
return base_value * self.eta_min_ratio
if self.last_step == 0:
return [
group[self.param_name] for group in self.optimizer.param_groups
]
elif (self.last_step - 1 - self.T_max) % (2 * self.T_max) == 0:
weight_decay_value_list = []
for base_value, group in zip(self.base_values,
self.optimizer.param_groups):
if base_value == 0:
group_value = 0
else:
group_value = group[self.param_name] + (
base_value - _get_eta_min(base_value)) * (
1 - math.cos(math.pi / self.T_max)) / 2
weight_decay_value_list.append(group_value)
return weight_decay_value_list
weight_decay_value_list = []
for base_value, group in zip(self.base_values,
self.optimizer.param_groups):
if base_value == 0:
group_value = 0
else:
group_value = (
1 + math.cos(math.pi * self.last_step / self.T_max)) / (
1 + math.cos(math.pi *
(self.last_step - 1) / self.T_max)
) * (group[self.param_name] -
_get_eta_min(base_value)) + _get_eta_min(base_value)
weight_decay_value_list.append(group_value)
return weight_decay_value_list
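# --- Config sketch (illustrative, not part of the original file) -------------
# The scheduler is registered in PARAM_SCHEDULERS, so it can be listed next to
# the learning-rate schedulers in ``param_scheduler``. The argument names come
# from CosineAnnealingParamScheduler; all values below are placeholders.
param_scheduler = [
    dict(
        type='CosineAnnealingWeightDecay',
        eta_min=0.1,  # final weight-decay value at the end of the schedule
        T_max=300,
        by_epoch=True,
        begin=0,
        end=300),
]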
# Copyright (c) OpenMMLab. All rights reserved.
from .functional import * # noqa: F401,F403
from .metrics import * # noqa: F401,F403
# Copyright (c) OpenMMLab. All rights reserved.
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional
from mmengine.evaluator import BaseMetric
from mmpretrain.registry import METRICS
@METRICS.register_module()
class ANLS(BaseMetric):
"""ANLS metric.
    Compute the Average Normalized Levenshtein Similarity (ANLS).
Args:
threshold (float): ANLS threshold used for determining if the answer
has been correctly selected but not properly recognized,
or on the contrary, the output is a wrong text selected from the
options and given as an answer.
collect_device (str): Device name used for collecting results from
different ranks during distributed training. Must be 'cpu' or
'gpu'. Defaults to 'cpu'.
prefix (str, optional): The prefix that will be added in the metric
names to disambiguate homonymous metrics of different evaluators.
If prefix is not provided in the argument, self.default_prefix
            will be used instead. Defaults to None.
"""
default_prefix = 'ANLS'
def __init__(self,
threshold: float = 0.5,
collect_device: str = 'cpu',
prefix: Optional[str] = None) -> None:
super().__init__(collect_device=collect_device, prefix=prefix)
self.threshold = threshold
def process(self, data_batch, data_samples) -> None:
"""Process one batch of data samples.
The processed results should be stored in ``self.results``, which will
        be used to compute the metrics when all batches have been processed.
Args:
data_batch: A batch of data from the dataloader.
data_samples (Sequence[dict]): A batch of outputs from the model.
"""
for sample in data_samples:
gt_answer = sample.get('gt_answer')
result = {
'pred_answer': sample.get('pred_answer'),
'gt_answer': gt_answer
}
self.results.append(result)
def compute_metrics(self, results: List) -> dict:
"""Compute the metrics from processed results.
Args:
            results (list): The processed results of each batch.
Returns:
Dict: The computed metrics. The keys are the names of the metrics,
and the values are corresponding results.
"""
total_score = 0.
for result in results:
sample_score_list = []
pred = ' '.join(result['pred_answer'].strip().lower().split())
for gt in result['gt_answer']:
gt = ' '.join(gt.strip().lower().split())
dist = levenshtein_distance(gt, pred)
length = max(
len(gt.upper()), len(result['pred_answer'].upper()))
sample_score_list.append(0.0 if length == 0 else float(dist) /
float(length))
per_sample_score = 1. - min(sample_score_list)
if per_sample_score < self.threshold:
per_sample_score = 0.
total_score += per_sample_score
total_score = total_score / len(results)
return {'ANLS': total_score}
def levenshtein_distance(s1, s2):
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = range(len(s1) + 1)
for i2, c2 in enumerate(s2):
distances_ = [i2 + 1]
for i1, c1 in enumerate(s1):
if c1 == c2:
distances_.append(distances[i1])
else:
distances_.append(1 + min((distances[i1], distances[i1 + 1],
distances_[-1])))
distances = distances_
return distances[-1]
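if __name__ == '__main__':
    # Minimal smoke test (illustrative, not part of the original file).
    # ``levenshtein_distance`` counts single-character edits:
    assert levenshtein_distance('hello', 'helo') == 1
    # The metric can also be exercised directly with already-parsed samples;
    # the field names match what ``process`` reads above and the answers are
    # made up.
    metric = ANLS(threshold=0.5)
    metric.process(None, [
        dict(pred_answer='blue', gt_answer=['blue']),  # exact match -> 1.0
        dict(pred_answer='blu', gt_answer=['blue']),   # one edit -> 0.75
        dict(pred_answer='red', gt_answer=['blue']),   # below threshold -> 0.0
    ])
    print(metric.compute_metrics(metric.results))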
# Copyright (c) OpenMMLab. All rights reserved.
from .ANLS import ANLS
from .caption import COCOCaption
from .gqa import GQAAcc
from .multi_label import AveragePrecision, MultiLabelMetric
from .multi_task import MultiTasksMetric
from .nocaps import NocapsSave
from .retrieval import RetrievalAveragePrecision, RetrievalRecall
from .scienceqa import ScienceQAMetric
from .shape_bias_label import ShapeBiasMetric
from .single_label import Accuracy, ConfusionMatrix, SingleLabelMetric
from .visual_grounding_eval import VisualGroundingMetric
from .voc_multi_label import VOCAveragePrecision, VOCMultiLabelMetric
from .vqa import ReportVQA, VQAAcc
__all__ = [
'Accuracy', 'SingleLabelMetric', 'MultiLabelMetric', 'AveragePrecision',
'MultiTasksMetric', 'VOCAveragePrecision', 'VOCMultiLabelMetric',
'ConfusionMatrix', 'RetrievalRecall', 'VQAAcc', 'ReportVQA', 'COCOCaption',
'VisualGroundingMetric', 'ScienceQAMetric', 'GQAAcc', 'NocapsSave',
'RetrievalAveragePrecision', 'ShapeBiasMetric', 'ANLS'
]
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os
import tempfile
from typing import List, Optional
from mmengine.evaluator import BaseMetric
from mmengine.utils import track_iter_progress
from mmpretrain.registry import METRICS
from mmpretrain.utils import require
try:
from pycocoevalcap.eval import COCOEvalCap
from pycocotools.coco import COCO
except ImportError:
COCOEvalCap = None
COCO = None
@METRICS.register_module()
class COCOCaption(BaseMetric):
"""Coco Caption evaluation wrapper.
Save the generated captions and transform into coco format.
Calling COCO API for caption metrics.
Args:
ann_file (str): the path for the COCO format caption ground truth
json file, load for evaluations.
collect_device (str): Device name used for collecting results from
different ranks during distributed training. Must be 'cpu' or
'gpu'. Defaults to 'cpu'.
prefix (str, optional): The prefix that will be added in the metric
names to disambiguate homonymous metrics of different evaluators.
If prefix is not provided in the argument, self.default_prefix
            will be used instead. Defaults to None.
"""
@require('pycocoevalcap')
def __init__(self,
ann_file: str,
collect_device: str = 'cpu',
prefix: Optional[str] = None):
super().__init__(collect_device=collect_device, prefix=prefix)
self.ann_file = ann_file
def process(self, data_batch, data_samples):
"""Process one batch of data samples.
The processed results should be stored in ``self.results``, which will
        be used to compute the metrics when all batches have been processed.
Args:
data_batch: A batch of data from the dataloader.
data_samples (Sequence[dict]): A batch of outputs from the model.
"""
for data_sample in data_samples:
result = dict()
result['caption'] = data_sample.get('pred_caption')
result['image_id'] = int(data_sample.get('image_id'))
# Save the result to `self.results`.
self.results.append(result)
def compute_metrics(self, results: List):
"""Compute the metrics from processed results.
Args:
            results (list): The processed results of each batch.
Returns:
Dict: The computed metrics. The keys are the names of the metrics,
and the values are corresponding results.
"""
# NOTICE: don't access `self.results` from the method.
with tempfile.TemporaryDirectory() as temp_dir:
eval_result_file = save_result(
result=results,
result_dir=temp_dir,
filename='m4-caption_pred',
remove_duplicate='image_id',
)
coco_val = coco_caption_eval(eval_result_file, self.ann_file)
return coco_val
def save_result(result, result_dir, filename, remove_duplicate=''):
"""Saving predictions as json file for evaluation."""
# combine results from all processes
result_new = []
if remove_duplicate:
result_new = []
id_list = []
for res in track_iter_progress(result):
if res[remove_duplicate] not in id_list:
id_list.append(res[remove_duplicate])
result_new.append(res)
result = result_new
final_result_file_url = os.path.join(result_dir, '%s.json' % filename)
print(f'result file saved to {final_result_file_url}')
json.dump(result, open(final_result_file_url, 'w'))
return final_result_file_url
def coco_caption_eval(results_file, ann_file):
"""Evaluation between gt json and prediction json files."""
# create coco object and coco_result object
coco = COCO(ann_file)
coco_result = coco.loadRes(results_file)
# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result)
# make sure the image ids are the same
coco_eval.params['image_id'] = coco_result.getImgIds()
    # This will take some time at the first run
coco_eval.evaluate()
# print output evaluation scores
for metric, score in coco_eval.eval.items():
print(f'{metric}: {score:.3f}')
return coco_eval.eval
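if __name__ == '__main__':
    # Minimal sketch (illustrative, not part of the original file) of the
    # duplicate-removal behaviour of ``save_result``; the captions are made up.
    demo = [
        dict(image_id=1, caption='a cat on a mat'),
        dict(image_id=1, caption='a cat sitting on a mat'),  # same id, dropped
        dict(image_id=2, caption='a dog in the park'),
    ]
    with tempfile.TemporaryDirectory() as tmp_dir:
        out_file = save_result(demo, tmp_dir, 'demo_pred',
                               remove_duplicate='image_id')
        print(json.load(open(out_file)))  # only two unique image ids remain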
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional
from mmengine.evaluator import BaseMetric
from mmpretrain.evaluation.metrics.vqa import (_process_digit_article,
_process_punctuation)
from mmpretrain.registry import METRICS
@METRICS.register_module()
class GQAAcc(BaseMetric):
"""GQA Acc metric.
Compute GQA accuracy.
Args:
collect_device (str): Device name used for collecting results from
different ranks during distributed training. Must be 'cpu' or
'gpu'. Defaults to 'cpu'.
prefix (str, optional): The prefix that will be added in the metric
names to disambiguate homonymous metrics of different evaluators.
If prefix is not provided in the argument, self.default_prefix
            will be used instead. Defaults to None.
"""
default_prefix = 'GQA'
def __init__(self,
collect_device: str = 'cpu',
prefix: Optional[str] = None) -> None:
super().__init__(collect_device=collect_device, prefix=prefix)
def process(self, data_batch, data_samples) -> None:
"""Process one batch of data samples.
The processed results should be stored in ``self.results``, which will
        be used to compute the metrics when all batches have been processed.
Args:
data_batch: A batch of data from the dataloader.
data_samples (Sequence[dict]): A batch of outputs from the model.
"""
for sample in data_samples:
gt_answer = sample.get('gt_answer')
result = {
'pred_answer': sample.get('pred_answer'),
'gt_answer': gt_answer
}
self.results.append(result)
def compute_metrics(self, results: List) -> dict:
"""Compute the metrics from processed results.
Args:
            results (list): The processed results of each batch.
Returns:
Dict: The computed metrics. The keys are the names of the metrics,
and the values are corresponding results.
"""
acc = []
for result in results:
pred_answer = self._process_answer(result['pred_answer'])
gt_answer = self._process_answer(result['gt_answer'])
gqa_acc = 1 if pred_answer == gt_answer else 0
acc.append(gqa_acc)
accuracy = sum(acc) / len(acc)
metrics = {'acc': accuracy}
return metrics
def _process_answer(self, answer) -> str:
answer = _process_punctuation(answer)
answer = _process_digit_article(answer)
return answer
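if __name__ == '__main__':
    # Minimal smoke test (illustrative, not part of the original file); the
    # answers are made up. ``_process_answer`` normalises punctuation, articles
    # and number words through the VQA helpers imported above, so e.g. 'two'
    # and '2' should compare equal.
    metric = GQAAcc()
    metric.process(None, [
        dict(pred_answer='two', gt_answer='2'),
        dict(pred_answer='on the table', gt_answer='on table'),
        dict(pred_answer='cat', gt_answer='dog'),
    ])
    print(metric.compute_metrics(metric.results))  # {'acc': ...}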
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Sequence, Union
import numpy as np
import torch
from mmengine.evaluator import BaseMetric
from mmengine.logging import MMLogger
from mmpretrain.registry import METRICS
from mmpretrain.structures import label_to_onehot
from .single_label import _precision_recall_f1_support, to_tensor
@METRICS.register_module()
class MultiLabelMetric(BaseMetric):
r"""A collection of precision, recall, f1-score and support for
multi-label tasks.
    The collection of metrics is for multi-label classification.
And all these metrics are based on the confusion matrix of every category:
.. image:: ../../_static/image/confusion-matrix.png
:width: 60%
:align: center
All metrics can be formulated use variables above:
**Precision** is the fraction of correct predictions in all predictions:
.. math::
\text{Precision} = \frac{TP}{TP+FP}
**Recall** is the fraction of correct predictions in all targets:
.. math::
\text{Recall} = \frac{TP}{TP+FN}
**F1-score** is the harmonic mean of the precision and recall:
.. math::
\text{F1-score} = \frac{2\times\text{Recall}\times\text{Precision}}{\text{Recall}+\text{Precision}}
**Support** is the number of samples:
.. math::
\text{Support} = TP + TN + FN + FP
Args:
thr (float, optional): Predictions with scores under the threshold
are considered as negative. If None, the ``topk`` predictions will
be considered as positive. If the ``topk`` is also None, use
``thr=0.5`` as default. Defaults to None.
topk (int, optional): Predictions with the k-th highest scores are
considered as positive. If None, use ``thr`` to determine positive
predictions. If both ``thr`` and ``topk`` are not None, use
``thr``. Defaults to None.
items (Sequence[str]): The detailed metric items to evaluate, select
from "precision", "recall", "f1-score" and "support".
Defaults to ``('precision', 'recall', 'f1-score')``.
average (str | None): How to calculate the final metrics from the
confusion matrix of every category. It supports three modes:
- `"macro"`: Calculate metrics for each category, and calculate
the mean value over all categories.
- `"micro"`: Average the confusion matrix over all categories and
calculate metrics on the mean confusion matrix.
- `None`: Calculate metrics of every category and output directly.
Defaults to "macro".
collect_device (str): Device name used for collecting results from
different ranks during distributed training. Must be 'cpu' or
'gpu'. Defaults to 'cpu'.
prefix (str, optional): The prefix that will be added in the metric
names to disambiguate homonymous metrics of different evaluators.
If prefix is not provided in the argument, self.default_prefix
will be used instead. Defaults to None.
Examples:
>>> import torch
>>> from mmpretrain.evaluation import MultiLabelMetric
>>> # ------ The Basic Usage for category indices labels -------
>>> y_pred = [[0], [1], [0, 1], [3]]
>>> y_true = [[0, 3], [0, 2], [1], [3]]
>>> # Output precision, recall, f1-score and support
>>> MultiLabelMetric.calculate(
... y_pred, y_true, pred_indices=True, target_indices=True, num_classes=4)
(tensor(50.), tensor(50.), tensor(45.8333), tensor(6))
>>> # ----------- The Basic Usage for one-hot labels -----------
>>> y_pred = torch.tensor([[1, 1, 0, 0],
... [1, 1, 0, 0],
... [0, 0, 1, 0],
... [0, 1, 0, 0],
... [0, 1, 0, 0]])
>>> y_true = torch.Tensor([[1, 1, 0, 0],
... [0, 0, 1, 0],
... [1, 1, 1, 0],
... [1, 0, 0, 0],
... [1, 0, 0, 0]])
>>> MultiLabelMetric.calculate(y_pred, y_true)
(tensor(43.7500), tensor(31.2500), tensor(33.3333), tensor(8))
>>> # --------- The Basic Usage for one-hot pred scores ---------
>>> y_pred = torch.rand(y_true.size())
>>> y_pred
tensor([[0.4575, 0.7335, 0.3934, 0.2572],
[0.1318, 0.1004, 0.8248, 0.6448],
[0.8349, 0.6294, 0.7896, 0.2061],
[0.4037, 0.7308, 0.6713, 0.8374],
[0.3779, 0.4836, 0.0313, 0.0067]])
>>> # Calculate with different threshold.
>>> MultiLabelMetric.calculate(y_pred, y_true, thr=0.1)
(tensor(42.5000), tensor(75.), tensor(53.1746), tensor(8))
>>> # Calculate with topk.
>>> MultiLabelMetric.calculate(y_pred, y_true, topk=1)
(tensor(62.5000), tensor(31.2500), tensor(39.1667), tensor(8))
>>>
        >>> # ------------------- Use with Evaluator -------------------
>>> from mmpretrain.structures import DataSample
>>> from mmengine.evaluator import Evaluator
        >>> data_samples = [
... DataSample().set_pred_score(pred).set_gt_score(gt)
... for pred, gt in zip(torch.rand(1000, 5), torch.randint(0, 2, (1000, 5)))]
>>> evaluator = Evaluator(metrics=MultiLabelMetric(thr=0.5))
        >>> evaluator.process(data_samples)
>>> evaluator.evaluate(1000)
{
'multi-label/precision': 50.72898037055408,
'multi-label/recall': 50.06836461357571,
'multi-label/f1-score': 50.384466955258475
}
>>> # Evaluate on each class by using topk strategy
>>> evaluator = Evaluator(metrics=MultiLabelMetric(topk=1, average=None))
        >>> evaluator.process(data_samples)
>>> evaluator.evaluate(1000)
{
'multi-label/precision_top1_classwise': [48.22, 50.54, 50.99, 44.18, 52.5],
'multi-label/recall_top1_classwise': [18.92, 19.22, 19.92, 20.0, 20.27],
'multi-label/f1-score_top1_classwise': [27.18, 27.85, 28.65, 27.54, 29.25]
}
""" # noqa: E501
default_prefix: Optional[str] = 'multi-label'
def __init__(self,
thr: Optional[float] = None,
topk: Optional[int] = None,
items: Sequence[str] = ('precision', 'recall', 'f1-score'),
average: Optional[str] = 'macro',
collect_device: str = 'cpu',
prefix: Optional[str] = None) -> None:
logger = MMLogger.get_current_instance()
if thr is None and topk is None:
thr = 0.5
logger.warning('Neither thr nor k is given, set thr as 0.5 by '
'default.')
elif thr is not None and topk is not None:
logger.warning('Both thr and topk are given, '
'use threshold in favor of top-k.')
self.thr = thr
self.topk = topk
self.average = average
for item in items:
assert item in ['precision', 'recall', 'f1-score', 'support'], \
                f'The metric {item} is not supported by `MultiLabelMetric`,' \
' please choose from "precision", "recall", "f1-score" and ' \
'"support".'
self.items = tuple(items)
super().__init__(collect_device=collect_device, prefix=prefix)
def process(self, data_batch, data_samples: Sequence[dict]):
"""Process one batch of data samples.
The processed results should be stored in ``self.results``, which will
        be used to compute the metrics when all batches have been processed.
Args:
data_batch: A batch of data from the dataloader.
data_samples (Sequence[dict]): A batch of outputs from the model.
"""
for data_sample in data_samples:
result = dict()
result['pred_score'] = data_sample['pred_score'].clone()
num_classes = result['pred_score'].size()[-1]
if 'gt_score' in data_sample:
result['gt_score'] = data_sample['gt_score'].clone()
else:
result['gt_score'] = label_to_onehot(data_sample['gt_label'],
num_classes)
# Save the result to `self.results`.
self.results.append(result)
def compute_metrics(self, results: List):
"""Compute the metrics from processed results.
Args:
results (list): The processed results of each batch.
Returns:
Dict: The computed metrics. The keys are the names of the metrics,
and the values are corresponding results.
"""
# NOTICE: don't access `self.results` from the method. `self.results`
# are a list of results from multiple batch, while the input `results`
# are the collected results.
metrics = {}
target = torch.stack([res['gt_score'] for res in results])
pred = torch.stack([res['pred_score'] for res in results])
metric_res = self.calculate(
pred,
target,
pred_indices=False,
target_indices=False,
average=self.average,
thr=self.thr,
topk=self.topk)
def pack_results(precision, recall, f1_score, support):
single_metrics = {}
if 'precision' in self.items:
single_metrics['precision'] = precision
if 'recall' in self.items:
single_metrics['recall'] = recall
if 'f1-score' in self.items:
single_metrics['f1-score'] = f1_score
if 'support' in self.items:
single_metrics['support'] = support
return single_metrics
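        # The metric keys carry a suffix describing the decision rule:
        # no suffix for the default thr=0.5, `_thr-{thr:.2f}` for other
        # thresholds, and `_top{k}` when top-k is used.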
if self.thr:
suffix = '' if self.thr == 0.5 else f'_thr-{self.thr:.2f}'
for k, v in pack_results(*metric_res).items():
metrics[k + suffix] = v
else:
for k, v in pack_results(*metric_res).items():
metrics[k + f'_top{self.topk}'] = v
result_metrics = dict()
for k, v in metrics.items():
if self.average is None:
result_metrics[k + '_classwise'] = v.detach().cpu().tolist()
elif self.average == 'macro':
result_metrics[k] = v.item()
else:
result_metrics[k + f'_{self.average}'] = v.item()
return result_metrics
@staticmethod
def calculate(
pred: Union[torch.Tensor, np.ndarray, Sequence],
target: Union[torch.Tensor, np.ndarray, Sequence],
pred_indices: bool = False,
target_indices: bool = False,
average: Optional[str] = 'macro',
thr: Optional[float] = None,
topk: Optional[int] = None,
num_classes: Optional[int] = None
) -> Union[torch.Tensor, List[torch.Tensor]]:
"""Calculate the precision, recall, f1-score.
Args:
pred (torch.Tensor | np.ndarray | Sequence): The prediction
results. A :obj:`torch.Tensor` or :obj:`np.ndarray` with
shape ``(N, num_classes)`` or a sequence of index/onehot
format labels.
            target (torch.Tensor | np.ndarray | Sequence): The target of
                predictions. A :obj:`torch.Tensor` or :obj:`np.ndarray` with
shape ``(N, num_classes)`` or a sequence of index/onehot
format labels.
pred_indices (bool): Whether the ``pred`` is a sequence of
category index labels. If True, ``num_classes`` must be set.
Defaults to False.
target_indices (bool): Whether the ``target`` is a sequence of
category index labels. If True, ``num_classes`` must be set.
Defaults to False.
average (str | None): How to calculate the final metrics from
the confusion matrix of every category. It supports three
modes:
- `"macro"`: Calculate metrics for each category, and calculate
the mean value over all categories.
- `"micro"`: Average the confusion matrix over all categories
and calculate metrics on the mean confusion matrix.
- `None`: Calculate metrics of every category and output
directly.
Defaults to "macro".
            thr (float, optional): Predictions with scores under the threshold
                are considered as negative. Defaults to None.
            topk (int, optional): Predictions with the top-k highest scores
                are considered as positive. Defaults to None.
num_classes (Optional, int): The number of classes. If the ``pred``
is indices instead of onehot, this argument is required.
Defaults to None.
Returns:
            Tuple: The tuple contains precision, recall, f1-score and support.
And the type of each item is:
- torch.Tensor: A tensor for each metric. The shape is (1, ) if
``average`` is not None, and (C, ) if ``average`` is None.
Notes:
            If both ``thr`` and ``topk`` are set, use ``thr`` to determine
positive predictions. If neither is set, use ``thr=0.5`` as
default.
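        Examples:
            A minimal usage sketch, assuming the returned tuple unpacks to
            ``(precision, recall, f1-score, support)``; printed outputs are
            omitted here:
            >>> import torch
            >>> from mmpretrain.evaluation import MultiLabelMetric
            >>> pred = torch.tensor([[0.9, 0.1, 0.8],
            ...                      [0.2, 0.7, 0.6]])
            >>> target = torch.tensor([[1, 0, 1],
            ...                        [0, 1, 0]])
            >>> # threshold-based positives
            >>> p, r, f1, s = MultiLabelMetric.calculate(pred, target, thr=0.5)
            >>> # top-1 based positives
            >>> p, r, f1, s = MultiLabelMetric.calculate(pred, target, topk=1)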
"""
average_options = ['micro', 'macro', None]
assert average in average_options, 'Invalid `average` argument, ' \
            f'please specify from {average_options}.'
def _format_label(label, is_indices):
"""format various label to torch.Tensor."""
if isinstance(label, np.ndarray):
                assert label.ndim == 2, 'The shape of `pred` and `target` ' \
                    'arrays must be (N, num_classes).'
label = torch.from_numpy(label)
elif isinstance(label, torch.Tensor):
                assert label.ndim == 2, 'The shape of `pred` and `target` ' \
                    'tensors must be (N, num_classes).'
elif isinstance(label, Sequence):
if is_indices:
assert num_classes is not None, 'For index-type labels, ' \
'please specify `num_classes`.'
label = torch.stack([
label_to_onehot(indices, num_classes)
for indices in label
])
else:
label = torch.stack(
[to_tensor(onehot) for onehot in label])
else:
raise TypeError(
                    'The `pred` and `target` must be a torch.Tensor, '
                    f'np.ndarray or Sequence, but got {type(label)}.')
return label
pred = _format_label(pred, pred_indices)
target = _format_label(target, target_indices).long()
assert pred.shape == target.shape, \
f"The size of pred ({pred.shape}) doesn't match "\
f'the target ({target.shape}).'
if num_classes is not None:
assert pred.size(1) == num_classes, \
f'The shape of `pred` ({pred.shape}) '\
f"doesn't match the num_classes ({num_classes})."
num_classes = pred.size(1)
thr = 0.5 if (thr is None and topk is None) else thr
if thr is not None:
# a label is predicted positive if larger than thr
pos_inds = (pred >= thr).long()
else:
# top-k labels will be predicted positive for any example
_, topk_indices = pred.topk(topk)
pos_inds = torch.zeros_like(pred).scatter_(1, topk_indices, 1)
pos_inds = pos_inds.long()
return _precision_recall_f1_support(pos_inds, target, average)
def _average_precision(pred: torch.Tensor,
target: torch.Tensor) -> torch.Tensor:
r"""Calculate the average precision for a single class.
AP summarizes a precision-recall curve as the weighted mean of maximum
precisions obtained for any r'>r, where r is the recall:
.. math::
\text{AP} = \sum_n (R_n - R_{n-1}) P_n
Note that no approximation is involved since the curve is piecewise
constant.
Args:
pred (torch.Tensor): The model prediction with shape
``(N, num_classes)``.
target (torch.Tensor): The target of predictions with shape
``(N, num_classes)``.
Returns:
torch.Tensor: average precision result.
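    Example:
        A worked sanity check with illustrative values: for
        ``pred = [0.9, 0.8, 0.3]`` and ``target = [1, 0, 1]``, sorting by
        score keeps the target order ``[1, 0, 1]``; the precisions at the
        two positive hits are ``1/1`` and ``2/3``, so the result is
        ``(1 + 2/3) / 2 ~= 0.833``.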
"""
assert pred.shape == target.shape, \
f"The size of pred ({pred.shape}) doesn't match "\
f'the target ({target.shape}).'
# a small value for division by zero errors
eps = torch.finfo(torch.float32).eps
# get rid of -1 target such as difficult sample
# that is not wanted in evaluation results.
valid_index = target > -1
pred = pred[valid_index]
target = target[valid_index]
# sort examples
sorted_pred_inds = torch.argsort(pred, dim=0, descending=True)
sorted_target = target[sorted_pred_inds]
# get indexes when gt_true is positive
pos_inds = sorted_target == 1
# Calculate cumulative tp case numbers
tps = torch.cumsum(pos_inds, 0)
    total_pos = tps[-1].item()  # copy now; `tps` is modified in-place below
# Calculate cumulative tp&fp(pred_poss) case numbers
pred_pos_nums = torch.arange(1, len(sorted_target) + 1).to(pred.device)
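    # Clamp to eps purely as a division-by-zero safeguard; the counts start
    # from 1, so the clamp does not change any value here.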
pred_pos_nums[pred_pos_nums < eps] = eps
tps[torch.logical_not(pos_inds)] = 0
precision = tps / pred_pos_nums.float()
ap = torch.sum(precision, 0) / max(total_pos, eps)
return ap
@METRICS.register_module()
class AveragePrecision(BaseMetric):
r"""Calculate the average precision with respect of classes.
AveragePrecision (AP) summarizes a precision-recall curve as the weighted
mean of maximum precisions obtained for any r'>r, where r is the recall:
.. math::
\text{AP} = \sum_n (R_n - R_{n-1}) P_n
Note that no approximation is involved since the curve is piecewise
constant.
Args:
average (str | None): How to calculate the final metrics from
every category. It supports two modes:
- `"macro"`: Calculate metrics for each category, and calculate
the mean value over all categories. The result of this mode
is also called **mAP**.
- `None`: Calculate metrics of every category and output directly.
Defaults to "macro".
collect_device (str): Device name used for collecting results from
different ranks during distributed training. Must be 'cpu' or
'gpu'. Defaults to 'cpu'.
prefix (str, optional): The prefix that will be added in the metric
names to disambiguate homonymous metrics of different evaluators.
If prefix is not provided in the argument, self.default_prefix
will be used instead. Defaults to None.
References
----------
1. `Wikipedia entry for the Average precision
<https://en.wikipedia.org/w/index.php?title=Information_retrieval&
oldid=793358396#Average_precision>`_
Examples:
>>> import torch
>>> from mmpretrain.evaluation import AveragePrecision
>>> # --------- The Basic Usage for one-hot pred scores ---------
>>> y_pred = torch.Tensor([[0.9, 0.8, 0.3, 0.2],
... [0.1, 0.2, 0.2, 0.1],
... [0.7, 0.5, 0.9, 0.3],
... [0.8, 0.1, 0.1, 0.2]])
>>> y_true = torch.Tensor([[1, 1, 0, 0],
... [0, 1, 0, 0],
... [0, 0, 1, 0],
... [1, 0, 0, 0]])
>>> AveragePrecision.calculate(y_pred, y_true)
tensor(70.833)
>>> # ------------------- Use with Evalutor -------------------
>>> from mmpretrain.structures import DataSample
>>> from mmengine.evaluator import Evaluator
>>> data_samples = [
... DataSample().set_pred_score(i).set_gt_score(j)
... for i, j in zip(y_pred, y_true)
... ]
>>> evaluator = Evaluator(metrics=AveragePrecision())
>>> evaluator.process(data_samples)
>>> evaluator.evaluate(5)
{'multi-label/mAP': 70.83333587646484}
>>> # Evaluate on each class
>>> evaluator = Evaluator(metrics=AveragePrecision(average=None))
>>> evaluator.process(data_samples)
>>> evaluator.evaluate(5)
{'multi-label/AP_classwise': [100., 83.33, 100., 0.]}
"""
default_prefix: Optional[str] = 'multi-label'
def __init__(self,
average: Optional[str] = 'macro',
collect_device: str = 'cpu',
prefix: Optional[str] = None) -> None:
super().__init__(collect_device=collect_device, prefix=prefix)
self.average = average
def process(self, data_batch, data_samples: Sequence[dict]):
"""Process one batch of data samples.
The processed results should be stored in ``self.results``, which will
        be used to compute the metrics when all batches have been processed.
Args:
data_batch: A batch of data from the dataloader.
data_samples (Sequence[dict]): A batch of outputs from the model.
"""
for data_sample in data_samples:
result = dict()
result['pred_score'] = data_sample['pred_score'].clone()
num_classes = result['pred_score'].size()[-1]
if 'gt_score' in data_sample:
result['gt_score'] = data_sample['gt_score'].clone()
else:
result['gt_score'] = label_to_onehot(data_sample['gt_label'],
num_classes)
# Save the result to `self.results`.
self.results.append(result)
def compute_metrics(self, results: List):
"""Compute the metrics from processed results.
Args:
results (list): The processed results of each batch.
Returns:
Dict: The computed metrics. The keys are the names of the metrics,
and the values are corresponding results.
"""
# NOTICE: don't access `self.results` from the method. `self.results`
        # is a list of results from multiple batches, while the input
        # `results` are the collected results.
# concat
target = torch.stack([res['gt_score'] for res in results])
pred = torch.stack([res['pred_score'] for res in results])
ap = self.calculate(pred, target, self.average)
result_metrics = dict()
if self.average is None:
result_metrics['AP_classwise'] = ap.detach().cpu().tolist()
else:
result_metrics['mAP'] = ap.item()
return result_metrics
@staticmethod
def calculate(pred: Union[torch.Tensor, np.ndarray],
target: Union[torch.Tensor, np.ndarray],
average: Optional[str] = 'macro') -> torch.Tensor:
r"""Calculate the average precision for a single class.
Args:
pred (torch.Tensor | np.ndarray): The model predictions with
shape ``(N, num_classes)``.
target (torch.Tensor | np.ndarray): The target of predictions
with shape ``(N, num_classes)``.
average (str | None): The average method. It supports two modes:
- `"macro"`: Calculate metrics for each category, and calculate
the mean value over all categories. The result of this mode
is also called mAP.
- `None`: Calculate metrics of every category and output
directly.
Defaults to "macro".
Returns:
torch.Tensor: the average precision of all classes.
"""
average_options = ['macro', None]
assert average in average_options, 'Invalid `average` argument, ' \
            f'please specify from {average_options}.'
pred = to_tensor(pred)
target = to_tensor(target)
assert pred.ndim == 2 and pred.shape == target.shape, \
'Both `pred` and `target` should have shape `(N, num_classes)`.'
num_classes = pred.shape[1]
ap = pred.new_zeros(num_classes)
for k in range(num_classes):
ap[k] = _average_precision(pred[:, k], target[:, k])
if average == 'macro':
return ap.mean() * 100.0
else:
return ap * 100
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, Sequence
from mmengine.evaluator import BaseMetric
from mmpretrain.registry import METRICS
@METRICS.register_module()
class MultiTasksMetric(BaseMetric):
"""Metrics for MultiTask
Args:
task_metrics(dict): a dictionary in the keys are the names of the tasks
and the values is a list of the metric corresponds to this task
Examples:
>>> import torch
>>> from mmpretrain.evaluation import MultiTasksMetric
        >>> # -------------------- The Basic Usage --------------------
        >>> task_metrics = {
        ...     'task0': [dict(type='Accuracy', topk=(1, ))],
        ...     'task1': [dict(type='Accuracy', topk=(1, 3))]
        ... }
        >>> pred = [{
        ...     'pred_task': {
        ...         'task0': torch.tensor([0.7, 0.0, 0.3]),
        ...         'task1': torch.tensor([0.5, 0.2, 0.3])
        ...     },
        ...     'gt_task': {
        ...         'task0': torch.tensor(0),
        ...         'task1': torch.tensor(2)
        ...     }
        ... }, {
        ...     'pred_task': {
        ...         'task0': torch.tensor([0.0, 0.0, 1.0]),
        ...         'task1': torch.tensor([0.0, 0.0, 1.0])
        ...     },
        ...     'gt_task': {
        ...         'task0': torch.tensor(2),
        ...         'task1': torch.tensor(2)
        ...     }
        ... }]
        >>> metric = MultiTasksMetric(task_metrics)
        >>> metric.process(None, pred)
        >>> metric.evaluate(2)
        {
            'task0_accuracy/top1': 100.0,
            'task1_accuracy/top1': 50.0,
            'task1_accuracy/top3': 100.0
        }
"""
def __init__(self,
task_metrics: Dict,
collect_device: str = 'cpu') -> None:
self.task_metrics = task_metrics
super().__init__(collect_device=collect_device)
self._metrics = {}
for task_name in self.task_metrics.keys():
self._metrics[task_name] = []
for metric in self.task_metrics[task_name]:
self._metrics[task_name].append(METRICS.build(metric))
def process(self, data_batch, data_samples: Sequence[dict]):
"""Process one batch of data samples.
The processed results should be stored in ``self.results``, which will
        be used to compute the metrics when all batches have been processed.
Args:
data_batch: A batch of data from the dataloader.
data_samples (Sequence[dict]): A batch of outputs from the model.
"""
for task_name in self.task_metrics.keys():
filtered_data_samples = []
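            # Keep only the samples whose `eval_mask` is True for this task;
            # the others carry no valid annotation for it and are skipped.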
for data_sample in data_samples:
eval_mask = data_sample[task_name]['eval_mask']
if eval_mask:
filtered_data_samples.append(data_sample[task_name])
for metric in self._metrics[task_name]:
metric.process(data_batch, filtered_data_samples)
def compute_metrics(self, results: list) -> dict:
raise NotImplementedError(
'compute metrics should not be used here directly')
def evaluate(self, size):
"""Evaluate the model performance of the whole dataset after processing
all batches.
Args:
size (int): Length of the entire validation dataset. When batch
size > 1, the dataloader may pad some data samples to make
sure all ranks have the same length of dataset slice. The
``collect_results`` function will drop the padded data based on
this size.
Returns:
dict: Evaluation metrics dict on the val dataset. The keys are
"{task_name}_{metric_name}" , and the values
are corresponding results.
"""
metrics = {}
for task_name in self._metrics:
for metric in self._metrics[task_name]:
name = metric.__class__.__name__
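                # A nested MultiTasksMetric evaluates itself; an ordinary
                # metric with no processed samples (e.g. every sample was
                # masked out for this task) is reported as 0 under its
                # class name instead of being evaluated on empty results.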
if name == 'MultiTasksMetric' or metric.results:
results = metric.evaluate(size)
else:
results = {metric.__class__.__name__: 0}
for key in results:
name = f'{task_name}_{key}'
                    if name in metrics:
"""Inspired from https://github.com/open-
mmlab/mmengine/ bl ob/ed20a9cba52ceb371f7c825131636b9e2
747172e/mmengine/evalua tor/evaluator.py#L84-L87."""
raise ValueError(
                            'There are multiple metric results with the same '
                            f'metric name {name}. Please make sure all '
                            'metrics have different prefixes.')
metrics[name] = results[key]
return metrics
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional
import mmengine
from mmpretrain.registry import METRICS
from mmpretrain.utils import require
from .caption import COCOCaption, save_result
try:
from pycocoevalcap.eval import COCOEvalCap
from pycocotools.coco import COCO
except ImportError:
COCOEvalCap = None
COCO = None
@METRICS.register_module()
class NocapsSave(COCOCaption):
"""Nocaps evaluation wrapper.
    Save the generated captions and transform them into COCO format.
    The dumped file can be submitted to the official evaluation system.
    Args:
        save_dir (str): The directory to save the prediction file.
            Defaults to './'.
        collect_device (str): Device name used for collecting results from
            different ranks during distributed training. Must be 'cpu' or
            'gpu'. Defaults to 'cpu'.
        prefix (str, optional): The prefix that will be added in the metric
            names to disambiguate homonymous metrics of different evaluators.
            If prefix is not provided in the argument, self.default_prefix
            will be used instead. Defaults to None.
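    Examples:
        A minimal config sketch (the save directory below is illustrative):
        >>> val_evaluator = dict(type='NocapsSave', save_dir='./nocaps_pred')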
"""
@require('pycocoevalcap')
def __init__(self,
save_dir: str = './',
collect_device: str = 'cpu',
prefix: Optional[str] = None):
super(COCOCaption, self).__init__(
collect_device=collect_device, prefix=prefix)
self.save_dir = save_dir
def compute_metrics(self, results: List):
"""Compute the metrics from processed results.
Args:
            results (list): The processed results of each batch.
"""
mmengine.mkdir_or_exist(self.save_dir)
save_result(
result=results,
result_dir=self.save_dir,
filename='nocap_pred',
remove_duplicate='image_id',
)
return dict()