Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
mmgeneration
Commits
b7536f78
Commit
b7536f78
authored
Jun 16, 2025
by
limm
Browse files
add a to another part of mmgeneration code
parent
57e0e891
Pipeline
#2777
canceled with stages
Changes
185
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3681 additions
and
0 deletions
+3681
-0
mmgen/models/architectures/cyclegan/__init__.py
mmgen/models/architectures/cyclegan/__init__.py
+5
-0
mmgen/models/architectures/cyclegan/generator_discriminator.py
.../models/architectures/cyclegan/generator_discriminator.py
+152
-0
mmgen/models/architectures/cyclegan/modules.py
mmgen/models/architectures/cyclegan/modules.py
+75
-0
mmgen/models/architectures/dcgan/__init__.py
mmgen/models/architectures/dcgan/__init__.py
+4
-0
mmgen/models/architectures/dcgan/generator_discriminator.py
mmgen/models/architectures/dcgan/generator_discriminator.py
+315
-0
mmgen/models/architectures/ddpm/__init__.py
mmgen/models/architectures/ddpm/__init__.py
+9
-0
mmgen/models/architectures/ddpm/denoising.py
mmgen/models/architectures/ddpm/denoising.py
+422
-0
mmgen/models/architectures/ddpm/modules.py
mmgen/models/architectures/ddpm/modules.py
+422
-0
mmgen/models/architectures/fid_inception.py
mmgen/models/architectures/fid_inception.py
+339
-0
mmgen/models/architectures/lpips/__init__.py
mmgen/models/architectures/lpips/__init__.py
+8
-0
mmgen/models/architectures/lpips/networks_basic.py
mmgen/models/architectures/lpips/networks_basic.py
+213
-0
mmgen/models/architectures/lpips/perceptual_loss.py
mmgen/models/architectures/lpips/perceptual_loss.py
+62
-0
mmgen/models/architectures/lpips/pretrained_networks.py
mmgen/models/architectures/lpips/pretrained_networks.py
+54
-0
mmgen/models/architectures/lsgan/__init__.py
mmgen/models/architectures/lsgan/__init__.py
+4
-0
mmgen/models/architectures/lsgan/generator_discriminator.py
mmgen/models/architectures/lsgan/generator_discriminator.py
+301
-0
mmgen/models/architectures/pggan/__init__.py
mmgen/models/architectures/pggan/__init__.py
+13
-0
mmgen/models/architectures/pggan/generator_discriminator.py
mmgen/models/architectures/pggan/generator_discriminator.py
+456
-0
mmgen/models/architectures/pggan/modules.py
mmgen/models/architectures/pggan/modules.py
+567
-0
mmgen/models/architectures/pix2pix/__init__.py
mmgen/models/architectures/pix2pix/__init__.py
+8
-0
mmgen/models/architectures/pix2pix/generator_discriminator.py
...n/models/architectures/pix2pix/generator_discriminator.py
+252
-0
No files found.
mmgen/models/architectures/cyclegan/__init__.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
from
.generator_discriminator
import
ResnetGenerator
from
.modules
import
ResidualBlockWithDropout
__all__
=
[
'ResnetGenerator'
,
'ResidualBlockWithDropout'
]
mmgen/models/architectures/cyclegan/generator_discriminator.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
import
torch.nn
as
nn
from
mmcv.cnn
import
ConvModule
from
mmcv.runner
import
load_checkpoint
from
mmgen.models.architectures.pix2pix
import
generation_init_weights
from
mmgen.models.builder
import
MODULES
from
mmgen.utils
import
get_root_logger
from
.modules
import
ResidualBlockWithDropout
@MODULES.register_module()
class ResnetGenerator(nn.Module):
    """Resnet-based generator: a 7x7 stem, two stride-2 downsampling convs,
    a stack of residual blocks at the bottleneck, and a mirrored pair of
    stride-2 transposed convs followed by a Tanh output head.

    Args:
        in_channels (int): Number of channels in input images.
        out_channels (int): Number of channels in output images.
        base_channels (int): Number of filters at the last conv layer.
            Default: 64.
        norm_cfg (dict): Config dict to build norm layer. Default:
            `dict(type='IN')`.
        use_dropout (bool): Whether to use dropout layers. Default: False.
        num_blocks (int): Number of residual blocks. Default: 9.
        padding_mode (str): The name of padding layer in conv layers:
            'reflect' | 'replicate' | 'zeros'. Default: 'reflect'.
        init_cfg (dict): Config dict for initialization.
            `type`: The name of our initialization method. Default: 'normal'.
            `gain`: Scaling factor for normal, xavier and orthogonal.
            Default: 0.02.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 base_channels=64,
                 norm_cfg=dict(type='IN'),
                 use_dropout=False,
                 num_blocks=9,
                 padding_mode='reflect',
                 init_cfg=dict(type='normal', gain=0.02)):
        super().__init__()
        assert num_blocks >= 0, ('Number of residual blocks must be '
                                 f'non-negative, but got {num_blocks}.')
        assert isinstance(norm_cfg, dict), ("'norm_cfg' should be dict, but"
                                            f'got {type(norm_cfg)}')
        assert 'type' in norm_cfg, "'norm_cfg' must have key 'type'"

        # Only enable conv bias for IN, following CycleGAN's original
        # implementation.
        use_bias = norm_cfg['type'] == 'IN'

        layers = []

        # Stem: a wide 7x7 conv that keeps the spatial resolution.
        layers.append(
            ConvModule(
                in_channels=in_channels,
                out_channels=base_channels,
                kernel_size=7,
                padding=3,
                bias=use_bias,
                norm_cfg=norm_cfg,
                padding_mode=padding_mode))

        # Encoder: each step halves the resolution and doubles the width.
        num_down = 2
        for level in range(num_down):
            width = base_channels * 2**level
            layers.append(
                ConvModule(
                    in_channels=width,
                    out_channels=width * 2,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    bias=use_bias,
                    norm_cfg=norm_cfg))

        # Bottleneck: residual blocks at the smallest resolution.
        bottleneck_width = base_channels * 2**num_down
        for _ in range(num_blocks):
            layers.append(
                ResidualBlockWithDropout(
                    bottleneck_width,
                    padding_mode=padding_mode,
                    norm_cfg=norm_cfg,
                    use_dropout=use_dropout))

        # Decoder: transposed convs mirror the encoder.
        for level in range(num_down):
            width = base_channels * 2**(num_down - level)
            layers.append(
                ConvModule(
                    in_channels=width,
                    out_channels=width // 2,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    bias=use_bias,
                    conv_cfg=dict(type='deconv', output_padding=1),
                    norm_cfg=norm_cfg))

        # Head: project back to image channels and squash with Tanh.
        layers.append(
            ConvModule(
                in_channels=base_channels,
                out_channels=out_channels,
                kernel_size=7,
                padding=3,
                bias=True,
                norm_cfg=None,
                act_cfg=dict(type='Tanh'),
                padding_mode=padding_mode))

        self.model = nn.Sequential(*layers)

        # Stash the init method/gain for ``init_weights``; fall back to the
        # paper defaults when ``init_cfg`` is None or keys are missing.
        if init_cfg is None:
            self.init_type, self.init_gain = 'normal', 0.02
        else:
            self.init_type = init_cfg.get('type', 'normal')
            self.init_gain = init_cfg.get('gain', 0.02)

    def forward(self, x):
        """Forward function.

        Args:
            x (Tensor): Input tensor with shape (n, c, h, w).

        Returns:
            Tensor: Forward results.
        """
        return self.model(x)

    def init_weights(self, pretrained=None, strict=True):
        """Initialize weights for the model.

        Args:
            pretrained (str, optional): Path for pretrained weights. If given
                None, pretrained weights will not be loaded. Default: None.
            strict (bool, optional): Whether to allow different params for the
                model and checkpoint. Default: True.
        """
        if pretrained is None:
            # No checkpoint: apply the configured random initialization.
            generation_init_weights(
                self, init_type=self.init_type, init_gain=self.init_gain)
        elif isinstance(pretrained, str):
            logger = get_root_logger()
            load_checkpoint(self, pretrained, strict=strict, logger=logger)
        else:
            raise TypeError("'pretrained' must be a str or None. "
                            f'But received {type(pretrained)}.')
mmgen/models/architectures/cyclegan/modules.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
import
torch.nn
as
nn
from
mmcv.cnn
import
ConvModule
class ResidualBlockWithDropout(nn.Module):
    """Residual block with an optional dropout layer between its two convs.

    Ref:
    Deep Residual Learning for Image Recognition

    A residual block is a conv block with skip connections. A dropout layer
    may be inserted between the two conv modules.

    Args:
        channels (int): Number of channels in the conv layer.
        padding_mode (str): The name of padding layer:
            'reflect' | 'replicate' | 'zeros'.
        norm_cfg (dict): Config dict to build norm layer. Default:
            `dict(type='BN')`.
        use_dropout (bool): Whether to use dropout layers. Default: True.
    """

    def __init__(self,
                 channels,
                 padding_mode,
                 norm_cfg=dict(type='BN'),
                 use_dropout=True):
        super().__init__()
        assert isinstance(norm_cfg, dict), ("'norm_cfg' should be dict, but"
                                            f'got {type(norm_cfg)}')
        assert 'type' in norm_cfg, "'norm_cfg' must have key 'type'"

        # Only enable conv bias for IN, following CycleGAN's original
        # implementation.
        use_bias = norm_cfg['type'] == 'IN'

        # Both convs share everything except the activation on the second.
        conv_kwargs = dict(
            in_channels=channels,
            out_channels=channels,
            kernel_size=3,
            padding=1,
            bias=use_bias,
            norm_cfg=norm_cfg,
            padding_mode=padding_mode)

        layers = [ConvModule(**conv_kwargs)]
        if use_dropout:
            layers.append(nn.Dropout(0.5))
        # No activation on the second conv so the skip-sum stays unsquashed.
        layers.append(ConvModule(act_cfg=None, **conv_kwargs))

        self.block = nn.Sequential(*layers)

    def forward(self, x):
        """Forward function. Add skip connections without final ReLU.

        Args:
            x (Tensor): Input tensor with shape (n, c, h, w).

        Returns:
            Tensor: Forward results.
        """
        return x + self.block(x)
mmgen/models/architectures/dcgan/__init__.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
from
.generator_discriminator
import
DCGANDiscriminator
,
DCGANGenerator
__all__
=
[
'DCGANGenerator'
,
'DCGANDiscriminator'
]
mmgen/models/architectures/dcgan/generator_discriminator.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
import
numpy
as
np
import
torch
import
torch.nn
as
nn
from
mmcv.cnn
import
ConvModule
,
normal_init
from
mmcv.runner
import
load_checkpoint
from
mmcv.utils.parrots_wrapper
import
_BatchNorm
from
mmgen.models.builder
import
MODULES
from
mmgen.utils
import
get_root_logger
from
..common
import
get_module_device
@MODULES.register_module()
class DCGANGenerator(nn.Module):
    """Generator for DCGAN.

    Implementation Details for DCGAN architecture:

    #. Adopt transposed convolution in the generator;
    #. Use batchnorm in the generator except for the final output layer;
    #. Use ReLU in the generator in addition to the final output layer.

    More details can be found in the original paper:
    Unsupervised Representation Learning with Deep Convolutional
    Generative Adversarial Networks
    http://arxiv.org/abs/1511.06434

    Args:
        output_scale (int | tuple[int]): Output scale for the generated
            image. If only a integer is provided, the output image will
            be a square shape. The tuple of two integers will set the
            height and width for the output image, respectively.
        out_channels (int, optional): The channel number of the output
            feature. Default to 3.
        base_channels (int, optional): The basic channel number of the
            generator. The other layers contains channels based on this
            number. Default to 1024.
        input_scale (int | tuple[int], optional): Output scale for the
            generated image. If only a integer is provided, the input feature
            ahead of the convolutional generator will be a square shape. The
            tuple of two integers will set the height and width for the input
            convolutional feature, respectively. Defaults to 4.
        noise_size (int, optional): Size of the input noise
            vector. Defaults to 100.
        default_norm_cfg (dict, optional): Norm config for all of layers
            except for the final output layer. Defaults to ``dict(type='BN')``.
        default_act_cfg (dict, optional): Activation config for all of layers
            except for the final output layer. Defaults to
            ``dict(type='ReLU')``.
        out_act_cfg (dict, optional): Activation config for the final output
            layer. Defaults to ``dict(type='Tanh')``.
        pretrained (str, optional): Path for the pretrained model. Default to
            ``None``.
    """

    def __init__(self,
                 output_scale,
                 out_channels=3,
                 base_channels=1024,
                 input_scale=4,
                 noise_size=100,
                 default_norm_cfg=dict(type='BN'),
                 default_act_cfg=dict(type='ReLU'),
                 out_act_cfg=dict(type='Tanh'),
                 pretrained=None):
        super().__init__()
        self.output_scale = output_scale
        self.base_channels = base_channels
        self.input_scale = input_scale
        self.noise_size = noise_size

        # Each upsampling step doubles the resolution, so the number of
        # steps is log2 of the total magnification.
        self.num_upsamples = int(np.log2(output_scale // input_scale))

        # Project the (n, noise_size, 1, 1) input to a 4x4 feature map.
        self.noise2feat = ConvModule(
            noise_size,
            base_channels,
            kernel_size=4,
            stride=1,
            padding=0,
            conv_cfg=dict(type='ConvTranspose2d'),
            norm_cfg=default_norm_cfg,
            act_cfg=default_act_cfg)

        # Upsampling backbone (the output layer is built separately below);
        # every stage doubles the resolution and halves the channels.
        channels = base_channels
        stages = []
        for _ in range(self.num_upsamples - 1):
            stages.append(
                ConvModule(
                    channels,
                    channels // 2,
                    kernel_size=4,
                    stride=2,
                    padding=1,
                    conv_cfg=dict(type='ConvTranspose2d'),
                    norm_cfg=default_norm_cfg,
                    act_cfg=default_act_cfg))
            channels //= 2
        self.upsampling = nn.Sequential(*stages)

        # Output layer: no norm, custom (usually Tanh) activation.
        self.output_layer = ConvModule(
            channels,
            out_channels,
            kernel_size=4,
            stride=2,
            padding=1,
            conv_cfg=dict(type='ConvTranspose2d'),
            norm_cfg=None,
            act_cfg=out_act_cfg)

        self.init_weights(pretrained=pretrained)

    def forward(self, noise, num_batches=0, return_noise=False):
        """Forward function.

        Args:
            noise (torch.Tensor | callable | None): You can directly give a
                batch of noise through a ``torch.Tensor`` or offer a callable
                function to sample a batch of noise data. Otherwise, the
                ``None`` indicates to use the default noise sampler.
            num_batches (int, optional): The number of batch size.
                Defaults to 0.
            return_noise (bool, optional): If True, ``noise_batch`` will be
                returned in a dict with ``fake_img``. Defaults to False.

        Returns:
            torch.Tensor | dict: If not ``return_noise``, only the output
                image will be returned. Otherwise, a dict contains
                ``fake_img`` and ``noise_batch`` will be returned.
        """
        # Case 1: an explicit noise tensor; normalize shape to (n, c, 1, 1).
        if isinstance(noise, torch.Tensor):
            assert noise.shape[1] == self.noise_size
            if noise.ndim == 2:
                noise_batch = noise[:, :, None, None]
            elif noise.ndim == 4:
                noise_batch = noise
            else:
                raise ValueError('The noise should be in shape of (n, c) or '
                                 f'(n, c, 1, 1), but got {noise.shape}')
        # Case 2: a user-supplied sampler.
        elif callable(noise):
            assert num_batches > 0
            noise_batch = noise((num_batches, self.noise_size, 1, 1))
        # Case 3: fall back to the default Gaussian sampler.
        else:
            assert num_batches > 0
            noise_batch = torch.randn((num_batches, self.noise_size, 1, 1))

        # dirty code for putting data on the right device
        noise_batch = noise_batch.to(get_module_device(self))

        x = self.noise2feat(noise_batch)
        x = self.upsampling(x)
        x = self.output_layer(x)

        if return_noise:
            return dict(fake_img=x, noise_batch=noise_batch)
        return x

    def init_weights(self, pretrained=None):
        """Init weights for models.

        We just use the initialization method proposed in the original paper.

        Args:
            pretrained (str, optional): Path for pretrained weights. If given
                None, pretrained weights will not be loaded. Defaults to None.
        """
        if isinstance(pretrained, str):
            logger = get_root_logger()
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            # Paper init: N(0, 0.02) for convs, N(0, 1) weight / zero bias
            # for batch norms.
            for module in self.modules():
                if isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)):
                    normal_init(module, 0, 0.02)
                elif isinstance(module, _BatchNorm):
                    nn.init.normal_(module.weight.data)
                    nn.init.constant_(module.bias.data, 0)
        else:
            raise TypeError('pretrained must be a str or None but'
                            f' got {type(pretrained)} instead.')
@MODULES.register_module()
class DCGANDiscriminator(nn.Module):
    """Discriminator for DCGAN.

    Implementation Details for DCGAN architecture:

    #. Adopt convolution in the discriminator;
    #. Use batchnorm in the discriminator except for the input and final \
       output layer;
    #. Use LeakyReLU in the discriminator in addition to the output layer.

    Args:
        input_scale (int): The scale of the input image.
        output_scale (int): The final scale of the convolutional feature.
        out_channels (int): The channel number of the final output layer.
        in_channels (int, optional): The channel number of the input image.
            Defaults to 3.
        base_channels (int, optional): The basic channel number of the
            discriminator. The other layers contain channels based on this
            number. Defaults to 128.
        default_norm_cfg (dict, optional): Norm config for all of layers
            except for the final output layer. Defaults to ``dict(type='BN')``.
        default_act_cfg (dict, optional): Activation config for all of layers
            except for the final output layer. Defaults to
            ``dict(type='LeakyReLU')``.
        out_act_cfg (dict, optional): Activation config for the final output
            layer. Defaults to ``None``.
        pretrained (str, optional): Path for the pretrained model. Default to
            ``None``.
    """

    def __init__(self,
                 input_scale,
                 output_scale,
                 out_channels,
                 in_channels=3,
                 base_channels=128,
                 default_norm_cfg=dict(type='BN'),
                 default_act_cfg=dict(type='LeakyReLU'),
                 out_act_cfg=None,
                 pretrained=None):
        super().__init__()
        self.input_scale = input_scale
        self.output_scale = output_scale
        self.out_channels = out_channels
        self.base_channels = base_channels

        # Each downsampling step halves the resolution, so the number of
        # steps is log2 of the total reduction.
        self.num_downsamples = int(np.log2(input_scale // output_scale))

        # Build up downsampling backbone (excluding the output layer).
        # Initialize curr_channels up front so the output layer is still
        # well-defined when input_scale == output_scale (empty backbone);
        # previously curr_channels was unbound in that case (NameError).
        curr_channels = in_channels
        downsamples = []
        for i in range(self.num_downsamples):
            # Remove norm for the first conv, per the DCGAN paper.
            norm_cfg_ = None if i == 0 else default_norm_cfg
            in_ch = in_channels if i == 0 else base_channels * 2**(i - 1)

            downsamples.append(
                ConvModule(
                    in_ch,
                    base_channels * 2**i,
                    kernel_size=4,
                    stride=2,
                    padding=1,
                    conv_cfg=dict(type='Conv2d'),
                    norm_cfg=norm_cfg_,
                    act_cfg=default_act_cfg))
            curr_channels = base_channels * 2**i

        self.downsamples = nn.Sequential(*downsamples)

        # Output layer: collapses the remaining output_scale x output_scale
        # feature to 1x1; no norm, optional activation.
        self.output_layer = ConvModule(
            curr_channels,
            out_channels,
            kernel_size=4,
            stride=1,
            padding=0,
            conv_cfg=dict(type='Conv2d'),
            norm_cfg=None,
            act_cfg=out_act_cfg)

        self.init_weights(pretrained=pretrained)

    def forward(self, x):
        """Forward function.

        Args:
            x (torch.Tensor): Fake or real image tensor.

        Returns:
            torch.Tensor: Prediction for the reality of the input image.
        """
        n = x.shape[0]
        x = self.downsamples(x)
        x = self.output_layer(x)

        # Reshape to a flattened (n, -1) prediction.
        return x.view(n, -1)

    def init_weights(self, pretrained=None):
        """Init weights for models.

        We just use the initialization method proposed in the original paper.

        Args:
            pretrained (str, optional): Path for pretrained weights. If given
                None, pretrained weights will not be loaded. Defaults to None.
        """
        if isinstance(pretrained, str):
            logger = get_root_logger()
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            # Paper init: N(0, 0.02) for convs, N(0, 1) weight / zero bias
            # for batch norms.
            for m in self.modules():
                if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
                    normal_init(m, 0, 0.02)
                elif isinstance(m, _BatchNorm):
                    nn.init.normal_(m.weight.data)
                    nn.init.constant_(m.bias.data, 0)
        else:
            raise TypeError('pretrained must be a str or None but'
                            f' got {type(pretrained)} instead.')
mmgen/models/architectures/ddpm/__init__.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
from
.denoising
import
DenoisingUnet
from
.modules
import
(
DenoisingDownsample
,
DenoisingResBlock
,
DenoisingUpsample
,
TimeEmbedding
)
__all__
=
[
'DenoisingUnet'
,
'TimeEmbedding'
,
'DenoisingDownsample'
,
'DenoisingUpsample'
,
'DenoisingResBlock'
]
mmgen/models/architectures/ddpm/denoising.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
from
copy
import
deepcopy
import
torch
import
torch.nn
as
nn
from
mmcv.cnn
import
constant_init
from
mmcv.cnn.bricks.conv_module
import
ConvModule
from
mmcv.runner
import
load_checkpoint
from
mmgen.models.builder
import
MODULES
,
build_module
from
mmgen.utils
import
get_root_logger
from
.modules
import
EmbedSequential
,
TimeEmbedding
@
MODULES
.
register_module
()
class
DenoisingUnet
(
nn
.
Module
):
"""Denoising Unet. This network receives a diffused image ``x_t`` and
current timestep ``t``, and returns a ``output_dict`` corresponding to the
passed ``output_cfg``.
``output_cfg`` defines the number of channels and the meaning of the
output. ``output_cfg`` mainly contains keys of ``mean`` and ``var``,
denoting how the network outputs mean and variance required for the
denoising process.
For ``mean``:
1. ``dict(mean='EPS')``: Model will predict noise added in the
diffusion process, and the ``output_dict`` will contain a key named
``eps_t_pred``.
2. ``dict(mean='START_X')``: Model will direct predict the mean of the
original image `x_0`, and the ``output_dict`` will contain a key named
``x_0_pred``.
3. ``dict(mean='X_TM1_PRED')``: Model will predict the mean of diffused
image at `t-1` timestep, and the ``output_dict`` will contain a key
named ``x_tm1_pred``.
For ``var``:
1. ``dict(var='FIXED_SMALL')`` or ``dict(var='FIXED_LARGE')``: Variance in
the denoising process is regarded as a fixed value. Therefore only
'mean' will be predicted, and the output channels will equal to the
input image (e.g., three channels for RGB image.)
2. ``dict(var='LEARNED')``: Model will predict `log_variance` in the
denoising process, and the ``output_dict`` will contain a key named
``log_var``.
3. ``dict(var='LEARNED_RANGE')``: Model will predict an interpolation
factor and the `log_variance` will be calculated as
`factor * upper_bound + (1-factor) * lower_bound`. The ``output_dict``
will contain a key named ``factor``.
If ``var`` is not ``FIXED_SMALL`` or ``FIXED_LARGE``, the number of output
channels will be the double of input channels, where the first half part
contains predicted mean values and the other part is the predicted
variance values. Otherwise, the number of output channels equals to the
input channels, only containing the predicted mean values.
Args:
image_size (int | list[int]): The size of image to denoise.
in_channels (int, optional): The input channels of the input image.
Defaults as ``3``.
base_channels (int, optional): The basic channel number of the
generator. The other layers contain channels based on this number.
Defaults to ``128``.
resblocks_per_downsample (int, optional): Number of ResBlock used
between two downsample operations. The number of ResBlock between
upsample operations will be the same value to keep symmetry.
Defaults to 3.
num_timesteps (int, optional): The total timestep of the denoising
process and the diffusion process. Defaults to ``1000``.
use_rescale_timesteps (bool, optional): Whether rescale the input
timesteps in range of [0, 1000]. Defaults to ``True``.
dropout (float, optional): The probability of dropout operation of
each ResBlock. Pass ``0`` to do not use dropout. Defaults as 0.
embedding_channels (int, optional): The output channels of time
embedding layer and label embedding layer. If not passed (or
passed ``-1``), output channels of the embedding layers will set
as four times of ``base_channels``. Defaults to ``-1``.
num_classes (int, optional): The number of conditional classes. If set
to 0, this model will be degraded to an unconditional model.
Defaults to 0.
channels_cfg (list | dict[list], optional): Config for input channels
of the intermedia blocks. If list is passed, each element of the
list indicates the scale factor for the input channels of the
current block with regard to the ``base_channels``. For block
``i``, the input and output channels should be
``channels_cfg[i] * base_channels`` and
``channels_cfg[i+1] * base_channels`` If dict is provided, the key
of the dict should be the output scale and corresponding value
should be a list to define channels. Default: Please refer to
``_defualt_channels_cfg``.
output_cfg (dict, optional): Config for output variables. Defaults to
``dict(mean='eps', var='learned_range')``.
norm_cfg (dict, optional): The config for normalization layers.
Defaults to ``dict(type='GN', num_groups=32)``.
act_cfg (dict, optional): The config for activation layers. Defaults
to ``dict(type='SiLU', inplace=False)``.
shortcut_kernel_size (int, optional): The kernel size for shortcut
conv in ResBlocks. The value of this argument will overwrite the
default value of `resblock_cfg`. Defaults to `3`.
use_scale_shift_norm (bool, optional): Whether perform scale and shift
after normalization operation. Defaults to True.
num_heads (int, optional): The number of attention heads. Defaults to
4.
time_embedding_mode (str, optional): Embedding method of
``time_embedding``. Defaults to 'sin'.
time_embedding_cfg (dict, optional): Config for ``time_embedding``.
Defaults to None.
resblock_cfg (dict, optional): Config for ResBlock. Defaults to
``dict(type='DenoisingResBlock')``.
attention_cfg (dict, optional): Config for attention operation.
Defaults to ``dict(type='MultiHeadAttention')``.
upsample_conv (bool, optional): Whether use conv in upsample block.
Defaults to ``True``.
downsample_conv (bool, optional): Whether use conv operation in
downsample block. Defaults to ``True``.
upsample_cfg (dict, optional): Config for upsample blocks.
Defaults to ``dict(type='DenoisingUpsample')``.
downsample_cfg (dict, optional): Config for downsample blocks.
Defaults to ``dict(type='DenoisingDownsample')``.
attention_res (int | list[int], optional): Resolution of feature maps
to apply attention operation. Defaults to ``[16, 8]``.
pretrained (str | dict, optional): Path for the pretrained model or
dict containing information for pretained models whose necessary
key is 'ckpt_path'. Besides, you can also provide 'prefix' to load
the generator part from the whole state dict. Defaults to None.
"""
_default_channels_cfg
=
{
256
:
[
1
,
1
,
2
,
2
,
4
,
4
],
64
:
[
1
,
2
,
3
,
4
],
32
:
[
1
,
2
,
2
,
2
]
}
def
__init__
(
self
,
image_size
,
in_channels
=
3
,
base_channels
=
128
,
resblocks_per_downsample
=
3
,
num_timesteps
=
1000
,
use_rescale_timesteps
=
True
,
dropout
=
0
,
embedding_channels
=-
1
,
num_classes
=
0
,
channels_cfg
=
None
,
output_cfg
=
dict
(
mean
=
'eps'
,
var
=
'learned_range'
),
norm_cfg
=
dict
(
type
=
'GN'
,
num_groups
=
32
),
act_cfg
=
dict
(
type
=
'SiLU'
,
inplace
=
False
),
shortcut_kernel_size
=
1
,
use_scale_shift_norm
=
False
,
num_heads
=
4
,
time_embedding_mode
=
'sin'
,
time_embedding_cfg
=
None
,
resblock_cfg
=
dict
(
type
=
'DenoisingResBlock'
),
attention_cfg
=
dict
(
type
=
'MultiHeadAttention'
),
downsample_conv
=
True
,
upsample_conv
=
True
,
downsample_cfg
=
dict
(
type
=
'DenoisingDownsample'
),
upsample_cfg
=
dict
(
type
=
'DenoisingUpsample'
),
attention_res
=
[
16
,
8
],
pretrained
=
None
):
super
().
__init__
()
self
.
num_classes
=
num_classes
self
.
num_timesteps
=
num_timesteps
self
.
use_rescale_timesteps
=
use_rescale_timesteps
self
.
output_cfg
=
deepcopy
(
output_cfg
)
self
.
mean_mode
=
self
.
output_cfg
.
get
(
'mean'
,
'eps'
)
self
.
var_mode
=
self
.
output_cfg
.
get
(
'var'
,
'learned_range'
)
# double output_channels to output mean and var at same time
out_channels
=
in_channels
if
'FIXED'
in
self
.
var_mode
.
upper
()
\
else
2
*
in_channels
self
.
out_channels
=
out_channels
# check type of image_size
if
not
isinstance
(
image_size
,
int
)
and
not
isinstance
(
image_size
,
list
):
raise
TypeError
(
'Only support `int` and `list[int]` for `image_size`.'
)
if
isinstance
(
image_size
,
list
):
assert
len
(
image_size
)
==
2
,
'The length of `image_size` should be 2.'
assert
image_size
[
0
]
==
image_size
[
1
],
'Width and height of the image should be same.'
image_size
=
image_size
[
0
]
self
.
image_size
=
image_size
channels_cfg
=
deepcopy
(
self
.
_default_channels_cfg
)
\
if
channels_cfg
is
None
else
deepcopy
(
channels_cfg
)
if
isinstance
(
channels_cfg
,
dict
):
if
image_size
not
in
channels_cfg
:
raise
KeyError
(
f
'`image_size=
{
image_size
}
is not found in '
'`channels_cfg`, only support configs for '
f
'
{
[
chn
for
chn
in
channels_cfg
.
keys
()]
}
'
)
self
.
channel_factor_list
=
channels_cfg
[
image_size
]
elif
isinstance
(
channels_cfg
,
list
):
self
.
channel_factor_list
=
channels_cfg
else
:
raise
ValueError
(
'Only support list or dict for `channels_cfg`, '
f
'receive
{
type
(
channels_cfg
)
}
'
)
embedding_channels
=
base_channels
*
4
\
if
embedding_channels
==
-
1
else
embedding_channels
self
.
time_embedding
=
TimeEmbedding
(
base_channels
,
embedding_channels
=
embedding_channels
,
embedding_mode
=
time_embedding_mode
,
embedding_cfg
=
time_embedding_cfg
,
act_cfg
=
act_cfg
)
if
self
.
num_classes
!=
0
:
self
.
label_embedding
=
nn
.
Embedding
(
self
.
num_classes
,
embedding_channels
)
self
.
resblock_cfg
=
deepcopy
(
resblock_cfg
)
self
.
resblock_cfg
.
setdefault
(
'dropout'
,
dropout
)
self
.
resblock_cfg
.
setdefault
(
'norm_cfg'
,
norm_cfg
)
self
.
resblock_cfg
.
setdefault
(
'act_cfg'
,
act_cfg
)
self
.
resblock_cfg
.
setdefault
(
'embedding_channels'
,
embedding_channels
)
self
.
resblock_cfg
.
setdefault
(
'use_scale_shift_norm'
,
use_scale_shift_norm
)
self
.
resblock_cfg
.
setdefault
(
'shortcut_kernel_size'
,
shortcut_kernel_size
)
# get scales of ResBlock to apply attention
attention_scale
=
[
image_size
//
int
(
res
)
for
res
in
attention_res
]
self
.
attention_cfg
=
deepcopy
(
attention_cfg
)
self
.
attention_cfg
.
setdefault
(
'num_heads'
,
num_heads
)
self
.
attention_cfg
.
setdefault
(
'norm_cfg'
,
norm_cfg
)
self
.
downsample_cfg
=
deepcopy
(
downsample_cfg
)
self
.
downsample_cfg
.
setdefault
(
'with_conv'
,
downsample_conv
)
self
.
upsample_cfg
=
deepcopy
(
upsample_cfg
)
self
.
upsample_cfg
.
setdefault
(
'with_conv'
,
upsample_conv
)
# init the channel scale factor
scale
=
1
self
.
in_blocks
=
nn
.
ModuleList
([
EmbedSequential
(
nn
.
Conv2d
(
in_channels
,
base_channels
,
3
,
1
,
padding
=
1
))
])
self
.
in_channels_list
=
[
base_channels
]
# construct the encoder part of Unet
for
level
,
factor
in
enumerate
(
self
.
channel_factor_list
):
in_channels_
=
base_channels
if
level
==
0
\
else
base_channels
*
self
.
channel_factor_list
[
level
-
1
]
out_channels_
=
base_channels
*
factor
for
_
in
range
(
resblocks_per_downsample
):
layers
=
[
build_module
(
self
.
resblock_cfg
,
{
'in_channels'
:
in_channels_
,
'out_channels'
:
out_channels_
})
]
in_channels_
=
out_channels_
if
scale
in
attention_scale
:
layers
.
append
(
build_module
(
self
.
attention_cfg
,
{
'in_channels'
:
in_channels_
}))
self
.
in_channels_list
.
append
(
in_channels_
)
self
.
in_blocks
.
append
(
EmbedSequential
(
*
layers
))
if
level
!=
len
(
self
.
channel_factor_list
)
-
1
:
self
.
in_blocks
.
append
(
EmbedSequential
(
build_module
(
self
.
downsample_cfg
,
{
'in_channels'
:
in_channels_
})))
self
.
in_channels_list
.
append
(
in_channels_
)
scale
*=
2
# construct the bottom part of Unet
self
.
mid_blocks
=
EmbedSequential
(
build_module
(
self
.
resblock_cfg
,
{
'in_channels'
:
in_channels_
}),
build_module
(
self
.
attention_cfg
,
{
'in_channels'
:
in_channels_
}),
build_module
(
self
.
resblock_cfg
,
{
'in_channels'
:
in_channels_
}),
)
# construct the decoder part of Unet
in_channels_list
=
deepcopy
(
self
.
in_channels_list
)
self
.
out_blocks
=
nn
.
ModuleList
()
for
level
,
factor
in
enumerate
(
self
.
channel_factor_list
[::
-
1
]):
for
idx
in
range
(
resblocks_per_downsample
+
1
):
layers
=
[
build_module
(
self
.
resblock_cfg
,
{
'in_channels'
:
in_channels_
+
in_channels_list
.
pop
(),
'out_channels'
:
base_channels
*
factor
})
]
in_channels_
=
base_channels
*
factor
if
scale
in
attention_scale
:
layers
.
append
(
build_module
(
self
.
attention_cfg
,
{
'in_channels'
:
in_channels_
}))
if
(
level
!=
len
(
self
.
channel_factor_list
)
-
1
and
idx
==
resblocks_per_downsample
):
layers
.
append
(
build_module
(
self
.
upsample_cfg
,
{
'in_channels'
:
in_channels_
}))
scale
//=
2
self
.
out_blocks
.
append
(
EmbedSequential
(
*
layers
))
self
.
out
=
ConvModule
(
in_channels
=
in_channels_
,
out_channels
=
out_channels
,
kernel_size
=
3
,
padding
=
1
,
act_cfg
=
act_cfg
,
norm_cfg
=
norm_cfg
,
bias
=
True
,
order
=
(
'norm'
,
'act'
,
'conv'
))
self
.
init_weights
(
pretrained
)
    def forward(self, x_t, t, label=None, return_noise=False):
        """Forward function.

        Denoise ``x_t`` at timestep ``t``: embed the timestep (and optional
        label), run the U-Net encoder / bottleneck / decoder with skip
        connections, and pack the network output into a dict keyed by the
        configured mean/variance parameterization.

        Args:
            x_t (torch.Tensor): Diffused image at timestep `t` to denoise.
            t (torch.Tensor): Current timestep.
            label (torch.Tensor | callable | None): You can directly give a
                batch of label through a ``torch.Tensor`` or offer a callable
                function to sample a batch of label data. Otherwise, the
                ``None`` indicates to use the default label sampler.
            return_noise (bool, optional): If True, inputted ``x_t`` and ``t``
                will be returned in a dict with output desired by
                ``output_cfg``. Defaults to False.

        Returns:
            torch.Tensor | dict: If not ``return_noise``
        """
        if self.use_rescale_timesteps:
            # map timesteps onto the canonical [0, 1000) range used by the
            # pretrained schedules
            t = t.float() * (1000.0 / self.num_timesteps)
        embedding = self.time_embedding(t)

        if label is not None:
            # class-conditional models add a label embedding to the time
            # embedding before it is broadcast into the ResBlocks
            assert hasattr(self, 'label_embedding')
            embedding = self.label_embedding(label) + embedding

        h, hs = x_t, []
        # forward downsample blocks
        for block in self.in_blocks:
            h = block(h, embedding)
            # every encoder activation is stacked for the decoder skips
            hs.append(h)

        # forward middle blocks
        h = self.mid_blocks(h, embedding)

        # forward upsample blocks; hs.pop() pairs each decoder block with the
        # matching encoder activation (LIFO order mirrors the encoder)
        for block in self.out_blocks:
            h = block(torch.cat([h, hs.pop()], dim=1), embedding)
        outputs = self.out(h)

        output_dict = dict()
        if 'FIXED' not in self.var_mode.upper():
            # split mean and learned from output
            mean, var = outputs.split(self.out_channels // 2, dim=1)
            if self.var_mode.upper() == 'LEARNED_RANGE':
                # rescale [-1, 1] to [0, 1]
                output_dict['factor'] = (var + 1) / 2
            elif self.var_mode.upper() == 'LEARNED':
                output_dict['logvar'] = var
            else:
                raise AttributeError(
                    'Only support \'FIXED\', \'LEARNED_RANGE\' '
                    'and \'LEARNED\' for variance output format. But receive '
                    f'\'{self.var_mode}\'.')
        else:
            # fixed variance: the whole network output is the mean prediction
            mean = outputs

        if self.mean_mode.upper() == 'EPS':
            output_dict['eps_t_pred'] = mean
        elif self.mean_mode.upper() == 'START_X':
            output_dict['x_0_pred'] = mean
        elif self.mean_mode.upper() == 'PREVIOUS_X':
            output_dict['x_tm1_pred'] = mean
        else:
            raise AttributeError(
                'Only support \'EPS\', \'START_X\' and \'PREVIOUS_X\' for '
                f'mean output format. But receive \'{self.mean_mode}\'.')

        if return_noise:
            # echo inputs back so callers can compute losses without
            # re-deriving the (possibly rescaled) timestep
            output_dict['x_t'] = x_t
            output_dict['t_rescaled'] = t
            if self.num_classes > 0:
                output_dict['label'] = label

        return output_dict
def
init_weights
(
self
,
pretrained
=
None
):
"""Init weights for models.
We just use the initialization method proposed in the original paper.
Args:
pretrained (str, optional): Path for pretrained weights. If given
None, pretrained weights will not be loaded. Defaults to None.
"""
if
isinstance
(
pretrained
,
str
):
logger
=
get_root_logger
()
load_checkpoint
(
self
,
pretrained
,
strict
=
False
,
logger
=
logger
)
elif
pretrained
is
None
:
# As Improved-DDPM, we apply zero-initialization to
# second conv block in ResBlock (keywords: conv_2)
# the output layer of the Unet (keywords: 'out' but
# not 'out_blocks')
# projection layer in Attention layer (keywords: proj)
for
n
,
m
in
self
.
named_modules
():
if
isinstance
(
m
,
nn
.
Conv2d
)
and
(
'conv_2'
in
n
or
(
'out'
in
n
and
'out_blocks'
not
in
n
)):
constant_init
(
m
,
0
)
if
isinstance
(
m
,
nn
.
Conv1d
)
and
'proj'
in
n
:
constant_init
(
m
,
0
)
else
:
raise
TypeError
(
'pretrained must be a str or None but'
f
' got
{
type
(
pretrained
)
}
instead.'
)
mmgen/models/architectures/ddpm/modules.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
from
copy
import
deepcopy
from
functools
import
partial
import
mmcv
import
numpy
as
np
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmcv.cnn
import
ACTIVATION_LAYERS
from
mmcv.cnn.bricks
import
build_activation_layer
,
build_norm_layer
from
mmcv.cnn.utils
import
constant_init
from
mmcv.utils
import
digit_version
from
mmgen.models.builder
import
MODULES
,
build_module
class EmbedSequential(nn.Sequential):
    """A sequential module that passes timestep embeddings to the children that
    support it as an extra input.

    Modified from
    https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/unet.py#L35
    """

    def forward(self, x, y):
        # Feed the embedding `y` only to ResBlocks; every other child is a
        # plain single-input module.
        out = x
        for module in self:
            if isinstance(module, DenoisingResBlock):
                out = module(out, y)
            else:
                out = module(out)
        return out
if 'SiLU' not in ACTIVATION_LAYERS:

    @ACTIVATION_LAYERS.register_module()
    class SiLU(nn.Module):
        r"""Applies the Sigmoid Linear Unit (SiLU) function, element-wise.
        The SiLU function is also known as the swish function.

        Args:
            inplace (bool, optional): Use inplace operation or not.
                Defaults to `False`.
        """

        def __init__(self, inplace=False):
            super().__init__()
            if digit_version(torch.__version__) < digit_version(
                    '1.7.0') and inplace:
                # Fix: the original message interpolated ``torch.version``
                # (the module object) instead of the version string.
                mmcv.print_log('Inplace version of \'SiLU\' is not supported '
                               'for torch < 1.7.0, found '
                               f'\'{torch.__version__}\'.')
            self.inplace = inplace

        def forward(self, x):
            """Forward function for SiLU.

            Args:
                x (torch.Tensor): Input tensor.

            Returns:
                torch.Tensor: Tensor after activation.
            """
            # F.silu only exists from torch 1.7.0; fall back to the manual
            # formulation on older versions.
            if digit_version(torch.__version__) < digit_version('1.7.0'):
                return x * torch.sigmoid(x)

            return F.silu(x, inplace=self.inplace)
@MODULES.register_module()
class MultiHeadAttention(nn.Module):
    """An attention block allows spatial position to attend to each other.

    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. # noqa

    Args:
        in_channels (int): Channels of the input feature map.
        num_heads (int, optional): Number of heads in the attention.
        norm_cfg (dict, optional): Config for normalization layer. Default
            to ``dict(type='GN', num_groups=32)``
    """

    def __init__(self,
                 in_channels,
                 num_heads=1,
                 norm_cfg=dict(type='GN', num_groups=32)):
        super().__init__()
        self.num_heads = num_heads
        # build_norm_layer returns (name, layer); only the layer is kept
        _, self.norm = build_norm_layer(norm_cfg, in_channels)
        # a single 1x1 conv emits q, k and v stacked along the channel axis
        self.qkv = nn.Conv1d(in_channels, in_channels * 3, 1)
        self.proj = nn.Conv1d(in_channels, in_channels, 1)
        self.init_weights()

    @staticmethod
    def QKVAttention(qkv):
        # qkv: [batch * num_heads, 3 * head_channels, tokens]
        channel = qkv.shape[1] // 3
        q, k, v = torch.chunk(qkv, 3, dim=1)
        # scale q and k by 1/c**0.25 each, so the q·k product carries the
        # standard 1/sqrt(c) attention scaling
        scale = 1 / np.sqrt(np.sqrt(channel))
        weight = torch.einsum('bct,bcs->bts', q * scale, k * scale)
        # softmax in float32 for numerical stability, then cast back
        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
        weight = torch.einsum('bts,bcs->bct', weight, v)
        return weight

    def forward(self, x):
        """Forward function for multi head attention.

        Args:
            x (torch.Tensor): Input feature map.

        Returns:
            torch.Tensor: Feature map after attention.
        """
        b, c, *spatial = x.shape
        # flatten all spatial dimensions into a single token axis
        x = x.reshape(b, c, -1)
        qkv = self.qkv(self.norm(x))
        # fold heads into the batch dimension before attention
        qkv = qkv.reshape(b * self.num_heads, -1, qkv.shape[2])
        h = self.QKVAttention(qkv)
        h = h.reshape(b, -1, h.shape[-1])
        h = self.proj(h)
        # residual connection, restored to the original spatial shape
        return (h + x).reshape(b, c, *spatial)

    def init_weights(self):
        # zero-init the output projection so the block starts as an identity
        # residual branch (Improved-DDPM initialization)
        constant_init(self.proj, 0)
@MODULES.register_module()
class TimeEmbedding(nn.Module):
    """Time embedding layer, reference to Two level embedding. First embedding
    time by an embedding function, then feed to neural networks.

    Args:
        in_channels (int): The channel number of the input feature map.
        embedding_channels (int): The channel number of the output embedding.
        embedding_mode (str, optional): Embedding mode for the time embedding.
            Defaults to 'sin'.
        embedding_cfg (dict, optional): Config for time embedding.
            Defaults to None.
        act_cfg (dict, optional): Config for activation layer. Defaults to
            ``dict(type='SiLU', inplace=False)``.
    """

    def __init__(self,
                 in_channels,
                 embedding_channels,
                 embedding_mode='sin',
                 embedding_cfg=None,
                 act_cfg=dict(type='SiLU', inplace=False)):
        super().__init__()
        # two-layer MLP applied on top of the (fixed) sinusoidal embedding
        self.blocks = nn.Sequential(
            nn.Linear(in_channels, embedding_channels),
            build_activation_layer(act_cfg),
            nn.Linear(embedding_channels, embedding_channels))

        # add `dim` to embedding config
        embedding_cfg_ = dict(dim=in_channels)
        if embedding_cfg is not None:
            embedding_cfg_.update(embedding_cfg)
        if embedding_mode.upper() == 'SIN':
            self.embedding_fn = partial(self.sinusodial_embedding,
                                        **embedding_cfg_)
        else:
            raise ValueError('Only support `SIN` for time embedding, '
                             f'but receive {embedding_mode}.')

    @staticmethod
    def sinusodial_embedding(timesteps, dim, max_period=10000):
        """Create sinusoidal timestep embeddings.

        Args:
            timesteps (torch.Tensor): Timestep to embedding. 1-D tensor shape
                as ``[bz, ]``,  one per batch element.
            dim (int): The dimension of the embedding.
            max_period (int, optional): Controls the minimum frequency of the
                embeddings. Defaults to ``10000``.

        Returns:
            torch.Tensor: Embedding results shape as `[bz, dim]`.
        """
        half = dim // 2
        # geometric frequency ladder from 1 down to ~1/max_period
        freqs = torch.exp(
            -np.log(max_period) *
            torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
                device=timesteps.device)
        args = timesteps[:, None].float() * freqs[None]
        # NOTE: cos comes before sin here; keep this order — checkpoints
        # trained with this layout depend on it
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            # pad one zero column so the output width is exactly `dim`
            embedding = torch.cat(
                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding

    def forward(self, t):
        """Forward function for time embedding layer.

        Args:
            t (torch.Tensor): Input timesteps.

        Returns:
            torch.Tensor: Timesteps embedding.
        """
        return self.blocks(self.embedding_fn(t))
@MODULES.register_module()
class DenoisingResBlock(nn.Module):
    """Resblock for the denoising network. If `in_channels` not equals to
    `out_channels`, a learnable shortcut with conv layers will be added.

    Args:
        in_channels (int): Number of channels of the input feature map.
        embedding_channels (int): Number of channels of the input embedding.
        use_scale_shift_norm (bool): Whether use scale-shift-norm in
            `NormWithEmbedding` layer.
        dropout (float): Probability of the dropout layers.
        out_channels (int, optional): Number of output channels of the
            ResBlock. If not defined, the output channels will equal to the
            `in_channels`. Defaults to `None`.
        norm_cfg (dict, optional): The config for the normalization layers.
            Defaults too ``dict(type='GN', num_groups=32)``.
        act_cfg (dict, optional): The config for the activation layers.
            Defaults to ``dict(type='SiLU', inplace=False)``.
        shortcut_kernel_size (int, optional): The kernel size for the shortcut
            conv. Defaults to ``1``.
    """

    def __init__(self,
                 in_channels,
                 embedding_channels,
                 use_scale_shift_norm,
                 dropout,
                 out_channels=None,
                 norm_cfg=dict(type='GN', num_groups=32),
                 act_cfg=dict(type='SiLU', inplace=False),
                 shortcut_kernel_size=1):
        super().__init__()
        out_channels = in_channels if out_channels is None else out_channels

        # copy the norm config: it is reused below for NormWithEmbedding
        _norm_cfg = deepcopy(norm_cfg)

        # first conv path: norm -> act -> 3x3 conv (changes channel count)
        _, norm_1 = build_norm_layer(_norm_cfg, in_channels)
        conv_1 = [
            norm_1,
            build_activation_layer(act_cfg),
            nn.Conv2d(in_channels, out_channels, 3, padding=1)
        ]
        self.conv_1 = nn.Sequential(*conv_1)

        # normalization that injects the time/label embedding between the
        # two conv paths
        norm_with_embedding_cfg = dict(
            in_channels=out_channels,
            embedding_channels=embedding_channels,
            use_scale_shift=use_scale_shift_norm,
            norm_cfg=_norm_cfg)
        self.norm_with_embedding = build_module(
            dict(type='NormWithEmbedding'),
            default_args=norm_with_embedding_cfg)

        # second conv path: act -> dropout -> 3x3 conv (zero-initialized)
        conv_2 = [
            build_activation_layer(act_cfg),
            nn.Dropout(dropout),
            nn.Conv2d(out_channels, out_channels, 3, padding=1)
        ]
        self.conv_2 = nn.Sequential(*conv_2)

        assert shortcut_kernel_size in [1, 3], (
            'Only support `1` and `3` for `shortcut_kernel_size`, but '
            f'receive {shortcut_kernel_size}.')

        # a conv shortcut is only needed when the channel count changes
        self.learnable_shortcut = out_channels != in_channels

        if self.learnable_shortcut:
            shortcut_padding = 1 if shortcut_kernel_size == 3 else 0
            self.shortcut = nn.Conv2d(
                in_channels,
                out_channels,
                shortcut_kernel_size,
                padding=shortcut_padding)
        self.init_weights()

    def forward_shortcut(self, x):
        # identity when channel counts match, learned 1x1/3x3 conv otherwise
        if self.learnable_shortcut:
            return self.shortcut(x)
        return x

    def forward(self, x, y):
        """Forward function.

        Args:
            x (torch.Tensor): Input feature map tensor.
            y (torch.Tensor): Shared time embedding or shared label embedding.

        Returns:
            torch.Tensor : Output feature map tensor.
        """
        shortcut = self.forward_shortcut(x)
        x = self.conv_1(x)
        x = self.norm_with_embedding(x, y)
        x = self.conv_2(x)
        return x + shortcut

    def init_weights(self):
        # apply zero init to last conv layer
        constant_init(self.conv_2[-1], 0)
@MODULES.register_module()
class NormWithEmbedding(nn.Module):
    """Nornalization with embedding layer. If `use_scale_shift == True`,
    embedding results will be chunked and used to re-shift and re-scale
    normalization results. Otherwise, embedding results will directly add to
    input of normalization layer.

    Args:
        in_channels (int): Number of channels of the input feature map.
        embedding_channels (int) Number of channels of the input embedding.
        norm_cfg (dict, optional): Config for the normalization operation.
            Defaults to `dict(type='GN', num_groups=32)`.
        act_cfg (dict, optional): Config for the activation layer. Defaults
            to `dict(type='SiLU', inplace=False)`.
        use_scale_shift (bool): If True, the output of Embedding layer will be
            split to 'scale' and 'shift' and map the output of normalization
            layer to ``out * (1 + scale) + shift``. Otherwise, the output of
            Embedding layer will be added with the input before normalization
            operation. Defaults to True.
    """

    def __init__(self,
                 in_channels,
                 embedding_channels,
                 norm_cfg=dict(type='GN', num_groups=32),
                 act_cfg=dict(type='SiLU', inplace=False),
                 use_scale_shift=True):
        super().__init__()
        self.use_scale_shift = use_scale_shift
        # build_norm_layer returns (name, layer); keep only the layer
        self.norm = build_norm_layer(norm_cfg, in_channels)[1]

        # scale-shift mode needs twice the channels (scale + shift halves)
        embedding_output = in_channels * 2 if use_scale_shift else in_channels
        self.embedding_layer = nn.Sequential(
            build_activation_layer(act_cfg),
            nn.Linear(embedding_channels, embedding_output))

    def forward(self, x, y):
        """Forward function.

        Args:
            x (torch.Tensor): Input feature map tensor.
            y (torch.Tensor): Shared time embedding or shared label embedding.

        Returns:
            torch.Tensor : Output feature map tensor.
        """
        # project the embedding and broadcast it over the spatial dims
        emb = self.embedding_layer(y)[:, :, None, None]
        if not self.use_scale_shift:
            return self.norm(x + emb)
        scale, shift = emb.chunk(2, dim=1)
        return self.norm(x) * (1 + scale) + shift
@
MODULES
.
register_module
()
class DenoisingDownsample(nn.Module):
    """Downsampling operation used in the denoising network. Support average
    pooling and convolution for downsample operation.

    Args:
        in_channels (int): Number of channels of the input feature map to be
            downsampled.
        with_conv (bool, optional): Whether use convolution operation for
            downsampling. Defaults to `True`.
    """

    def __init__(self, in_channels, with_conv=True):
        super().__init__()
        if with_conv:
            # strided 3x3 conv halves the spatial resolution
            self.downsample = nn.Conv2d(in_channels, in_channels, 3, 2, 1)
        else:
            # Fix: ``nn.AvgPool2d`` requires ``kernel_size``; the original
            # ``nn.AvgPool2d(stride=2)`` raised a TypeError at construction.
            self.downsample = nn.AvgPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Forward function for downsampling operation.

        Args:
            x (torch.Tensor): Feature map to downsample.

        Returns:
            torch.Tensor: Feature map after downsampling.
        """
        return self.downsample(x)
@
MODULES
.
register_module
()
class DenoisingUpsample(nn.Module):
    """Upsampling operation used in the denoising network. Allows users to
    apply an additional convolution layer after the nearest interpolation
    operation.

    Args:
        in_channels (int): Number of channels of the input feature map to be
            downsampled.
        with_conv (bool, optional): Whether apply an additional convolution
            layer after upsampling. Defaults to `True`.
    """

    def __init__(self, in_channels, with_conv=True):
        super().__init__()
        # Fix: record the flag unconditionally. The original only assigned
        # ``self.with_conv`` inside the ``if`` branch, so ``forward`` raised
        # AttributeError whenever ``with_conv=False``.
        self.with_conv = with_conv
        if with_conv:
            self.conv = nn.Conv2d(in_channels, in_channels, 3, 1, 1)

    def forward(self, x):
        """Forward function for upsampling operation.

        Args:
            x (torch.Tensor): Feature map to upsample.

        Returns:
            torch.Tensor: Feature map after upsampling.
        """
        # nearest-neighbor 2x upsample, optionally smoothed by a 3x3 conv
        x = F.interpolate(x, scale_factor=2, mode='nearest')
        if self.with_conv:
            x = self.conv(x)
        return x
mmgen/models/architectures/fid_inception.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
"""Inception networks used in calculating FID and Inception metrics.
This code is modified from:
https://github.com/rosinality/stylegan2-pytorch/blob/master/inception.py
"""
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
torch.utils.model_zoo
import
load_url
from
torchvision
import
models
# Inception weights ported to PyTorch from
# https://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
FID_WEIGHTS_URL
=
'https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth'
# noqa: E501
class InceptionV3(nn.Module):
    """Pretrained InceptionV3 network returning feature maps."""

    # Index of default block of inception to return,
    # corresponds to output of final average pooling
    DEFAULT_BLOCK_INDEX = 3

    # Maps feature dimensionality to their output blocks indices
    BLOCK_INDEX_BY_DIM = {
        64: 0,  # First max pooling features
        192: 1,  # Second max pooling features
        768: 2,  # Pre-aux classifier features
        2048: 3  # Final average pooling features
    }

    # NOTE(review): `output_blocks` uses a mutable (list) default argument;
    # it is never mutated here, but replacing it with a tuple would be safer.
    def __init__(self,
                 output_blocks=[DEFAULT_BLOCK_INDEX],
                 resize_input=True,
                 normalize_input=True,
                 requires_grad=False,
                 use_fid_inception=True,
                 load_fid_inception=True):
        """Build pretrained InceptionV3.

        Args:
            output_blocks (list[int]): Indices of blocks to return features of.
                Possible values are:

                - 0: corresponds to output of first max pooling
                - 1: corresponds to output of second max pooling
                - 2: corresponds to output which is fed to aux classifier
                - 3: corresponds to output of final average pooling

            resize_input (bool): If true, bilinearly resizes input to width and
                height 299 before feeding input to model. As the network
                without fully connected layers is fully convolutional, it
                should be able to handle inputs of arbitrary size, so resizing
                might not be strictly needed.
            normalize_input (bool): If true, scales the input from range (0, 1)
                to the range the pretrained Inception network expects, namely
                (-1, 1).
            requires_grad (bool): If true, parameters of the model require
                gradients. Possibly useful for finetuning the network.
            use_fid_inception (bool): If true, uses the pretrained Inception
                model used in Tensorflow's FID implementation. If false, uses
                the pretrained Inception model available in torchvision. The
                FID Inception model has different weights and a slightly
                different structure from torchvision's Inception model. If you
                want to compute FID scores, you are strongly advised to set
                this parameter to true to get comparable results.
        """
        super().__init__()

        self.resize_input = resize_input
        self.normalize_input = normalize_input
        self.output_blocks = sorted(output_blocks)
        # only blocks up to the deepest requested one are constructed
        self.last_needed_block = max(output_blocks)

        assert self.last_needed_block <= 3, \
            'Last possible output block index is 3'

        self.blocks = nn.ModuleList()

        if use_fid_inception:
            inception = fid_inception_v3(load_fid_inception)
        else:
            inception = models.inception_v3(pretrained=True)

        # Block 0: input to maxpool1
        block0 = [
            inception.Conv2d_1a_3x3,
            inception.Conv2d_2a_3x3,
            inception.Conv2d_2b_3x3,
            nn.MaxPool2d(kernel_size=3, stride=2)
        ]
        self.blocks.append(nn.Sequential(*block0))

        # Block 1: maxpool1 to maxpool2
        if self.last_needed_block >= 1:
            block1 = [
                inception.Conv2d_3b_1x1,
                inception.Conv2d_4a_3x3,
                nn.MaxPool2d(kernel_size=3, stride=2)
            ]
            self.blocks.append(nn.Sequential(*block1))

        # Block 2: maxpool2 to aux classifier
        if self.last_needed_block >= 2:
            block2 = [
                inception.Mixed_5b,
                inception.Mixed_5c,
                inception.Mixed_5d,
                inception.Mixed_6a,
                inception.Mixed_6b,
                inception.Mixed_6c,
                inception.Mixed_6d,
                inception.Mixed_6e,
            ]
            self.blocks.append(nn.Sequential(*block2))

        # Block 3: aux classifier to final avgpool
        if self.last_needed_block >= 3:
            block3 = [
                inception.Mixed_7a,
                inception.Mixed_7b,
                inception.Mixed_7c,
                nn.AdaptiveAvgPool2d(output_size=(1, 1))
            ]
            self.blocks.append(nn.Sequential(*block3))

        # freeze (or unfreeze) everything in one pass
        for param in self.parameters():
            param.requires_grad = requires_grad

    def forward(self, inp):
        """Get Inception feature maps.

        Args:
            inp (torch.Tensor): Input tensor of shape Bx3xHxW.
                Values are expected to be in range (0, 1)

        Returns:
            list(torch.Tensor): Corresponding to the selected output \
                block, sorted ascending by index.
        """
        outp = []
        x = inp

        if self.resize_input:
            x = F.interpolate(
                x, size=(299, 299), mode='bilinear', align_corners=False)

        if self.normalize_input:
            x = 2 * x - 1  # Scale from range (0, 1) to range (-1, 1)

        for idx, block in enumerate(self.blocks):
            x = block(x)
            if idx in self.output_blocks:
                outp.append(x)

            # skip blocks deeper than the last requested feature
            if idx == self.last_needed_block:
                break

        return outp
def fid_inception_v3(load_ckpt=True):
    """Build pretrained Inception model for FID computation.

    The Inception model for FID computation uses a different set of weights
    and has a slightly different structure than torchvision's Inception.

    This method first constructs torchvision's Inception and then patches the
    necessary parts that are different in the FID Inception model.
    """
    model = models.inception_v3(
        num_classes=1008, aux_logits=False, pretrained=False)

    # swap in the FID-specific variants of the mixed blocks
    patched_blocks = {
        'Mixed_5b': FIDInceptionA(192, pool_features=32),
        'Mixed_5c': FIDInceptionA(256, pool_features=64),
        'Mixed_5d': FIDInceptionA(288, pool_features=64),
        'Mixed_6b': FIDInceptionC(768, channels_7x7=128),
        'Mixed_6c': FIDInceptionC(768, channels_7x7=160),
        'Mixed_6d': FIDInceptionC(768, channels_7x7=160),
        'Mixed_6e': FIDInceptionC(768, channels_7x7=192),
        'Mixed_7b': FIDInceptionE_1(1280),
        'Mixed_7c': FIDInceptionE_2(2048),
    }
    for attr_name, block in patched_blocks.items():
        setattr(model, attr_name, block)

    if load_ckpt:
        # TensorFlow-ported weights, downloaded on first use
        model.load_state_dict(load_url(FID_WEIGHTS_URL, progress=True))
    return model
class FIDInceptionA(models.inception.InceptionA):
    """InceptionA block patched for FID computation."""

    def __init__(self, in_channels, pool_features):
        super().__init__(in_channels, pool_features)

    def forward(self, x):
        """Get InceptionA feature maps.

        Args:
            x (torch.Tensor): Input tensor of shape BxCxHxW.

        Returns:
            torch.Tensor: Feature Maps of x outputted by this block.
        """
        out_1x1 = self.branch1x1(x)
        out_5x5 = self.branch5x5_2(self.branch5x5_1(x))
        out_3x3dbl = self.branch3x3dbl_3(
            self.branch3x3dbl_2(self.branch3x3dbl_1(x)))

        # Patch: Tensorflow's average pool does not use the padded zero's in
        # its average calculation
        out_pool = F.avg_pool2d(
            x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
        out_pool = self.branch_pool(out_pool)

        return torch.cat([out_1x1, out_5x5, out_3x3dbl, out_pool], 1)
class FIDInceptionC(models.inception.InceptionC):
    """InceptionC block patched for FID computation."""

    def __init__(self, in_channels, channels_7x7):
        super().__init__(in_channels, channels_7x7)

    def forward(self, x):
        """Get InceptionC feature maps.

        Args:
            x (torch.Tensor): Input tensor of shape BxCxHxW.

        Returns:
            torch.Tensor: Feature Maps of x outputted by this block.
        """
        out_1x1 = self.branch1x1(x)

        out_7x7 = self.branch7x7_3(self.branch7x7_2(self.branch7x7_1(x)))

        out_7x7dbl = self.branch7x7dbl_1(x)
        for stage in (self.branch7x7dbl_2, self.branch7x7dbl_3,
                      self.branch7x7dbl_4, self.branch7x7dbl_5):
            out_7x7dbl = stage(out_7x7dbl)

        # Patch: Tensorflow's average pool does not use the padded zero's in
        # its average calculation
        out_pool = F.avg_pool2d(
            x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
        out_pool = self.branch_pool(out_pool)

        return torch.cat([out_1x1, out_7x7, out_7x7dbl, out_pool], 1)
class FIDInceptionE_1(models.inception.InceptionE):
    """First InceptionE block patched for FID computation."""

    def __init__(self, in_channels):
        super().__init__(in_channels)

    def forward(self, x):
        """Get first InceptionE feature maps.

        Args:
            x (torch.Tensor): Input tensor of shape BxCxHxW.

        Returns:
            torch.Tensor: Feature Maps of x outputted by this block.
        """
        out_1x1 = self.branch1x1(x)

        stem_3x3 = self.branch3x3_1(x)
        out_3x3 = torch.cat(
            [self.branch3x3_2a(stem_3x3),
             self.branch3x3_2b(stem_3x3)], 1)

        stem_3x3dbl = self.branch3x3dbl_2(self.branch3x3dbl_1(x))
        out_3x3dbl = torch.cat([
            self.branch3x3dbl_3a(stem_3x3dbl),
            self.branch3x3dbl_3b(stem_3x3dbl)
        ], 1)

        # Patch: Tensorflow's average pool does not use the padded zero's in
        # its average calculation
        out_pool = F.avg_pool2d(
            x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
        out_pool = self.branch_pool(out_pool)

        return torch.cat([out_1x1, out_3x3, out_3x3dbl, out_pool], 1)
class FIDInceptionE_2(models.inception.InceptionE):
    """Second InceptionE block patched for FID computation."""

    def __init__(self, in_channels):
        super().__init__(in_channels)

    def forward(self, x):
        """Get second InceptionE feature maps.

        Args:
            x (torch.Tensor): Input tensor of shape BxCxHxW.

        Returns:
            torch.Tensor: Feature Maps of x outputted by this block.
        """
        out_1x1 = self.branch1x1(x)

        stem_3x3 = self.branch3x3_1(x)
        out_3x3 = torch.cat(
            [self.branch3x3_2a(stem_3x3),
             self.branch3x3_2b(stem_3x3)], 1)

        stem_3x3dbl = self.branch3x3dbl_2(self.branch3x3dbl_1(x))
        out_3x3dbl = torch.cat([
            self.branch3x3dbl_3a(stem_3x3dbl),
            self.branch3x3dbl_3b(stem_3x3dbl)
        ], 1)

        # Patch: The FID Inception model uses max pooling instead of average
        # pooling. This is likely an error in this specific Inception
        # implementation, as other Inception models use average pooling here
        # (which matches the description in the paper).
        out_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
        out_pool = self.branch_pool(out_pool)

        return torch.cat([out_1x1, out_3x3, out_3x3dbl, out_pool], 1)
mmgen/models/architectures/lpips/__init__.py
0 → 100755
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
r
"""
The lpips module was adapted from https://github.com/rosinality/stylegan2-pytorch/tree/master/lpips , # noqa
and you can see the origin implementation in https://github.com/richzhang/PerceptualSimilarity/tree/master/lpips # noqa
"""
from
.perceptual_loss
import
PerceptualLoss
__all__
=
[
'PerceptualLoss'
]
mmgen/models/architectures/lpips/networks_basic.py
0 → 100755
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
import
torch
import
torch.nn
as
nn
from
.pretrained_networks
import
vgg16
def normalize_tensor(in_feat, eps=1e-10):
    """L2 normalization.

    Args:
        in_feat (Tensor): Tensor with shape [N, C, H, W].
        eps (float, optional): Epsilon value to avoid computation error.
            Defaults to 1e-10.

    Returns:
        Tensor: Tensor after L2 normalization per-instance.
    """
    # per-pixel L2 norm over the channel axis
    l2_norm = in_feat.pow(2).sum(dim=1, keepdim=True).sqrt()
    return in_feat / (l2_norm + eps)
def spatial_average(in_tens, keepdim=True):
    """Returns the mean value of each row of the input tensor in the spatial
    dimension.

    Args:
        in_tens (Tensor): Tensor with shape [N, C, H, W].
        keepdim (bool, optional): If keepdim is True, the output tensor is of
            the shape [N, C, 1, 1]. Otherwise, the output will have shape
            [N, C]. Defaults to True.

    Returns:
        Tensor: Tensor after average pooling to 1x1 with shape [N, C, 1, 1] or
            [N, C].
    """
    # average over both spatial axes in one call
    return in_tens.mean(dim=[2, 3], keepdim=keepdim)
def upsample(in_tens, out_H=64):  # assumes scale factor is same for H and W
    """Upsamples the input to the given size.

    Args:
        in_tens (Tensor): Tensor with shape [N, C, H, W].
        out_H (int, optional): Output spatial size. Defaults to 64.

    Returns:
        Tensor: Output Tensor.
    """
    # derive the (possibly fractional) scale from the current height
    ratio = float(out_H) / in_tens.shape[2]
    resizer = nn.Upsample(
        scale_factor=ratio, mode='bilinear', align_corners=False)
    return resizer(in_tens)
# Learned perceptual metric
class PNetLin(nn.Module):
    r"""
    Ref: https://github.com/richzhang/PerceptualSimilarity/blob/master/lpips/lpips.py # noqa
    """

    def __init__(self,
                 pnet_rand=False,
                 pnet_tune=False,
                 use_dropout=True,
                 spatial=False,
                 version='0.1',
                 lpips=True):
        super().__init__()

        self.pnet_tune = pnet_tune
        self.pnet_rand = pnet_rand
        self.spatial = spatial
        self.lpips = lpips
        self.version = version
        self.scaling_layer = ScalingLayer()

        # channel widths of the five tapped VGG16 stages
        self.channels = [64, 128, 256, 512, 512]
        self.L = len(self.channels)

        self.net = vgg16(
            pretrained=not self.pnet_rand, requires_grad=self.pnet_tune)

        # one learned 1x1 conv per tapped stage (LPIPS linear heads)
        self.lin0 = NetLinLayer(self.channels[0], use_dropout=use_dropout)
        self.lin1 = NetLinLayer(self.channels[1], use_dropout=use_dropout)
        self.lin2 = NetLinLayer(self.channels[2], use_dropout=use_dropout)
        self.lin3 = NetLinLayer(self.channels[3], use_dropout=use_dropout)
        self.lin4 = NetLinLayer(self.channels[4], use_dropout=use_dropout)
        # convenience list only; the layers are already registered through
        # the self.linX attributes above
        self.lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]

    def forward(self, in0, in1, retPerLayer=False):
        """Compute the LPIPS distance between two image batches.

        Args:
            in0 (torch.Tensor): First input batch, shape [N, 3, H, W].
            in1 (torch.Tensor): Second input batch, same shape as ``in0``.
            retPerLayer (bool, optional): If True, also return the per-layer
                distance terms. Defaults to False.

        Returns:
            torch.Tensor | tuple: Summed distance, or ``(distance,
                per_layer_list)`` when ``retPerLayer`` is True.
        """
        # v0.0 - original release had a bug, where input was not scaled
        in0_input, in1_input = (self.scaling_layer(in0),
                                self.scaling_layer(in1)) if self.version == (
                                    '0.1') else (in0, in1)
        outs0, outs1 = self.net.forward(in0_input), self.net.forward(
            in1_input)
        feats0, feats1, diffs = {}, {}, {}

        # per-layer: unit-normalize features, then squared difference
        for kk in range(self.L):
            feats0[kk], feats1[kk] = normalize_tensor(
                outs0[kk]), normalize_tensor(outs1[kk])
            diffs[kk] = (feats0[kk] - feats1[kk])**2

        if self.lpips:
            # learned weighting via the linear heads
            if self.spatial:
                # keep a spatial map, resized to the input resolution
                res = [
                    upsample(self.lins[kk].model(diffs[kk]),
                             out_H=in0.shape[2]) for kk in range(self.L)
                ]
            else:
                res = [
                    spatial_average(
                        self.lins[kk].model(diffs[kk]), keepdim=True)
                    for kk in range(self.L)
                ]
        else:
            # unweighted variant: plain channel sum of squared diffs
            if self.spatial:
                res = [
                    upsample(diffs[kk].sum(dim=1, keepdim=True),
                             out_H=in0.shape[2]) for kk in range(self.L)
                ]
            else:
                res = [
                    spatial_average(
                        diffs[kk].sum(dim=1, keepdim=True), keepdim=True)
                    for kk in range(self.L)
                ]

        # total distance is the sum over the five stages
        val = sum(res)

        if retPerLayer:
            return (val, res)

        return val
class ScalingLayer(nn.Module):
    """Channel-wise shift/scale normalization used by LPIPS.

    The shift and scale constants are the ones shipped with the original
    LPIPS implementation; they are stored as (non-trainable) buffers so they
    follow the module across devices and dtypes.
    """

    def __init__(self):
        super().__init__()
        shift = torch.Tensor([-.030, -.088, -.188])[None, :, None, None]
        scale = torch.Tensor([.458, .448, .450])[None, :, None, None]
        self.register_buffer('shift', shift)
        self.register_buffer('scale', scale)

    def forward(self, inp):
        """Shift then rescale ``inp`` per channel."""
        return inp.sub(self.shift).div(self.scale)
class NetLinLayer(nn.Module):
    """A single linear layer which does a 1x1 conv."""

    def __init__(self, chn_in, chn_out=1, use_dropout=False):
        super().__init__()
        modules = []
        # dropout (when enabled) precedes the 1x1 projection
        if use_dropout:
            modules.append(nn.Dropout())
        modules.append(
            nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False))
        self.model = nn.Sequential(*modules)
class Dist2LogitLayer(nn.Module):
    """takes 2 distances, puts through fc layers, spits out value between [0,
    1] (if use_sigmoid is True)"""

    def __init__(self, chn_mid=32, use_sigmoid=True):
        super().__init__()
        # three 1x1 convs with LeakyReLU in between: 5 -> mid -> mid -> 1
        body = [
            nn.Conv2d(5, chn_mid, 1, stride=1, padding=0, bias=True),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(chn_mid, chn_mid, 1, stride=1, padding=0, bias=True),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(chn_mid, 1, 1, stride=1, padding=0, bias=True),
        ]
        if use_sigmoid:
            body.append(nn.Sigmoid())
        self.model = nn.Sequential(*body)

    def forward(self, d0, d1, eps=0.1):
        """Map two distance maps to a logit via their values, difference
        and (eps-stabilized) ratios stacked along the channel dim."""
        feats = torch.cat(
            (d0, d1, d0 - d1, d0 / (d1 + eps), d1 / (d0 + eps)), dim=1)
        return self.model(feats)
class BCERankingLoss(nn.Module):
    """Binary cross-entropy ranking loss on top of a learned
    two-distance logit network."""

    def __init__(self, chn_mid=32):
        super().__init__()
        self.net = Dist2LogitLayer(chn_mid=chn_mid)
        self.loss = torch.nn.BCELoss()

    def forward(self, d0, d1, judge):
        # map human judgements from [-1, 1] to probabilities in [0, 1]
        target = (judge + 1.) / 2.
        # logit is kept on the instance, matching the original behavior
        self.logit = self.net.forward(d0, d1)
        return self.loss(self.logit, target)
mmgen/models/architectures/lpips/perceptual_loss.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
import
torch
from
torch.utils.model_zoo
import
load_url
from
.networks_basic
import
PNetLin
LPIPS_WEIGHTS_URL
=
'https://download.openmmlab.com/mmgen/evaluation/lpips/weights/v0.1/vgg.pth'
# noqa
class PerceptualLoss(torch.nn.Module):
    r"""LPIPS metric with VGG using our perceptually-learned weights.

    Ref: https://github.com/rosinality/stylegan2-pytorch/blob/master/lpips/__init__.py # noqa

    Args:
        spatial (bool, optional): Whether to return a spatial distance map
            instead of a scalar per sample. Defaults to False.
        use_gpu (bool, optional): Whether to move the network to GPU and
            wrap it with ``DataParallel``. Defaults to True.
        gpu_ids (list[int] | None, optional): Device ids used when
            ``use_gpu`` is True. ``None`` is equivalent to ``[0]``.
            Defaults to None.
        pretrained (bool, optional): Whether to load perceptually-learned
            weights from ``LPIPS_WEIGHTS_URL``. Defaults to True.
    """

    def __init__(self, spatial=False, use_gpu=True, gpu_ids=None,
                 pretrained=True):
        super().__init__()
        print('Setting up Perceptual loss...')
        self.use_gpu = use_gpu
        self.spatial = spatial
        # fix: the original used a mutable default argument (gpu_ids=[0]),
        # which is shared across all instances; None keeps the old behavior.
        self.gpu_ids = [0] if gpu_ids is None else gpu_ids
        print('...[pnet-lin, vgg16] initializing')
        self.init_net(pretrained=pretrained)
        print('...Done')

    def forward(self, pred, target, normalize=False):
        """Compute the LPIPS distance between ``pred`` and ``target``.

        Args:
            pred (torch.Tensor): Predicted image batch.
            target (torch.Tensor): Reference image batch.
            normalize (bool, optional): If True, rescale inputs from [0, 1]
                to [-1, 1] before measuring. Defaults to False.

        Returns:
            torch.Tensor: The LPIPS distance.
        """
        if normalize:
            target = 2 * target - 1
            pred = 2 * pred - 1

        return self.net(target, pred)

    def init_net(self, pnet_rand=False, pnet_tune=False, pretrained=True,
                 version='0.1'):
        """Build the LPIPS network and optionally load pretrained weights.

        Args:
            pnet_rand (bool, optional): Use randomly initialized backbone
                weights. Defaults to False.
            pnet_tune (bool, optional): Fine-tune the backbone.
                Defaults to False.
            pretrained (bool, optional): Load the learned linear weights.
                Defaults to True.
            version (str, optional): LPIPS version string. Defaults to '0.1'.
        """
        self.net = PNetLin(
            pnet_rand=pnet_rand,
            pnet_tune=pnet_tune,
            use_dropout=True,
            spatial=self.spatial,
            version=version,
            lpips=True)
        if pretrained:
            print('Loading model from: %s' % LPIPS_WEIGHTS_URL)
            self.net.load_state_dict(
                load_url(LPIPS_WEIGHTS_URL, map_location='cpu', progress=True),
                strict=False)

        # NOTE(review): this shadows ``nn.Module.parameters`` on the
        # instance; kept unchanged for backward compatibility with callers.
        self.parameters = list(self.net.parameters())
        self.net.eval()

        if self.use_gpu:
            self.net.to(self.gpu_ids[0])
            # wrap with DataParallel so inputs are scattered over gpu_ids
            self.net = torch.nn.DataParallel(self.net, device_ids=self.gpu_ids)
mmgen/models/architectures/lpips/pretrained_networks.py
0 → 100755
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
from
collections
import
namedtuple
import
torch
from
torchvision
import
models
as
tv
class vgg16(torch.nn.Module):
    r"""VGG16 feature extractor for LPIPS metric.

    Ref : https://github.com/richzhang/PerceptualSimilarity/blob/master/lpips/pretrained_networks.py # noqa
    """

    def __init__(self, requires_grad=False, pretrained=True):
        super().__init__()
        features = tv.vgg16(pretrained=pretrained).features
        self.N_slices = 5
        # layer-index boundaries of the five relu stages in
        # torchvision's vgg16 ``features`` sequential
        bounds = (0, 4, 9, 16, 23, 30)
        for idx in range(self.N_slices):
            stage = torch.nn.Sequential()
            for layer_id in range(bounds[idx], bounds[idx + 1]):
                stage.add_module(str(layer_id), features[layer_id])
            setattr(self, f'slice{idx + 1}', stage)
        if not requires_grad:
            # freeze the backbone when used purely as a feature extractor
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        """Run the input through the five stages and return all
        intermediate relu activations as a named tuple."""
        stage_outputs = []
        h = X
        for idx in range(1, self.N_slices + 1):
            h = getattr(self, f'slice{idx}')(h)
            stage_outputs.append(h)
        vgg_outputs = namedtuple(
            'VggOutputs',
            ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3'])
        return vgg_outputs(*stage_outputs)
mmgen/models/architectures/lsgan/__init__.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
from
.generator_discriminator
import
LSGANDiscriminator
,
LSGANGenerator
__all__
=
[
'LSGANDiscriminator'
,
'LSGANGenerator'
]
mmgen/models/architectures/lsgan/generator_discriminator.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
import
numpy
as
np
import
torch
import
torch.nn
as
nn
from
mmcv.cnn
import
ConvModule
from
mmcv.cnn.bricks
import
build_activation_layer
from
mmgen.models.builder
import
MODULES
from
..common
import
get_module_device
@MODULES.register_module()
class LSGANGenerator(nn.Module):
    """Generator for LSGAN.

    Implementation Details for LSGAN architecture:

    #. Adopt transposed convolution in the generator;
    #. Use batchnorm in the generator except for the final output layer;
    #. Use ReLU in the generator in addition to the final output layer;
    #. Keep channels of feature maps unchanged in the convolution backbone;
    #. Use one more 3x3 conv every upsampling in the convolution backbone.

    We follow the implementation details of the origin paper:
    Least Squares Generative Adversarial Networks
    https://arxiv.org/pdf/1611.04076.pdf

    Args:
        output_scale (int, optional): Output scale for the generated image.
            Defaults to 128.
        out_channels (int, optional): The channel number of the output feature.
            Defaults to 3.
        base_channels (int, optional): The basic channel number of the
            generator. The other layers contains channels based on this number.
            Defaults to 256.
        input_scale (int, optional): The scale of the input 2D feature map.
            Defaults to 8.
        noise_size (int, optional): Size of the input noise
            vector. Defaults to 1024.
        conv_cfg (dict, optional): Config for the convolution module used in
            this generator. Defaults to dict(type='ConvTranspose2d').
        default_norm_cfg (dict, optional): Norm config for all of layers
            except for the final output layer. Defaults to dict(type='BN').
        default_act_cfg (dict, optional): Activation config for all of layers
            except for the final output layer. Defaults to dict(type='ReLU').
        out_act_cfg (dict, optional): Activation config for the final output
            layer. Defaults to dict(type='Tanh').
    """

    def __init__(self,
                 output_scale=128,
                 out_channels=3,
                 base_channels=256,
                 input_scale=8,
                 noise_size=1024,
                 conv_cfg=dict(type='ConvTranspose2d'),
                 default_norm_cfg=dict(type='BN'),
                 default_act_cfg=dict(type='ReLU'),
                 out_act_cfg=dict(type='Tanh')):
        super().__init__()
        # the backbone performs log2(output/input) - 2 plain upsamples plus
        # two fixed upsampling output blocks, so a ratio of >= 4 is required
        assert output_scale % input_scale == 0
        assert output_scale // input_scale >= 4
        self.output_scale = output_scale
        self.base_channels = base_channels
        self.input_scale = input_scale
        self.noise_size = noise_size

        # project the noise vector to a 2D feature map
        self.noise2feat_head = nn.Sequential(
            nn.Linear(noise_size, input_scale * input_scale * base_channels))
        self.noise2feat_tail = nn.Sequential(nn.BatchNorm2d(base_channels))
        if default_act_cfg is not None:
            self.noise2feat_tail.add_module(
                'act', build_activation_layer(default_act_cfg))

        # the number of times for upsampling
        self.num_upsamples = int(np.log2(output_scale // input_scale)) - 2

        # build up convolution backbone (excluding the output layer)
        self.conv_blocks = nn.ModuleList()
        for _ in range(self.num_upsamples):
            # upsampling transposed conv followed by a refining 3x3 conv
            self.conv_blocks.append(
                ConvModule(
                    base_channels,
                    base_channels,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    conv_cfg=dict(conv_cfg, output_padding=1),
                    norm_cfg=default_norm_cfg,
                    act_cfg=default_act_cfg))
            self.conv_blocks.append(
                ConvModule(
                    base_channels,
                    base_channels,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=default_norm_cfg,
                    act_cfg=default_act_cfg))

        # output blocks
        self.conv_blocks.append(
            ConvModule(
                base_channels,
                int(base_channels // 2),
                kernel_size=3,
                stride=2,
                padding=1,
                conv_cfg=dict(conv_cfg, output_padding=1),
                norm_cfg=default_norm_cfg,
                act_cfg=default_act_cfg))
        self.conv_blocks.append(
            ConvModule(
                int(base_channels // 2),
                int(base_channels // 4),
                kernel_size=3,
                stride=2,
                padding=1,
                conv_cfg=dict(conv_cfg, output_padding=1),
                norm_cfg=default_norm_cfg,
                act_cfg=default_act_cfg))
        # final layer: no norm, output activation (e.g. Tanh)
        self.conv_blocks.append(
            ConvModule(
                int(base_channels // 4),
                out_channels,
                kernel_size=3,
                stride=1,
                padding=1,
                conv_cfg=conv_cfg,
                norm_cfg=None,
                act_cfg=out_act_cfg))

    def forward(self, noise, num_batches=0, return_noise=False):
        """Forward function.

        Args:
            noise (torch.Tensor | callable | None): You can directly give a
                batch of noise through a ``torch.Tensor`` or offer a callable
                function to sample a batch of noise data. Otherwise, the
                ``None`` indicates to use the default noise sampler.
            num_batches (int, optional): The number of batch size.
                Defaults to 0.
            return_noise (bool, optional): If True, ``noise_batch`` will be
                returned in a dict with ``fake_img``. Defaults to False.

        Returns:
            torch.Tensor | dict: If not ``return_noise``, only the output image
                will be returned. Otherwise, a dict contains ``fake_img`` and
                ``noise_batch`` will be returned.
        """
        # receive noise and conduct sanity check.
        if isinstance(noise, torch.Tensor):
            assert noise.shape[1] == self.noise_size
            if noise.ndim == 2:
                noise_batch = noise
            else:
                # fix: the two string fragments previously concatenated
                # without a separator ("(n, c)but got ...")
                raise ValueError('The noise should be in shape of (n, c), '
                                 f'but got {noise.shape}')
        # receive a noise generator and sample noise.
        elif callable(noise):
            noise_generator = noise
            assert num_batches > 0
            noise_batch = noise_generator((num_batches, self.noise_size))
        # otherwise, we will adopt default noise sampler.
        else:
            assert num_batches > 0
            noise_batch = torch.randn((num_batches, self.noise_size))

        # dirty code for putting data on the right device
        noise_batch = noise_batch.to(get_module_device(self))

        # noise2feat
        x = self.noise2feat_head(noise_batch)
        x = x.reshape((-1, self.base_channels, self.input_scale,
                       self.input_scale))
        x = self.noise2feat_tail(x)

        # conv module
        for conv in self.conv_blocks:
            x = conv(x)

        if return_noise:
            return dict(fake_img=x, noise_batch=noise_batch)

        return x
@MODULES.register_module()
class LSGANDiscriminator(nn.Module):
    """Discriminator for LSGAN.

    Implementation Details for LSGAN architecture:

    #. Adopt convolution in the discriminator;
    #. Use batchnorm in the discriminator except for the input and final \
       output layer;
    #. Use LeakyReLU in the discriminator in addition to the output layer;
    #. Use fully connected layer in the output layer;
    #. Use 5x5 conv rather than 4x4 conv in DCGAN.

    Args:
        input_scale (int, optional): The scale of the input image. Defaults to
            128.
        output_scale (int, optional): The final scale of the convolutional
            feature. Defaults to 8.
        out_channels (int, optional): The channel number of the final output
            layer. Defaults to 1.
        in_channels (int, optional): The channel number of the input image.
            Defaults to 3.
        base_channels (int, optional): The basic channel number of the
            discriminator. The other layers contains channels based on this
            number. Defaults to 64.
        conv_cfg (dict, optional): Config for the convolution module used in
            this discriminator. Defaults to dict(type='Conv2d').
        default_norm_cfg (dict, optional): Norm config for all of layers
            except for the final output layer. Defaults to ``dict(type='BN')``.
        default_act_cfg (dict, optional): Activation config for all of layers
            except for the final output layer. Defaults to
            ``dict(type='LeakyReLU', negative_slope=0.2)``.
        out_act_cfg (dict, optional): Activation config for the final output
            layer. Defaults to ``None``.
    """

    def __init__(self,
                 input_scale=128,
                 output_scale=8,
                 out_channels=1,
                 in_channels=3,
                 base_channels=64,
                 conv_cfg=dict(type='Conv2d'),
                 default_norm_cfg=dict(type='BN'),
                 default_act_cfg=dict(type='LeakyReLU', negative_slope=0.2),
                 out_act_cfg=None):
        super().__init__()
        assert input_scale % output_scale == 0
        assert input_scale // output_scale >= 2
        self.input_scale = input_scale
        self.output_scale = output_scale
        self.out_channels = out_channels
        self.base_channels = base_channels
        self.with_out_activation = out_act_cfg is not None

        self.conv_blocks = nn.ModuleList()
        # input layer: no norm so the raw image statistics stay untouched
        self.conv_blocks.append(
            ConvModule(
                in_channels,
                base_channels,
                kernel_size=5,
                stride=2,
                padding=2,
                conv_cfg=conv_cfg,
                norm_cfg=None,
                act_cfg=default_act_cfg))

        # the number of times for downsampling
        self.num_downsamples = int(np.log2(input_scale // output_scale)) - 1

        # build up downsampling backbone (excluding the output layer);
        # each stage halves the resolution and doubles the channels
        curr_channels = base_channels
        for _ in range(self.num_downsamples):
            self.conv_blocks.append(
                ConvModule(
                    curr_channels,
                    curr_channels * 2,
                    kernel_size=5,
                    stride=2,
                    padding=2,
                    conv_cfg=conv_cfg,
                    norm_cfg=default_norm_cfg,
                    act_cfg=default_act_cfg))
            curr_channels = curr_channels * 2

        # output layer
        self.decision = nn.Sequential(
            nn.Linear(output_scale * output_scale * curr_channels,
                      out_channels))
        if self.with_out_activation:
            self.out_activation = build_activation_layer(out_act_cfg)

    def forward(self, x):
        """Forward function.

        Args:
            x (torch.Tensor): Fake or real image tensor.

        Returns:
            torch.Tensor: Prediction for the reality of the input image.
        """
        n = x.shape[0]
        for conv in self.conv_blocks:
            x = conv(x)

        # flatten the feature map before the fully connected decision layer
        x = x.reshape(n, -1)
        x = self.decision(x)
        if self.with_out_activation:
            x = self.out_activation(x)

        return x
mmgen/models/architectures/pggan/__init__.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
from
.generator_discriminator
import
PGGANDiscriminator
,
PGGANGenerator
from
.modules
import
(
EqualizedLR
,
EqualizedLRConvDownModule
,
EqualizedLRConvModule
,
EqualizedLRConvUpModule
,
EqualizedLRLinearModule
,
MiniBatchStddevLayer
,
PGGANNoiseTo2DFeat
,
PixelNorm
,
equalized_lr
)
__all__
=
[
'EqualizedLR'
,
'equalized_lr'
,
'EqualizedLRConvModule'
,
'EqualizedLRLinearModule'
,
'EqualizedLRConvUpModule'
,
'EqualizedLRConvDownModule'
,
'PixelNorm'
,
'MiniBatchStddevLayer'
,
'PGGANNoiseTo2DFeat'
,
'PGGANGenerator'
,
'PGGANDiscriminator'
]
mmgen/models/architectures/pggan/generator_discriminator.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
from
copy
import
deepcopy
from
functools
import
partial
import
numpy
as
np
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmcv.cnn.bricks.upsample
import
build_upsample_layer
from
mmgen.models.builder
import
MODULES
from
..common
import
get_module_device
from
.modules
import
(
EqualizedLRConvDownModule
,
EqualizedLRConvModule
,
EqualizedLRConvUpModule
,
MiniBatchStddevLayer
,
PGGANDecisionHead
,
PGGANNoiseTo2DFeat
)
@MODULES.register_module()
class PGGANGenerator(nn.Module):
    """Generator for PGGAN.

    Args:
        noise_size (int): Size of the input noise vector.
        out_scale (int): Output scale for the generated image.
        label_size (int, optional): Size of the label vector.
            Defaults to 0.
        base_channels (int, optional): The basic channel number of the
            generator. The other layers contains channels based on this
            number. Defaults to 8192.
        channel_decay (float, optional): Decay for channels of feature maps.
            Defaults to 1.0.
        max_channels (int, optional): Maximum channels for the feature
            maps in the generator block. Defaults to 512.
        fused_upconv (bool, optional): Whether use fused upconv.
            Defaults to True.
        conv_module_cfg (dict, optional): Config for the convolution
            module used in this generator. Defaults to None.
        fused_upconv_cfg (dict, optional): Config for the fused upconv
            module used in this generator. Defaults to None.
        upsample_cfg (dict, optional): Config for the upsampling operation.
            Defaults to None.
    """
    # default config for the fused upsample+conv blocks (deconv based)
    _default_fused_upconv_cfg = dict(
        conv_cfg=dict(type='deconv'),
        kernel_size=3,
        stride=2,
        padding=1,
        bias=True,
        act_cfg=dict(type='LeakyReLU', negative_slope=0.2),
        norm_cfg=dict(type='PixelNorm'),
        order=('conv', 'act', 'norm'))

    # default config for plain (non-upsampling) conv blocks
    _default_conv_module_cfg = dict(
        conv_cfg=None,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=True,
        act_cfg=dict(type='LeakyReLU', negative_slope=0.2),
        norm_cfg=dict(type='PixelNorm'),
        order=('conv', 'act', 'norm'))

    # default config for the residual-path upsampling layer
    _default_upsample_cfg = dict(type='nearest', scale_factor=2)

    def __init__(self,
                 noise_size,
                 out_scale,
                 label_size=0,
                 base_channels=8192,
                 channel_decay=1.,
                 max_channels=512,
                 fused_upconv=True,
                 conv_module_cfg=None,
                 fused_upconv_cfg=None,
                 upsample_cfg=None):
        super().__init__()
        # falsy noise_size falls back to the first block's channel count
        self.noise_size = noise_size if noise_size else min(
            base_channels, max_channels)
        self.out_scale = out_scale
        self.out_log2_scale = int(np.log2(out_scale))
        # sanity check for the output scale
        assert out_scale == 2**self.out_log2_scale and out_scale >= 4
        self.label_size = label_size
        self.base_channels = base_channels
        self.channel_decay = channel_decay
        self.max_channels = max_channels
        self.fused_upconv = fused_upconv

        # set conv cfg
        self.conv_module_cfg = deepcopy(self._default_conv_module_cfg)
        # update with customized config
        if conv_module_cfg:
            self.conv_module_cfg.update(conv_module_cfg)

        if self.fused_upconv:
            self.fused_upconv_cfg = deepcopy(self._default_fused_upconv_cfg)
            # update with customized config
            if fused_upconv_cfg:
                self.fused_upconv_cfg.update(fused_upconv_cfg)

        self.upsample_cfg = deepcopy(self._default_upsample_cfg)
        if upsample_cfg is not None:
            self.upsample_cfg.update(upsample_cfg)

        # label vector (if any) is concatenated to the noise before this map
        self.noise2feat = PGGANNoiseTo2DFeat(noise_size + label_size,
                                             self._num_out_channels(1))

        self.torgb_layers = nn.ModuleList()
        self.conv_blocks = nn.ModuleList()
        # one torgb layer and one (up)conv block group per log2 scale
        for s in range(2, self.out_log2_scale + 1):
            in_ch = self._num_out_channels(
                s - 1) if s == 2 else self._num_out_channels(s - 2)

            # setup torgb layers
            self.torgb_layers.append(
                self._get_torgb_layer(self._num_out_channels(s - 1)))

            # setup upconv or conv blocks
            self.conv_blocks.extend(self._get_upconv_block(in_ch, s))

        # build upsample layer for residual path
        self.upsample_layer = build_upsample_layer(self.upsample_cfg)

    def _get_torgb_layer(self, in_channels):
        # 1x1 projection from feature channels to RGB; gain=1 and no
        # act/norm, matching the PGGAN reference torgb layer
        return EqualizedLRConvModule(
            in_channels,
            3,
            kernel_size=1,
            stride=1,
            equalized_lr_cfg=dict(gain=1),
            bias=True,
            norm_cfg=None,
            act_cfg=None)

    def _num_out_channels(self, log_scale):
        # halve channels per (decayed) log scale, capped at max_channels
        return min(
            int(self.base_channels / (2.0**(log_scale * self.channel_decay))),
            self.max_channels)

    def _get_upconv_block(self, in_channels, log_scale):
        modules = []
        # start 4x4 scale
        if log_scale == 2:
            modules.append(
                EqualizedLRConvModule(in_channels,
                                      self._num_out_channels(log_scale - 1),
                                      **self.conv_module_cfg))
        # 8x8 --> 1024x1024 scales
        else:
            if self.fused_upconv:
                cfg_ = dict(upsample=dict(type='fused_nn'))
                cfg_.update(self.fused_upconv_cfg)
            else:
                cfg_ = dict(upsample=self.upsample_cfg)
                cfg_.update(self.conv_module_cfg)
            # up + conv
            modules.append(
                EqualizedLRConvUpModule(in_channels,
                                        self._num_out_channels(log_scale - 1),
                                        **cfg_))
            # refine conv
            modules.append(
                EqualizedLRConvModule(self._num_out_channels(log_scale - 1),
                                      self._num_out_channels(log_scale - 1),
                                      **self.conv_module_cfg))
        return modules

    def forward(self,
                noise,
                label=None,
                num_batches=0,
                return_noise=False,
                transition_weight=1.,
                curr_scale=-1):
        """Forward function.

        Args:
            noise (torch.Tensor | callable | None): You can directly give a
                batch of noise through a ``torch.Tensor`` or offer a callable
                function to sample a batch of noise data. Otherwise, the
                ``None`` indicates to use the default noise sampler.
            label (Tensor, optional): Label vector with shape [N, C]. Defaults
                to None.
            num_batches (int, optional): The number of batch size. Defaults to
                0.
            return_noise (bool, optional): If True, ``noise_batch`` will be
                returned in a dict with ``fake_img``. Defaults to False.
            transition_weight (float, optional): The weight used in resolution
                transition. Defaults to 1.0.
            curr_scale (int, optional): The scale for the current inference or
                training. Defaults to -1.

        Returns:
            torch.Tensor | dict: If not ``return_noise``, only the output image
                will be returned. Otherwise, a dict contains ``fake_img`` and
                ``noise_batch`` will be returned.
        """
        # receive noise and conduct sanity check.
        if isinstance(noise, torch.Tensor):
            assert noise.shape[1] == self.noise_size
            assert noise.ndim == 2, ('The noise should be in shape of (n, c), '
                                     f'but got {noise.shape}')
            noise_batch = noise
        # receive a noise generator and sample noise.
        elif callable(noise):
            noise_generator = noise
            assert num_batches > 0
            noise_batch = noise_generator((num_batches, self.noise_size))
        # otherwise, we will adopt default noise sampler.
        else:
            assert num_batches > 0
            # TODO: check pggan default noise type
            noise_batch = torch.randn((num_batches, self.noise_size))

        # dirty code for putting data on the right device
        noise_batch = noise_batch.to(get_module_device(self))

        if label is not None:
            noise_batch = torch.cat([noise_batch, label.to(noise_batch)],
                                    dim=1)

        # noise vector to 2D feature
        x = self.noise2feat(noise_batch)

        # build current computational graph
        curr_log2_scale = self.out_log2_scale if curr_scale < 0 else int(
            np.log2(curr_scale))

        # 4x4 scale
        x = self.conv_blocks[0](x)
        if curr_log2_scale <= 3:
            out_img = last_img = self.torgb_layers[0](x)

        # 8x8 and larger scales
        for s in range(3, curr_log2_scale + 1):
            x = self.conv_blocks[2 * s - 5](x)
            x = self.conv_blocks[2 * s - 4](x)
            if s + 1 == curr_log2_scale:
                # keep the second-to-last RGB image for the residual path
                last_img = self.torgb_layers[s - 2](x)
            elif s == curr_log2_scale:
                out_img = self.torgb_layers[s - 2](x)
                residual_img = self.upsample_layer(last_img)
                # linear interpolation between the upsampled lower-resolution
                # image and the current one during resolution transition
                out_img = residual_img + transition_weight * (
                    out_img - residual_img)

        if return_noise:
            output = dict(
                fake_img=out_img, noise_batch=noise_batch, label=label)
            return output

        return out_img
@MODULES.register_module()
class PGGANDiscriminator(nn.Module):
    """Discriminator for PGGAN.

    Args:
        in_scale (int): The scale of the input image.
        label_size (int, optional): Size of the label vector. Defaults to
            0.
        base_channels (int, optional): The basic channel number of the
            generator. The other layers contains channels based on this
            number. Defaults to 8192.
        max_channels (int, optional): Maximum channels for the feature
            maps in the discriminator block. Defaults to 512.
        in_channels (int, optional): Number of channels in input images.
            Defaults to 3.
        channel_decay (float, optional): Decay for channels of feature
            maps. Defaults to 1.0.
        mbstd_cfg (dict, optional): Configs for minibatch-stddev layer.
            Defaults to dict(group_size=4).
        fused_convdown (bool, optional): Whether use fused downconv.
            Defaults to True.
        conv_module_cfg (dict, optional): Config for the convolution
            module used in this generator. Defaults to None.
        fused_convdown_cfg (dict, optional): Config for the fused downconv
            module used in this discriminator. Defaults to None.
        fromrgb_layer_cfg (dict, optional): Config for the fromrgb layer.
            Defaults to None.
        downsample_cfg (dict, optional): Config for the downsampling
            operation. Defaults to None.
    """
    # default 1x1 fromrgb projection config
    _default_fromrgb_cfg = dict(
        conv_cfg=None,
        kernel_size=1,
        stride=1,
        padding=0,
        bias=True,
        act_cfg=dict(type='LeakyReLU', negative_slope=0.2),
        norm_cfg=None,
        order=('conv', 'act', 'norm'))

    # default config for plain (stride 1) conv blocks
    _default_conv_module_cfg = dict(
        kernel_size=3,
        padding=1,
        stride=1,
        norm_cfg=None,
        act_cfg=dict(type='LeakyReLU', negative_slope=0.2))

    # default config for fused conv + downsample blocks (stride 2)
    _default_convdown_cfg = dict(
        kernel_size=3,
        padding=1,
        stride=2,
        norm_cfg=None,
        act_cfg=dict(type='LeakyReLU', negative_slope=0.2))

    def __init__(self,
                 in_scale,
                 label_size=0,
                 base_channels=8192,
                 max_channels=512,
                 in_channels=3,
                 channel_decay=1.0,
                 mbstd_cfg=dict(group_size=4),
                 fused_convdown=True,
                 conv_module_cfg=None,
                 fused_convdown_cfg=None,
                 fromrgb_layer_cfg=None,
                 downsample_cfg=None):
        super().__init__()
        self.in_scale = in_scale
        self.in_log2_scale = int(np.log2(self.in_scale))
        self.label_size = label_size
        self.base_channels = base_channels
        self.max_channels = max_channels
        self.in_channels = in_channels
        self.channel_decay = channel_decay
        self.with_mbstd = mbstd_cfg is not None

        self.fused_convdown = fused_convdown

        self.conv_module_cfg = deepcopy(self._default_conv_module_cfg)
        if conv_module_cfg is not None:
            self.conv_module_cfg.update(conv_module_cfg)

        if self.fused_convdown:
            self.fused_convdown_cfg = deepcopy(self._default_convdown_cfg)
            if fused_convdown_cfg is not None:
                self.fused_convdown_cfg.update(fused_convdown_cfg)

        self.fromrgb_layer_cfg = deepcopy(self._default_fromrgb_cfg)
        if fromrgb_layer_cfg:
            self.fromrgb_layer_cfg.update(fromrgb_layer_cfg)

        # setup conv blocks
        self.conv_blocks = nn.ModuleList()
        self.fromrgb_layers = nn.ModuleList()

        # one fromrgb layer and one conv/downconv block group per log2 scale
        for s in range(2, self.in_log2_scale + 1):
            self.fromrgb_layers.append(
                self._get_fromrgb_layer(self.in_channels, s))

            self.conv_blocks.extend(
                self._get_convdown_block(self._num_out_channels(s - 1), s))

        # setup downsample layer
        self.downsample_cfg = deepcopy(downsample_cfg)
        if self.downsample_cfg is None or self.downsample_cfg.get(
                'type', None) == 'avgpool':
            self.downsample = nn.AvgPool2d(kernel_size=2, stride=2)
        elif self.downsample_cfg.get('type', None) in ['nearest', 'bilinear']:
            self.downsample = partial(
                F.interpolate,
                mode=self.downsample_cfg.pop('type'),
                **self.downsample_cfg)
        else:
            raise NotImplementedError(
                'We have not supported the downsampling with type'
                f' {downsample_cfg}.')

        # setup minibatch stddev layer
        if self.with_mbstd:
            self.mbstd_layer = MiniBatchStddevLayer(**mbstd_cfg)
            # minibatch stddev layer will concatenate an additional feature map
            # in channel dimension.
            decision_in_channels = self._num_out_channels(1) * 16 + 16
        else:
            decision_in_channels = self._num_out_channels(1) * 16

        # setup decision layer
        self.decision = PGGANDecisionHead(decision_in_channels,
                                          self._num_out_channels(0),
                                          1 + self.label_size)

    def _num_out_channels(self, log_scale):
        # halve channels per (decayed) log scale, capped at max_channels
        return min(
            int(self.base_channels / (2.0**(log_scale * self.channel_decay))),
            self.max_channels)

    def _get_fromrgb_layer(self, in_channels, log2_scale):
        # 1x1 projection from RGB to the feature channels at this scale
        return EqualizedLRConvModule(in_channels,
                                     self._num_out_channels(log2_scale - 1),
                                     **self.fromrgb_layer_cfg)

    def _get_convdown_block(self, in_channels, log2_scale):
        modules = []
        if log2_scale == 2:
            modules.append(
                EqualizedLRConvModule(in_channels,
                                      self._num_out_channels(log2_scale - 1),
                                      **self.conv_module_cfg))
        else:
            modules.append(
                EqualizedLRConvModule(in_channels,
                                      self._num_out_channels(log2_scale - 1),
                                      **self.conv_module_cfg))
            if self.fused_convdown:
                cfg_ = dict(downsample=dict(type='fused_pool'))
                cfg_.update(self.fused_convdown_cfg)
            else:
                # NOTE(review): this reads ``self.downsample``, which is only
                # assigned later in ``__init__`` (after the conv-block loop);
                # the non-fused path looks like it would raise AttributeError
                # during construction — verify before relying on it.
                cfg_ = dict(downsample=self.downsample)
                cfg_.update(self.conv_module_cfg)
            modules.append(
                EqualizedLRConvDownModule(
                    self._num_out_channels(log2_scale - 1),
                    self._num_out_channels(log2_scale - 2), **cfg_))

        return modules

    def forward(self, x, transition_weight=1., curr_scale=-1):
        """Forward function.

        Args:
            x (torch.Tensor): Input image tensor.
            transition_weight (float, optional): The weight used in resolution
                transition. Defaults to 1.0.
            curr_scale (int, optional): The scale for the current inference or
                training. Defaults to -1.

        Returns:
            Tensor: Predict score for the input image.
        """
        curr_log2_scale = self.in_log2_scale if curr_scale < 4 else int(
            np.log2(curr_scale))

        original_img = x

        x = self.fromrgb_layers[curr_log2_scale - 2](x)

        # run blocks from the current scale down to 8x8
        for s in range(curr_log2_scale, 2, -1):
            x = self.conv_blocks[2 * s - 5](x)
            x = self.conv_blocks[2 * s - 4](x)
            if s == curr_log2_scale:
                # residual branch: fromrgb on the downsampled input image,
                # blended with the conv path during resolution transition
                img_down = self.downsample(original_img)
                y = self.fromrgb_layers[curr_log2_scale - 3](img_down)
                x = y + transition_weight * (x - y)

        if self.with_mbstd:
            x = self.mbstd_layer(x)

        x = self.decision(x)

        if self.label_size > 0:
            # split real/fake score and label prediction
            return x[:, :1], x[:, 1:]

        return x
mmgen/models/architectures/pggan/modules.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
from
copy
import
deepcopy
import
numpy
as
np
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmcv.cnn.bricks
import
(
NORM_LAYERS
,
PLUGIN_LAYERS
,
ConvModule
,
build_activation_layer
,
build_norm_layer
,
build_upsample_layer
)
from
mmcv.cnn.utils
import
normal_init
from
torch.nn.init
import
_calculate_correct_fan
from
mmgen.models.builder
import
MODULES
from
mmgen.models.common
import
AllGatherLayer
class EqualizedLR:
    r"""Equalized Learning Rate.

    This trick is proposed in:
    Progressive Growing of GANs for Improved Quality, Stability, and Variation

    The general idea is to dynamically rescale the weight in training instead
    of in initializing so that the variance of the responses in each layer is
    guaranteed with some statistical properties.

    Note that this function is always combined with a convolution module which
    is initialized with :math:`\mathcal{N}(0, 1)`.

    Args:
        name (str | optional): The name of weights. Defaults to 'weight'.
        mode (str, optional): The mode of computing ``fan`` which is the
            same as ``kaiming_init`` in pytorch. You can choose one from
            ['fan_in', 'fan_out']. Defaults to 'fan_in'.
    """

    def __init__(self, name='weight', gain=2**0.5, mode='fan_in', lr_mul=1.0):
        # attribute name of the wrapped weight on the module
        self.name = name
        self.mode = mode
        # scaling gain applied on top of the 1/sqrt(fan) factor
        self.gain = gain
        # learning-rate multiplier applied to the rescaled weight
        self.lr_mul = lr_mul

    def compute_weight(self, module):
        """Compute weight with equalized learning rate.

        Args:
            module (nn.Module): A module that is wrapped with equalized lr.

        Returns:
            torch.Tensor: Updated weight.
        """
        weight = getattr(module, self.name + '_orig')
        if weight.ndim == 5:
            # weight in shape of [b, out, in, k, k]
            fan = _calculate_correct_fan(weight[0], self.mode)
        else:
            assert weight.ndim <= 4
            fan = _calculate_correct_fan(weight, self.mode)

        # rescale: weight * gain * sqrt(1/fan) * lr_mul
        weight = weight * torch.tensor(
            self.gain, device=weight.device) * torch.sqrt(
                torch.tensor(1. / fan, device=weight.device)) * self.lr_mul

        return weight

    def __call__(self, module, inputs):
        """Standard interface for forward pre hooks."""
        # refresh the rescaled weight right before every forward pass
        setattr(module, self.name, self.compute_weight(module))

    @staticmethod
    def apply(module, name, gain=2**0.5, mode='fan_in', lr_mul=1.):
        """Apply function.

        This function is to register an equalized learning rate hook in an
        ``nn.Module``.

        Args:
            module (nn.Module): Module to be wrapped.
            name (str | optional): The name of weights. Defaults to 'weight'.
            mode (str, optional): The mode of computing ``fan`` which is the
                same as ``kaiming_init`` in pytorch. You can choose one from
                ['fan_in', 'fan_out']. Defaults to 'fan_in'.

        Returns:
            nn.Module: Module that is registered with equalized lr hook.
        """
        # sanity check for duplicated hooks.
        for _, hook in module._forward_pre_hooks.items():
            if isinstance(hook, EqualizedLR):
                raise RuntimeError(
                    'Cannot register two equalized_lr hooks on the same '
                    f'parameter {name} in {module} module.')

        fn = EqualizedLR(name, gain=gain, mode=mode, lr_mul=lr_mul)

        weight = module._parameters[name]

        # move the real parameter to ``<name>_orig`` so the hook can write
        # the rescaled tensor back under ``<name>``
        delattr(module, name)
        module.register_parameter(name + '_orig', weight)

        # We still need to assign weight back as fn.name because all sorts of
        # things may assume that it exists, e.g., when initializing weights.
        # However, we can't directly assign as it could be an nn.Parameter and
        # gets added as a parameter. Instead, we register weight.data as a
        # plain attribute.
        setattr(module, name, weight.data)

        module.register_forward_pre_hook(fn)

        # TODO: register load state dict hook

        return fn
def equalized_lr(module, name='weight', gain=2**0.5, mode='fan_in', lr_mul=1.):
    r"""Equalized Learning Rate.

    This trick is proposed in:
    Progressive Growing of GANs for Improved Quality, Stability, and Variation

    The general idea is to dynamically rescale the weight in training instead
    of in initializing so that the variance of the responses in each layer is
    guaranteed with some statistical properties.

    Note that this function is always combined with a convolution module which
    is initialized with :math:`\mathcal{N}(0, 1)`.

    Args:
        module (nn.Module): Module to be wrapped.
        name (str | optional): The name of weights. Defaults to 'weight'.
        gain (float, optional): Gain factor. Defaults to ``2**0.5``.
        mode (str, optional): The mode of computing ``fan`` which is the
            same as ``kaiming_init`` in pytorch. You can choose one from
            ['fan_in', 'fan_out']. Defaults to 'fan_in'.
        lr_mul (float, optional): Learning rate multiplier. Defaults to 1.

    Returns:
        nn.Module: Module that is registered with equalized lr hook.
    """
    # Delegate to the hook's classmethod-style registrar; the module itself
    # is returned so this can be used as a wrapper expression.
    EqualizedLR.apply(module, name, gain=gain, mode=mode, lr_mul=lr_mul)
    return module
def pixel_norm(x, eps=1e-6):
    """Pixel Normalization.

    This normalization is proposed in:
    Progressive Growing of GANs for Improved Quality, Stability, and Variation

    Each spatial position is divided by the RMS of its channel vector.

    Args:
        x (torch.Tensor): Tensor to be normalized, with the channel on dim 1.
        eps (float, optional): Epsilon to avoid dividing zero.
            Defaults to 1e-6.

    Returns:
        torch.Tensor: Normalized tensor of the same shape as ``x``.
    """
    # Feature-detect ``torch.linalg.norm`` (added in torch 1.7) instead of
    # comparing version strings: ``torch.__version__ >= '1.7.0'`` is a
    # lexicographic comparison on plain-string versions and wrongly treats
    # '1.10.0' as smaller than '1.7.0'.
    if hasattr(torch, 'linalg') and hasattr(torch.linalg, 'norm'):
        norm = torch.linalg.norm(x, ord=2, dim=1, keepdim=True)
    else:
        # support older pytorch version
        norm = torch.norm(x, p=2, dim=1, keepdim=True)
    # Convert the L2 norm into an RMS by dividing by sqrt(num_channels).
    norm = norm / torch.sqrt(torch.tensor(x.shape[1]).to(x))
    return x / (norm + eps)
@MODULES.register_module()
@NORM_LAYERS.register_module()
class PixelNorm(nn.Module):
    """Pixel Normalization.

    This module is proposed in:
    Progressive Growing of GANs for Improved Quality, Stability, and Variation

    Args:
        in_channels (int | None, optional): Number of input channels. Not
            used by this layer; presumably accepted only to match the
            norm-layer builder interface — TODO confirm. Defaults to None.
        eps (float, optional): Epsilon value. Defaults to 1e-6.
    """

    # Abbreviation used by MMCV when naming norm layers.
    _abbr_ = 'pn'

    def __init__(self, in_channels=None, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, x):
        """Forward function.

        Args:
            x (torch.Tensor): Tensor to be normalized.

        Returns:
            torch.Tensor: Normalized tensor.
        """
        return pixel_norm(x, eps=self.eps)
@PLUGIN_LAYERS.register_module()
class EqualizedLRConvModule(ConvModule):
    r"""Equalized LR ConvModule.

    Inherits the default ``mmcv.cnn.ConvModule`` and applies equalized
    learning rate to its convolution. The equalized learning rate is
    proposed in:

    Progressive Growing of GANs for Improved Quality, Stability, and Variation

    Note that, the initialization of ``self.conv`` will be overwritten as
    :math:`\mathcal{N}(0, 1)`.

    Args:
        equalized_lr_cfg (dict | None, optional): Config for ``EqualizedLR``.
            If ``None``, equalized learning rate is ignored. Defaults to
            dict(mode='fan_in').
    """

    def __init__(self, *args, equalized_lr_cfg=dict(mode='fan_in'), **kwargs):
        super().__init__(*args, **kwargs)
        self.with_equalized_lr = equalized_lr_cfg is not None
        if not self.with_equalized_lr:
            return
        self.conv = equalized_lr(self.conv, **equalized_lr_cfg)
        # The equalized-lr trick expects standard Gaussian weights, so the
        # conv initialization done by ConvModule is overwritten here.
        self._init_conv_weights()

    def _init_conv_weights(self):
        """Initialize conv weights as described in PGGAN."""
        normal_init(self.conv)
@PLUGIN_LAYERS.register_module()
class EqualizedLRConvUpModule(EqualizedLRConvModule):
    r"""Equalized LR (Upsample + Conv) Module.

    Inherits ``EqualizedLRConvModule`` and performs upsampling before the
    convolution. As for upsampling, in addition to the sampling layer in
    MMCV, we also offer the "fused_nn" type. "fused_nn" denotes fusing
    upsampling and convolution. The fusion is modified from the official
    Tensorflow implementation in:
    https://github.com/tkarras/progressive_growing_of_gans/blob/master/networks.py#L86

    Args:
        upsample (dict | None, optional): Config for upsampling operation. If
            ``None``, upsampling is ignored. If you need a faster fused version as
            the official PGGAN in Tensorflow, you should set it as
            ``dict(type='fused_nn')``. Defaults to
            ``dict(type='nearest', scale_factor=2)``.
    """

    def __init__(self,
                 *args,
                 upsample=dict(type='nearest', scale_factor=2),
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.with_upsample = upsample is not None
        if not self.with_upsample:
            return
        if upsample.get('type') == 'fused_nn':
            # The fused path rewrites the kernel of a transposed conv, so it
            # is only valid for nn.ConvTranspose2d.
            assert isinstance(self.conv, nn.ConvTranspose2d)
            self.conv.register_forward_pre_hook(
                EqualizedLRConvUpModule.fused_nn_hook)
        else:
            self.upsample_layer = build_upsample_layer(upsample)

    def forward(self, x, **kwargs):
        """Forward function.

        Args:
            x (Tensor): Input tensor with shape (n, c, h, w).

        Returns:
            Tensor: Forward results.
        """
        # ``upsample_layer`` only exists for the non-fused configuration.
        if hasattr(self, 'upsample_layer'):
            x = self.upsample_layer(x)
        return super().forward(x, **kwargs)

    @staticmethod
    def fused_nn_hook(module, inputs):
        """Standard interface for forward pre hooks."""
        # Zero-pad the kernel by one on each spatial side, then sum the four
        # diagonally shifted copies — the fused equivalent of nearest
        # upsampling followed by the transposed convolution.
        padded = F.pad(module.weight, (1, 1, 1, 1))
        fused = (padded[..., 1:, 1:] + padded[..., 1:, :-1] +
                 padded[..., :-1, 1:] + padded[..., :-1, :-1])
        module.weight = fused
@PLUGIN_LAYERS.register_module()
class EqualizedLRConvDownModule(EqualizedLRConvModule):
    r"""Equalized LR (Conv + Downsample) Module.

    In this module, we inherit ``EqualizedLRConvModule`` and adopt
    downsampling after convolution. As for downsampling, we provide two modes
    of "avgpool" and "fused_pool". "avgpool" denotes the commonly used average
    pooling operation, while "fused_pool" represents fusing downsampling and
    convolution. The fusion is modified from the official Tensorflow
    implementation in:
    https://github.com/tkarras/progressive_growing_of_gans/blob/master/networks.py#L109

    Args:
        downsample (dict | callable | None, optional): Config for the
            downsampling operation, or a callable used directly as the
            downsampling layer. If ``None``, downsampling is ignored.
            Currently, we support the config types of
            ["avgpool", "fused_pool"]. Defaults to dict(type='fused_pool').
    """

    def __init__(self, *args, downsample=dict(type='fused_pool'), **kwargs):
        super().__init__(*args, **kwargs)
        self.with_downsample = downsample is not None
        if self.with_downsample:
            # Fix: handle callables before treating ``downsample`` as a
            # config dict. Previously ``downsample_cfg.pop('type')`` ran
            # unconditionally, raising AttributeError for callables and
            # making the callable branch unreachable.
            if callable(downsample):
                self.downsample = downsample
            else:
                # Copy before popping so the caller's config dict is not
                # mutated.
                downsample_cfg = deepcopy(downsample)
                type_ = downsample_cfg.pop('type')
                if type_ == 'avgpool':
                    self.downsample = nn.AvgPool2d(2, 2)
                elif type_ == 'fused_pool':
                    # Fuse the 2x2 average pooling into the conv kernel via
                    # a forward pre hook.
                    self.conv.register_forward_pre_hook(
                        EqualizedLRConvDownModule.fused_avgpool_hook)
                else:
                    raise NotImplementedError(
                        'Currently, we only support ["avgpool", "fused_pool"] as '
                        f'the type of downsample, but got {type_} instead.')

    def forward(self, x, **kwargs):
        """Forward function.

        Args:
            x (Tensor): Input tensor with shape (n, c, h, w).

        Returns:
            Tensor: Forward results.
        """
        x = super().forward(x, **kwargs)
        # ``downsample`` only exists for the "avgpool" and callable modes;
        # the fused mode downsamples inside the convolution itself.
        if hasattr(self, 'downsample'):
            x = self.downsample(x)
        return x

    @staticmethod
    def fused_avgpool_hook(module, inputs):
        """Standard interface for forward pre hooks."""
        weight = module.weight
        # pad the last two dimensions
        weight = F.pad(weight, (1, 1, 1, 1))
        # Average the four shifted copies of the kernel — equivalent to
        # fusing a 2x2 average pool into the convolution.
        weight = (weight[..., 1:, 1:] + weight[..., 1:, :-1] +
                  weight[..., :-1, 1:] + weight[..., :-1, :-1]) * 0.25
        module.weight = weight
@PLUGIN_LAYERS.register_module()
class EqualizedLRLinearModule(nn.Linear):
    r"""Equalized LR LinearModule.

    Applies equalized learning rate to ``nn.Linear``. The equalized
    learning rate is proposed in:

    Progressive Growing of GANs for Improved Quality, Stability, and Variation

    Note that, the initialization of ``self.weight`` will be overwritten as
    :math:`\mathcal{N}(0, 1)`.

    Args:
        equalized_lr_cfg (dict | None, optional): Config for ``EqualizedLR``.
            If ``None``, equalized learning rate is ignored. Defaults to
            dict(mode='fan_in').
    """

    def __init__(self, *args, equalized_lr_cfg=dict(mode='fan_in'), **kwargs):
        super().__init__(*args, **kwargs)
        self.with_equalized_lr = equalized_lr_cfg is not None
        # lr_mul is only consumed by the weight initialization below; it
        # stays at the neutral value 1 when equalized lr is disabled.
        if self.with_equalized_lr:
            self.lr_mul = equalized_lr_cfg.get('lr_mul', 1.)
        else:
            self.lr_mul = 1.
        if self.with_equalized_lr:
            equalized_lr(self, **equalized_lr_cfg)
            self._init_linear_weights()

    def _init_linear_weights(self):
        """Initialize linear weights as described in PGGAN."""
        nn.init.normal_(self.weight, 0, 1. / self.lr_mul)
        if self.bias is not None:
            nn.init.constant_(self.bias, 0.)
@MODULES.register_module()
class PGGANNoiseTo2DFeat(nn.Module):
    """Transform a noise vector into a 4x4 2D feature map.

    The noise is (optionally) pixel-normalized, projected with an
    equalized-lr linear layer, reshaped to ``(n, out_channels, 4, 4)`` and
    shifted by a learnable per-channel bias. Activation and normalization
    are applied according to ``order``.

    Args:
        noise_size (int): Size of the input noise vector.
        out_channels (int): Number of channels of the output feature map.
        act_cfg (dict | None, optional): Config for the activation layer.
            Defaults to ``dict(type='LeakyReLU', negative_slope=0.2)``.
        norm_cfg (dict | None, optional): Config for the normalization
            layer. Defaults to ``dict(type='PixelNorm')``.
        normalize_latent (bool, optional): Whether to apply pixel norm to
            the input noise. Defaults to True.
        order (tuple[str], optional): Order of the 'linear', 'act' and
            'norm' steps. Defaults to ``('linear', 'act', 'norm')``.
    """

    def __init__(self,
                 noise_size,
                 out_channels,
                 act_cfg=dict(type='LeakyReLU', negative_slope=0.2),
                 norm_cfg=dict(type='PixelNorm'),
                 normalize_latent=True,
                 order=('linear', 'act', 'norm')):
        super().__init__()
        self.noise_size = noise_size
        self.out_channels = out_channels
        self.normalize_latent = normalize_latent
        self.with_activation = act_cfg is not None
        self.with_norm = norm_cfg is not None
        self.order = order
        assert len(order) == 3 and set(order) == set(
            ['linear', 'act', 'norm'])

        # w/o bias, because the bias is added after reshaping the tensor to
        # 2D feature
        self.linear = EqualizedLRLinearModule(
            noise_size,
            out_channels * 16,
            equalized_lr_cfg=dict(gain=np.sqrt(2) / 4),
            bias=False)

        if self.with_activation:
            self.activation = build_activation_layer(act_cfg)

        # add bias for reshaped 2D feature.
        self.register_parameter(
            'bias', nn.Parameter(torch.zeros(1, out_channels, 1, 1)))

        if self.with_norm:
            _, self.norm = build_norm_layer(norm_cfg, out_channels)

    def forward(self, x):
        """Forward function.

        Args:
            x (Tensor): Input noise tensor with shape (n, c).

        Returns:
            Tensor: Forward results with shape (n, c, 4, 4).
        """
        assert x.ndim == 2
        if self.normalize_latent:
            x = pixel_norm(x)
        for step in self.order:
            if step == 'linear':
                x = self.linear(x)
                # [n, c, 4, 4]
                x = torch.reshape(x, (-1, self.out_channels, 4, 4))
                x = x + self.bias
            elif step == 'act' and self.with_activation:
                x = self.activation(x)
            elif step == 'norm' and self.with_norm:
                x = self.norm(x)
        return x
class PGGANDecisionHead(nn.Module):
    """Decision head used in PGGAN.

    Flattens the input feature map and applies two equalized-lr linear
    layers, with an optional activation in between and an optional output
    activation.

    Args:
        in_channels (int): Number of input features after flattening.
        mid_channels (int): Number of hidden features.
        out_channels (int): Number of output features.
        bias (bool, optional): Whether to use bias in the linear layers.
            Defaults to True.
        equalized_lr_cfg (dict | None, optional): Config of ``EqualizedLR``
            for the second linear layer. When truthy, the first linear layer
            always uses ``gain=2**0.5``; otherwise equalized lr is disabled
            for the first layer as well. Defaults to ``dict(gain=1)``.
        act_cfg (dict | None, optional): Config for the intermediate
            activation. Defaults to
            ``dict(type='LeakyReLU', negative_slope=0.2)``.
        out_act (dict | None, optional): Config for the output activation.
            Defaults to None.
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 bias=True,
                 equalized_lr_cfg=dict(gain=1),
                 act_cfg=dict(type='LeakyReLU', negative_slope=0.2),
                 out_act=None):
        super().__init__()
        self.in_channels = in_channels
        self.mid_channels = mid_channels
        self.out_channels = out_channels
        self.with_activation = act_cfg is not None
        self.with_out_activation = out_act is not None

        # setup linear layers
        # dirty code for supporting default mode in PGGAN
        first_lr_cfg = dict(gain=2**0.5) if equalized_lr_cfg else None

        self.linear0 = EqualizedLRLinearModule(
            self.in_channels,
            self.mid_channels,
            bias=bias,
            equalized_lr_cfg=first_lr_cfg)
        self.linear1 = EqualizedLRLinearModule(
            self.mid_channels,
            self.out_channels,
            bias=bias,
            equalized_lr_cfg=equalized_lr_cfg)

        # setup activation layers
        if self.with_activation:
            self.activation = build_activation_layer(act_cfg)
        if self.with_out_activation:
            self.out_activation = build_activation_layer(out_act)

    def forward(self, x):
        """Forward function.

        Args:
            x (Tensor): Input tensor with shape (n, c, h, w).

        Returns:
            Tensor: Forward results.
        """
        # Flatten any spatial dimensions before the linear layers.
        if x.ndim > 2:
            x = torch.reshape(x, (x.shape[0], -1))

        x = self.linear0(x)
        if self.with_activation:
            x = self.activation(x)

        x = self.linear1(x)
        if self.with_out_activation:
            x = self.out_activation(x)
        return x
@MODULES.register_module()
@PLUGIN_LAYERS.register_module()
class MiniBatchStddevLayer(nn.Module):
    """Minibatch standard deviation.

    Appends one channel holding the group-wise standard deviation of the
    features across the batch.

    Args:
        group_size (int, optional): The size of groups in batch dimension.
            Defaults to 4.
        eps (float, optional): Epsilon value to avoid computation error.
            Defaults to 1e-8.
        gather_all_batch (bool, optional): Whether gather batch from all GPUs.
            Defaults to False.
    """

    def __init__(self, group_size=4, eps=1e-8, gather_all_batch=False):
        super().__init__()
        self.group_size = group_size
        self.eps = eps
        self.gather_all_batch = gather_all_batch
        if self.gather_all_batch:
            assert torch.distributed.is_initialized(), (
                'Only in distributed training can the tensors be all gathered.')

    def forward(self, x):
        """Forward function.

        Args:
            x (Tensor): Input tensor with shape (n, c, h, w).

        Returns:
            Tensor: Forward results.
        """
        if self.gather_all_batch:
            x = torch.cat(AllGatherLayer.apply(x), dim=0)

        # batch size should be smaller than or equal to group size. Otherwise,
        # batch size should be divisible by the group size.
        assert x.shape[0] <= self.group_size or x.shape[
            0] % self.group_size == 0, (
                'Batch size be smaller than or equal '
                'to group size. Otherwise,'
                ' batch size should be divisible by the group size.'
                f'But got batch size {x.shape[0]},'
                f' group size {self.group_size}')

        n, c, h, w = x.shape
        group_size = min(n, self.group_size)

        # split the batch into groups: [G, M, C, H, W]
        feat = torch.reshape(x, (group_size, -1, c, h, w))
        # center each group: [G, M, C, H, W]
        feat = feat - feat.mean(dim=0, keepdim=True)
        # variance over the group dimension; in pt>=1.7 `.square()` could
        # replace `.pow(2)`. [M, C, H, W]
        feat = feat.pow(2).mean(dim=0, keepdim=False)
        feat = torch.sqrt(feat + self.eps)
        # average over channel and spatial dims: [M, 1, 1, 1]
        feat = feat.mean(dim=(1, 2, 3), keepdim=True)
        # tile back to the full batch and spatial size
        feat = feat.repeat(group_size, 1, h, w)
        return torch.cat([x, feat], dim=1)
mmgen/models/architectures/pix2pix/__init__.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
from .generator_discriminator import PatchDiscriminator, UnetGenerator
from .modules import UnetSkipConnectionBlock, generation_init_weights

# Public API of the pix2pix architecture package.
__all__ = [
    'PatchDiscriminator', 'UnetGenerator', 'UnetSkipConnectionBlock',
    'generation_init_weights'
]
mmgen/models/architectures/pix2pix/generator_discriminator.py
0 → 100644
View file @
b7536f78
# Copyright (c) OpenMMLab. All rights reserved.
import
torch.nn
as
nn
from
mmcv.cnn
import
ConvModule
,
build_conv_layer
from
mmcv.runner
import
load_checkpoint
from
mmgen.models.builder
import
MODULES
from
mmgen.utils
import
get_root_logger
from
.modules
import
UnetSkipConnectionBlock
,
generation_init_weights
@MODULES.register_module()
class UnetGenerator(nn.Module):
    """Construct the Unet-based generator from the innermost layer to the
    outermost layer, which is a recursive process.

    Args:
        in_channels (int): Number of channels in input images.
        out_channels (int): Number of channels in output images.
        num_down (int): Number of downsamplings in Unet. If `num_down` is 8,
            the image with size 256x256 will become 1x1 at the bottleneck.
            Default: 8.
        base_channels (int): Number of channels at the last conv layer.
            Default: 64.
        norm_cfg (dict): Config dict to build norm layer. Default:
            `dict(type='BN')`.
        use_dropout (bool): Whether to use dropout layers. Default: False.
        init_cfg (dict): Config dict for initialization.
            `type`: The name of our initialization method. Default: 'normal'.
            `gain`: Scaling factor for normal, xavier and orthogonal.
            Default: 0.02.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_down=8,
                 base_channels=64,
                 norm_cfg=dict(type='BN'),
                 use_dropout=False,
                 init_cfg=dict(type='normal', gain=0.02)):
        super().__init__()
        # We use norm layers in the unet generator.
        assert isinstance(norm_cfg, dict), ("'norm_cfg' should be dict, but"
                                            f'got {type(norm_cfg)}')
        assert 'type' in norm_cfg, "'norm_cfg' must have key 'type'"

        # add the innermost layer
        block = UnetSkipConnectionBlock(
            base_channels * 8,
            base_channels * 8,
            in_channels=None,
            submodule=None,
            norm_cfg=norm_cfg,
            is_innermost=True)
        # add intermediate layers with base_channels * 8 filters
        for _ in range(num_down - 5):
            block = UnetSkipConnectionBlock(
                base_channels * 8,
                base_channels * 8,
                in_channels=None,
                submodule=block,
                norm_cfg=norm_cfg,
                use_dropout=use_dropout)
        # gradually reduce the number of filters
        # from base_channels * 8 to base_channels
        for mult in (4, 2, 1):
            block = UnetSkipConnectionBlock(
                base_channels * mult,
                base_channels * mult * 2,
                in_channels=None,
                submodule=block,
                norm_cfg=norm_cfg)
        # add the outermost layer
        self.model = UnetSkipConnectionBlock(
            out_channels,
            base_channels,
            in_channels=in_channels,
            submodule=block,
            is_outermost=True,
            norm_cfg=norm_cfg)

        if init_cfg is None:
            self.init_type = 'normal'
            self.init_gain = 0.02
        else:
            self.init_type = init_cfg.get('type', 'normal')
            self.init_gain = init_cfg.get('gain', 0.02)

    def forward(self, x):
        """Forward function.

        Args:
            x (Tensor): Input tensor with shape (n, c, h, w).

        Returns:
            Tensor: Forward results.
        """
        return self.model(x)

    def init_weights(self, pretrained=None, strict=True):
        """Initialize weights for the model.

        Args:
            pretrained (str, optional): Path for pretrained weights. If given
                None, pretrained weights will not be loaded. Default: None.
            strict (bool, optional): Whether to allow different params for the
                model and checkpoint. Default: True.
        """
        if pretrained is None:
            generation_init_weights(
                self, init_type=self.init_type, init_gain=self.init_gain)
        elif isinstance(pretrained, str):
            logger = get_root_logger()
            load_checkpoint(self, pretrained, strict=strict, logger=logger)
        else:
            raise TypeError("'pretrained' must be a str or None. "
                            f'But received {type(pretrained)}.')
@MODULES.register_module()
class PatchDiscriminator(nn.Module):
    """A PatchGAN discriminator.

    Args:
        in_channels (int): Number of channels in input images.
        base_channels (int): Number of channels at the first conv layer.
            Default: 64.
        num_conv (int): Number of stacked intermediate convs (excluding input
            and output conv). Default: 3.
        norm_cfg (dict): Config dict to build norm layer. Default:
            `dict(type='BN')`.
        init_cfg (dict): Config dict for initialization.
            `type`: The name of our initialization method. Default: 'normal'.
            `gain`: Scaling factor for normal, xavier and orthogonal.
            Default: 0.02.
    """

    def __init__(self,
                 in_channels,
                 base_channels=64,
                 num_conv=3,
                 norm_cfg=dict(type='BN'),
                 init_cfg=dict(type='normal', gain=0.02)):
        super().__init__()
        assert isinstance(norm_cfg, dict), ("'norm_cfg' should be dict, but"
                                            f'got {type(norm_cfg)}')
        assert 'type' in norm_cfg, "'norm_cfg' must have key 'type'"

        # We use norm layers in the patch discriminator.
        # Only for IN, use bias since it does not have affine parameters.
        use_bias = norm_cfg['type'] == 'IN'

        kernel_size = 4
        padding = 1
        lrelu_cfg = dict(type='LeakyReLU', negative_slope=0.2)

        # input layer
        layers = [
            ConvModule(
                in_channels=in_channels,
                out_channels=base_channels,
                kernel_size=kernel_size,
                stride=2,
                padding=padding,
                bias=True,
                norm_cfg=None,
                act_cfg=lrelu_cfg)
        ]

        # stacked intermediate layers,
        # gradually increasing the number of filters
        multiple_now = 1
        for n in range(1, num_conv):
            multiple_prev, multiple_now = multiple_now, min(2**n, 8)
            layers.append(
                ConvModule(
                    in_channels=base_channels * multiple_prev,
                    out_channels=base_channels * multiple_now,
                    kernel_size=kernel_size,
                    stride=2,
                    padding=padding,
                    bias=use_bias,
                    norm_cfg=norm_cfg,
                    act_cfg=lrelu_cfg))
        multiple_prev, multiple_now = multiple_now, min(2**num_conv, 8)
        layers.append(
            ConvModule(
                in_channels=base_channels * multiple_prev,
                out_channels=base_channels * multiple_now,
                kernel_size=kernel_size,
                stride=1,
                padding=padding,
                bias=use_bias,
                norm_cfg=norm_cfg,
                act_cfg=lrelu_cfg))

        # output one-channel prediction map
        layers.append(
            build_conv_layer(
                dict(type='Conv2d'),
                base_channels * multiple_now,
                1,
                kernel_size=kernel_size,
                stride=1,
                padding=padding))

        self.model = nn.Sequential(*layers)

        if init_cfg is None:
            self.init_type = 'normal'
            self.init_gain = 0.02
        else:
            self.init_type = init_cfg.get('type', 'normal')
            self.init_gain = init_cfg.get('gain', 0.02)

    def forward(self, x):
        """Forward function.

        Args:
            x (Tensor): Input tensor with shape (n, c, h, w).

        Returns:
            Tensor: Forward results.
        """
        return self.model(x)

    def init_weights(self, pretrained=None):
        """Initialize weights for the model.

        Args:
            pretrained (str, optional): Path for pretrained weights. If given
                None, pretrained weights will not be loaded. Default: None.
        """
        if pretrained is None:
            generation_init_weights(
                self, init_type=self.init_type, init_gain=self.init_gain)
        elif isinstance(pretrained, str):
            logger = get_root_logger()
            # NOTE(review): strict=False here, unlike UnetGenerator which
            # exposes a ``strict`` parameter.
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        else:
            raise TypeError("'pretrained' must be a str or None. "
                            f'But received {type(pretrained)}.')
Prev
1
2
3
4
5
6
7
8
…
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment