[Feature]Add patch embed and patch merge (#1305)

001f3f66 · Shilong Zhang · GitHub · 66bff139 · 001f3f66 · 001f3f66
Unverified Commit 001f3f66 authored Dec 14, 2021 by Shilong Zhang Committed by GitHub Dec 14, 2021
Hide whitespace changes
Inline Side-by-side

Showing with 813 additions and 8 deletions

mmcv/cnn/bricks/transformer.py mmcv/cnn/bricks/transformer.py +353 -6

tests/test_cnn/test_transformer.py tests/test_cnn/test_transformer.py +460 -2

No files found.
--- a/mmcv/cnn/bricks/transformer.py
+++ b/mmcv/cnn/bricks/transformer.py
 # Copyright (c) OpenMMLab. All rights reserved.
 import copy
+import math
 import warnings
+from typing import Sequence

 import torch
 import torch.nn as nn
+import torch.nn.functional as F

-from mmcv import ConfigDict, deprecated_api_warning
-from mmcv.cnn import Linear, build_activation_layer, build_norm_layer
+from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer,
+                      build_norm_layer)
 from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
-from mmcv.utils import build_from_cfg
+from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning,
+                        to_2tuple)
 from .drop import build_dropout
 from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING,
                       TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE)
@@ -55,6 +59,349 @@ def build_transformer_layer_sequence(cfg, default_args=None):
    return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args)


+class AdaptivePadding(nn.Module):
+    """Applies padding adaptively to the input.
+
+    This module pads the input so that it is fully covered by the
+    filter you specified. It supports two modes, "same" and "corner".
+    The "same" mode matches the "SAME" padding mode in TensorFlow and
+    pads zeros around the input. The "corner" mode pads zeros only
+    at the bottom and right.
+
+    Args:
+        kernel_size (int | tuple): Size of the kernel. Default: 1.
+        stride (int | tuple): Stride of the filter. Default: 1.
+        dilation (int | tuple): Spacing between kernel elements.
+            Default: 1.
+        padding (str): Support "same" and "corner", "corner" mode
+            would pad zero to bottom right, and "same" mode would
+            pad zero around input. Default: "corner".
+
+    Example:
+        >>> kernel_size = 16
+        >>> stride = 16
+        >>> dilation = 1
+        >>> input = torch.rand(1, 1, 15, 17)
+        >>> adap_pad = AdaptivePadding(
+        ...     kernel_size=kernel_size,
+        ...     stride=stride,
+        ...     dilation=dilation,
+        ...     padding="corner")
+        >>> out = adap_pad(input)
+        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+        >>> input = torch.rand(1, 1, 16, 17)
+        >>> out = adap_pad(input)
+        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+    """
+
+    def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
+        super(AdaptivePadding, self).__init__()
+        assert padding in ('same', 'corner')
+
+        kernel_size = to_2tuple(kernel_size)
+        stride = to_2tuple(stride)
+        dilation = to_2tuple(dilation)
+
+        self.padding = padding
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.dilation = dilation
+
+    def get_pad_shape(self, input_shape):
+        """Calculate the padding size of input.
+
+        Args:
+            input_shape (:obj:`torch.Size`): arranged as (H, W).
+
+        Returns:
+            Tuple[int]: The padding size along the
+            original H and W directions
+        """
+        input_h, input_w = input_shape
+        kernel_h, kernel_w = self.kernel_size
+        stride_h, stride_w = self.stride
+        output_h = math.ceil(input_h / stride_h)
+        output_w = math.ceil(input_w / stride_w)
+        pad_h = max((output_h - 1) * stride_h +
+                    (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
+        pad_w = max((output_w - 1) * stride_w +
+                    (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
+        return pad_h, pad_w
+
+    def forward(self, x):
+        """Add padding to `x`
+
+        Args:
+            x (Tensor): Input tensor has shape (B, C, H, W).
+
+        Returns:
+            Tensor: The tensor with adaptive padding
+        """
+        pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
+        if pad_h > 0 or pad_w > 0:
+            if self.padding == 'corner':
+                x = F.pad(x, [0, pad_w, 0, pad_h])
+            elif self.padding == 'same':
+                x = F.pad(x, [
+                    pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
+                    pad_h - pad_h // 2
+                ])
+        return x
+
+
+class PatchEmbed(BaseModule):
+    """Image to Patch Embedding.
+
+    We use a conv layer to implement PatchEmbed.
+
+    Args:
+        in_channels (int): The num of input channels. Default: 3
+        embed_dims (int): The dimensions of embedding. Default: 768
+        conv_type (str): The type of convolution
+            to generate patch embedding. Default: "Conv2d".
+        kernel_size (int | tuple): Kernel size of embedding conv. Default: 16.
+        stride (int | tuple | None): The stride of embedding conv. When
+            None, it is set to `kernel_size`. Default: 16.
+        padding (int | tuple | string): The padding length of
+            embedding conv. When it is a string, it means the mode
+            of adaptive padding, support "same" and "corner" now.
+            Default: "corner".
+        dilation (int | tuple): The dilation rate of embedding conv. Default: 1.
+        bias (bool): Bias of embed conv. Default: True.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: None.
+        input_size (int | tuple | None): The size of input, which will be
+            used to calculate the out size (`init_out_size`) when the
+            module is built. Default: None.
+        init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels=3,
+                 embed_dims=768,
+                 conv_type='Conv2d',
+                 kernel_size=16,
+                 stride=16,
+                 padding='corner',
+                 dilation=1,
+                 bias=True,
+                 norm_cfg=None,
+                 input_size=None,
+                 init_cfg=None):
+        super(PatchEmbed, self).__init__(init_cfg=init_cfg)
+
+        self.embed_dims = embed_dims
+        if stride is None:
+            stride = kernel_size
+
+        kernel_size = to_2tuple(kernel_size)
+        stride = to_2tuple(stride)
+        dilation = to_2tuple(dilation)
+
+        if isinstance(padding, str):
+            self.adaptive_padding = AdaptivePadding(
+                kernel_size=kernel_size,
+                stride=stride,
+                dilation=dilation,
+                padding=padding)
+            # disable the padding of conv
+            padding = 0
+        else:
+            self.adaptive_padding = None
+        padding = to_2tuple(padding)
+
+        self.projection = build_conv_layer(
+            dict(type=conv_type),
+            in_channels=in_channels,
+            out_channels=embed_dims,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        if norm_cfg is not None:
+            self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
+        else:
+            self.norm = None
+
+        if input_size:
+            input_size = to_2tuple(input_size)
+            # `init_out_size` would be used outside to
+            # calculate the num_patches,
+            # e.g. when `use_abs_pos_embed` is enabled outside
+            self.init_input_size = input_size
+            if self.adaptive_padding:
+                pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size)
+                input_h, input_w = input_size
+                input_h = input_h + pad_h
+                input_w = input_w + pad_w
+                input_size = (input_h, input_w)
+
+            # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+            h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
+                     (kernel_size[0] - 1) - 1) // stride[0] + 1
+            w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
+                     (kernel_size[1] - 1) - 1) // stride[1] + 1
+            self.init_out_size = (h_out, w_out)
+        else:
+            self.init_input_size = None
+            self.init_out_size = None
+
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor): Has shape (B, C, H, W). In most cases, C is 3.
+
+        Returns:
+            tuple: Contains merged results and its spatial shape.
+
+            - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
+            - out_size (tuple[int]): Spatial shape of x, arrange as
+              (out_h, out_w).
+        """
+
+        if self.adaptive_padding:
+            x = self.adaptive_padding(x)
+
+        x = self.projection(x)
+        out_size = (x.shape[2], x.shape[3])
+        x = x.flatten(2).transpose(1, 2)
+        if self.norm is not None:
+            x = self.norm(x)
+        return x, out_size
+
+
+class PatchMerging(BaseModule):
+    """Merge patch feature map.
+
+    This layer groups feature map by kernel_size, and applies norm and linear
+    layers to the grouped feature map (used in Swin Transformer).
+    Our implementation uses `nn.Unfold` to
+    merge patches, which is about 25% faster than the original
+    implementation. However, we need to modify pretrained
+    models for compatibility.
+
+    Args:
+        in_channels (int): The num of input channels, i.e. the size of
+            the last dimension (C_in) of the input of `forward`.
+        out_channels (int): The num of output channels.
+        kernel_size (int | tuple, optional): the kernel size in the unfold
+            layer. Defaults to 2.
+        stride (int | tuple, optional): the stride of the sliding blocks in the
+            unfold layer. Default: None. (Would be set as `kernel_size`)
+        padding (int | tuple | string ): The padding length of
+            the unfold layer. When it is a string, it means the mode
+            of adaptive padding, support "same" and "corner" now.
+            Default: "corner".
+        dilation (int | tuple, optional): dilation parameter in the unfold
+            layer. Default: 1.
+        bias (bool, optional): Whether to add bias in linear layer or not.
+            Default: False.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: dict(type='LN').
+        init_cfg (dict, optional): The extra config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=2,
+                 stride=None,
+                 padding='corner',
+                 dilation=1,
+                 bias=False,
+                 norm_cfg=dict(type='LN'),
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        if stride:
+            stride = stride
+        else:
+            stride = kernel_size
+
+        kernel_size = to_2tuple(kernel_size)
+        stride = to_2tuple(stride)
+        dilation = to_2tuple(dilation)
+
+        if isinstance(padding, str):
+            self.adaptive_padding = AdaptivePadding(
+                kernel_size=kernel_size,
+                stride=stride,
+                dilation=dilation,
+                padding=padding)
+            # disable the padding of unfold
+            padding = 0
+        else:
+            self.adaptive_padding = None
+
+        padding = to_2tuple(padding)
+        self.sampler = nn.Unfold(
+            kernel_size=kernel_size,
+            dilation=dilation,
+            padding=padding,
+            stride=stride)
+
+        sample_dim = kernel_size[0] * kernel_size[1] * in_channels
+
+        if norm_cfg is not None:
+            self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
+        else:
+            self.norm = None
+
+        self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
+
+    def forward(self, x, input_size):
+        """
+        Args:
+            x (Tensor): Has shape (B, H*W, C_in).
+            input_size (tuple[int]): The spatial shape of x, arranged as
+                (H, W). Required; H * W must equal x.shape[1].
+
+        Returns:
+            tuple: Contains merged results and its spatial shape.
+
+            - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
+            - out_size (tuple[int]): Spatial shape of x, arrange as
+              (Merged_H, Merged_W).
+        """
+        B, L, C = x.shape
+        assert isinstance(input_size, Sequence), f'Expect ' \
+                                                 f'input_size is ' \
+                                                 f'`Sequence` ' \
+                                                 f'but get {input_size}'
+
+        H, W = input_size
+        assert L == H * W, 'input feature has wrong size'
+
+        x = x.view(B, H, W, C).permute([0, 3, 1, 2])  # B, C, H, W
+
+        if self.adaptive_padding:
+            x = self.adaptive_padding(x)
+            H, W = x.shape[-2:]
+
+        # Use nn.Unfold to merge patch. About 25% faster than original method,
+        # but need to modify pretrained model for compatibility
+        # if kernel_size=2 and stride=2, x should have shape (B, 4*C, H/2*W/2)
+        x = self.sampler(x)
+
+        out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
+                 (self.sampler.kernel_size[0] - 1) -
+                 1) // self.sampler.stride[0] + 1
+        out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
+                 (self.sampler.kernel_size[1] - 1) -
+                 1) // self.sampler.stride[1] + 1
+
+        output_size = (out_h, out_w)
+        x = x.transpose(1, 2)  # B, H/2*W/2, 4*C
+        x = self.norm(x) if self.norm else x
+        x = self.reduction(x)
+        return x, output_size
+
+
 @ATTENTION.register_module()
 class MultiheadAttention(BaseModule):
    """A wrapper for ``torch.nn.MultiheadAttention``.
@@ -154,9 +501,9 @@ class MultiheadAttention(BaseModule):

        Returns:
            Tensor: forwarded results with shape
-                [num_queries, bs, embed_dims]
-                if self.batch_first is False, else
-                [bs, num_queries embed_dims].
+            [num_queries, bs, embed_dims]
+            if self.batch_first is False, else
+            [bs, num_queries, embed_dims].
        """

        if key is None:

--- a/tests/test_cnn/test_transformer.py
+++ b/tests/test_cnn/test_transformer.py
@@ -4,12 +4,470 @@ import pytest
 import torch

 from mmcv.cnn.bricks.drop import DropPath
-from mmcv.cnn.bricks.transformer import (FFN, BaseTransformerLayer,
-                                         MultiheadAttention,
+from mmcv.cnn.bricks.transformer import (FFN, AdaptivePadding,
+                                         BaseTransformerLayer,
+                                         MultiheadAttention, PatchEmbed,
+                                         PatchMerging,
                                         TransformerLayerSequence)
 from mmcv.runner import ModuleList


+def test_adaptive_padding():
+
+    for padding in ('same', 'corner'):
+        kernel_size = 16
+        stride = 16
+        dilation = 1
+        input = torch.rand(1, 1, 15, 17)
+        adap_pad = AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding)
+        out = adap_pad(input)
+        # padding to divisible by 16
+        assert (out.shape[2], out.shape[3]) == (16, 32)
+        input = torch.rand(1, 1, 16, 17)
+        out = adap_pad(input)
+        # padding to divisible by 16
+        assert (out.shape[2], out.shape[3]) == (16, 32)
+
+        kernel_size = (2, 2)
+        stride = (2, 2)
+        dilation = (1, 1)
+
+        adap_pad = AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding)
+        input = torch.rand(1, 1, 11, 13)
+        out = adap_pad(input)
+        # padding to divisible by 2
+        assert (out.shape[2], out.shape[3]) == (12, 14)
+
+        kernel_size = (2, 2)
+        stride = (10, 10)
+        dilation = (1, 1)
+
+        adap_pad = AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding)
+        input = torch.rand(1, 1, 10, 13)
+        out = adap_pad(input)
+        #  no padding
+        assert (out.shape[2], out.shape[3]) == (10, 13)
+
+        kernel_size = (11, 11)
+        adap_pad = AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding)
+        input = torch.rand(1, 1, 11, 13)
+        out = adap_pad(input)
+        #  all padding
+        assert (out.shape[2], out.shape[3]) == (21, 21)
+
+        # test padding as kernel is (7,9)
+        input = torch.rand(1, 1, 11, 13)
+        stride = (3, 4)
+        kernel_size = (4, 5)
+        dilation = (2, 2)
+        # actually (7, 9)
+        adap_pad = AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding)
+        dilation_out = adap_pad(input)
+        assert (dilation_out.shape[2], dilation_out.shape[3]) == (16, 21)
+        kernel_size = (7, 9)
+        dilation = (1, 1)
+        adap_pad = AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding)
+        kernel79_out = adap_pad(input)
+        assert (kernel79_out.shape[2], kernel79_out.shape[3]) == (16, 21)
+        assert kernel79_out.shape == dilation_out.shape
+
+    # assert only support "same" "corner"
+    with pytest.raises(AssertionError):
+        AdaptivePadding(
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=1)
+
+
+def test_patch_embed():
+    B = 2
+    H = 3
+    W = 4
+    C = 3
+    embed_dims = 10
+    kernel_size = 3
+    stride = 1
+    dummy_input = torch.rand(B, C, H, W)
+    patch_merge_1 = PatchEmbed(
+        in_channels=C,
+        embed_dims=embed_dims,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=0,
+        dilation=1,
+        norm_cfg=None)
+
+    x1, shape = patch_merge_1(dummy_input)
+    # test out shape
+    assert x1.shape == (2, 2, 10)
+    # test outsize is correct
+    assert shape == (1, 2)
+    # test L = out_h * out_w
+    assert shape[0] * shape[1] == x1.shape[1]
+
+    B = 2
+    H = 10
+    W = 10
+    C = 3
+    embed_dims = 10
+    kernel_size = 5
+    stride = 2
+    dummy_input = torch.rand(B, C, H, W)
+    # test dilation
+    patch_merge_2 = PatchEmbed(
+        in_channels=C,
+        embed_dims=embed_dims,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=0,
+        dilation=2,
+        norm_cfg=None,
+    )
+
+    x2, shape = patch_merge_2(dummy_input)
+    # test out shape
+    assert x2.shape == (2, 1, 10)
+    # test outsize is correct
+    assert shape == (1, 1)
+    # test L = out_h * out_w
+    assert shape[0] * shape[1] == x2.shape[1]
+
+    stride = 2
+    input_size = (10, 10)
+
+    dummy_input = torch.rand(B, C, H, W)
+    # test stride and norm
+    patch_merge_3 = PatchEmbed(
+        in_channels=C,
+        embed_dims=embed_dims,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=0,
+        dilation=2,
+        norm_cfg=dict(type='LN'),
+        input_size=input_size)
+
+    x3, shape = patch_merge_3(dummy_input)
+    # test out shape
+    assert x3.shape == (2, 1, 10)
+    # test outsize is correct
+    assert shape == (1, 1)
+    # test L = out_h * out_w
+    assert shape[0] * shape[1] == x3.shape[1]
+
+    # test the init_out_size against the conv output size formula
+    assert patch_merge_3.init_out_size[1] == (input_size[0] - 2 * 4 -
+                                              1) // 2 + 1
+    assert patch_merge_3.init_out_size[0] == (input_size[0] - 2 * 4 -
+                                              1) // 2 + 1
+    H = 11
+    W = 12
+    input_size = (H, W)
+    dummy_input = torch.rand(B, C, H, W)
+    # test stride and norm
+    patch_merge_3 = PatchEmbed(
+        in_channels=C,
+        embed_dims=embed_dims,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=0,
+        dilation=2,
+        norm_cfg=dict(type='LN'),
+        input_size=input_size)
+
+    _, shape = patch_merge_3(dummy_input)
+    # when input_size equal to real input
+    # the out_size should be equal to `init_out_size`
+    assert shape == patch_merge_3.init_out_size
+
+    input_size = (H, W)
+    dummy_input = torch.rand(B, C, H, W)
+    # test stride and norm
+    patch_merge_3 = PatchEmbed(
+        in_channels=C,
+        embed_dims=embed_dims,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=0,
+        dilation=2,
+        norm_cfg=dict(type='LN'),
+        input_size=input_size)
+
+    _, shape = patch_merge_3(dummy_input)
+    # when input_size equal to real input
+    # the out_size should be equal to `init_out_size`
+    assert shape == patch_merge_3.init_out_size
+
+    # test adap padding
+    for padding in ('same', 'corner'):
+        in_c = 2
+        embed_dims = 3
+        B = 2
+
+        # test stride is 1
+        input_size = (5, 5)
+        kernel_size = (5, 5)
+        stride = (1, 1)
+        dilation = 1
+        bias = False
+
+        x = torch.rand(B, in_c, *input_size)
+        patch_embed = PatchEmbed(
+            in_channels=in_c,
+            embed_dims=embed_dims,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_embed(x)
+        assert x_out.size() == (B, 25, 3)
+        assert out_size == (5, 5)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+        # test kernel_size == stride
+        input_size = (5, 5)
+        kernel_size = (5, 5)
+        stride = (5, 5)
+        dilation = 1
+        bias = False
+
+        x = torch.rand(B, in_c, *input_size)
+        patch_embed = PatchEmbed(
+            in_channels=in_c,
+            embed_dims=embed_dims,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_embed(x)
+        assert x_out.size() == (B, 1, 3)
+        assert out_size == (1, 1)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+        # test kernel_size == stride
+        input_size = (6, 5)
+        kernel_size = (5, 5)
+        stride = (5, 5)
+        dilation = 1
+        bias = False
+
+        x = torch.rand(B, in_c, *input_size)
+        patch_embed = PatchEmbed(
+            in_channels=in_c,
+            embed_dims=embed_dims,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_embed(x)
+        assert x_out.size() == (B, 2, 3)
+        assert out_size == (2, 1)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+        # test different kernel_size with different stride
+        input_size = (6, 5)
+        kernel_size = (6, 2)
+        stride = (6, 2)
+        dilation = 1
+        bias = False
+
+        x = torch.rand(B, in_c, *input_size)
+        patch_embed = PatchEmbed(
+            in_channels=in_c,
+            embed_dims=embed_dims,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_embed(x)
+        assert x_out.size() == (B, 3, 3)
+        assert out_size == (1, 3)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+
+def test_patch_merging():
+
+    # Test the model with int padding
+    in_c = 3
+    out_c = 4
+    kernel_size = 3
+    stride = 3
+    padding = 1
+    dilation = 1
+    bias = False
+    # test the case where adaptive padding is disabled (int padding)
+    patch_merge = PatchMerging(
+        in_channels=in_c,
+        out_channels=out_c,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        dilation=dilation,
+        bias=bias)
+    B, L, C = 1, 100, 3
+    input_size = (10, 10)
+    x = torch.rand(B, L, C)
+    x_out, out_size = patch_merge(x, input_size)
+    assert x_out.size() == (1, 16, 4)
+    assert out_size == (4, 4)
+    # assert out size is consistent with real output
+    assert x_out.size(1) == out_size[0] * out_size[1]
+    in_c = 4
+    out_c = 5
+    kernel_size = 6
+    stride = 3
+    padding = 2
+    dilation = 2
+    bias = False
+    patch_merge = PatchMerging(
+        in_channels=in_c,
+        out_channels=out_c,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        dilation=dilation,
+        bias=bias)
+    B, L, C = 1, 100, 4
+    input_size = (10, 10)
+    x = torch.rand(B, L, C)
+    x_out, out_size = patch_merge(x, input_size)
+    assert x_out.size() == (1, 4, 5)
+    assert out_size == (2, 2)
+    # assert out size is consistent with real output
+    assert x_out.size(1) == out_size[0] * out_size[1]
+
+    # Test with adaptive padding
+    for padding in ('same', 'corner'):
+        in_c = 2
+        out_c = 3
+        B = 2
+
+        # test stride is 1
+        input_size = (5, 5)
+        kernel_size = (5, 5)
+        stride = (1, 1)
+        dilation = 1
+        bias = False
+        L = input_size[0] * input_size[1]
+
+        x = torch.rand(B, L, in_c)
+        patch_merge = PatchMerging(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_merge(x, input_size)
+        assert x_out.size() == (B, 25, 3)
+        assert out_size == (5, 5)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+        # test kernel_size == stride
+        input_size = (5, 5)
+        kernel_size = (5, 5)
+        stride = (5, 5)
+        dilation = 1
+        bias = False
+        L = input_size[0] * input_size[1]
+
+        x = torch.rand(B, L, in_c)
+        patch_merge = PatchMerging(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_merge(x, input_size)
+        assert x_out.size() == (B, 1, 3)
+        assert out_size == (1, 1)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+        # test kernel_size == stride
+        input_size = (6, 5)
+        kernel_size = (5, 5)
+        stride = (5, 5)
+        dilation = 1
+        bias = False
+        L = input_size[0] * input_size[1]
+
+        x = torch.rand(B, L, in_c)
+        patch_merge = PatchMerging(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_merge(x, input_size)
+        assert x_out.size() == (B, 2, 3)
+        assert out_size == (2, 1)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+        # test different kernel_size with different stride
+        input_size = (6, 5)
+        kernel_size = (6, 2)
+        stride = (6, 2)
+        dilation = 1
+        bias = False
+        L = input_size[0] * input_size[1]
+
+        x = torch.rand(B, L, in_c)
+        patch_merge = PatchMerging(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        x_out, out_size = patch_merge(x, input_size)
+        assert x_out.size() == (B, 3, 3)
+        assert out_size == (1, 3)
+        assert x_out.size(1) == out_size[0] * out_size[1]
+
+
 def test_multiheadattention():
    MultiheadAttention(
        embed_dims=5,