Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
MMCV
Commits
001f3f66
Unverified
Commit
001f3f66
authored
Dec 14, 2021
by
Shilong Zhang
Committed by
GitHub
Dec 14, 2021
Browse files
[Feature]Add patch embed and patch merge (#1305)
parent
66bff139
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
813 additions
and
8 deletions
+813
-8
mmcv/cnn/bricks/transformer.py
mmcv/cnn/bricks/transformer.py
+353
-6
tests/test_cnn/test_transformer.py
tests/test_cnn/test_transformer.py
+460
-2
No files found.
mmcv/cnn/bricks/transformer.py
View file @
001f3f66
# Copyright (c) OpenMMLab. All rights reserved.
import
copy
import
math
import
warnings
from
typing
import
Sequence
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmcv
import
ConfigDict
,
deprecated_api_warning
from
mmcv.cnn
import
Linear
,
build_activation_layer
,
build_norm_layer
from
mmcv
.cnn
import
(
Linear
,
build_activation_layer
,
build_conv_layer
,
build_norm_layer
)
from
mmcv.runner.base_module
import
BaseModule
,
ModuleList
,
Sequential
from
mmcv.utils
import
build_from_cfg
from
mmcv.utils
import
(
ConfigDict
,
build_from_cfg
,
deprecated_api_warning
,
to_2tuple
)
from
.drop
import
build_dropout
from
.registry
import
(
ATTENTION
,
FEEDFORWARD_NETWORK
,
POSITIONAL_ENCODING
,
TRANSFORMER_LAYER
,
TRANSFORMER_LAYER_SEQUENCE
)
...
...
@@ -55,6 +59,349 @@ def build_transformer_layer_sequence(cfg, default_args=None):
return
build_from_cfg
(
cfg
,
TRANSFORMER_LAYER_SEQUENCE
,
default_args
)
class AdaptivePadding(nn.Module):
    """Zero-pad the input so that the sliding filter covers it completely.

    Two padding modes are available. In "corner" mode every zero is
    appended at the bottom and right edges. In "same" mode the zeros are
    split between both sides of each axis, like the "SAME" padding mode
    of TensorFlow; when the amount is odd, the extra row/column goes to
    the bottom/right.

    Args:
        kernel_size (int | tuple): Size of the kernel. Default: 1.
        stride (int | tuple): Stride of the filter. Default: 1.
        dilation (int | tuple): Spacing between kernel elements.
            Default: 1.
        padding (str): Padding mode, either "same" or "corner".
            Default: "corner".

    Example:
        >>> kernel_size = 16
        >>> stride = 16
        >>> dilation = 1
        >>> input = torch.rand(1, 1, 15, 17)
        >>> adap_pad = AdaptivePadding(
        >>>     kernel_size=kernel_size,
        >>>     stride=stride,
        >>>     dilation=dilation,
        >>>     padding="corner")
        >>> out = adap_pad(input)
        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
        >>> input = torch.rand(1, 1, 16, 17)
        >>> out = adap_pad(input)
        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
    """

    def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
        super().__init__()

        # Only the two named modes are supported; anything else is rejected.
        assert padding in ('same', 'corner')

        self.padding = padding
        # Scalars are expanded so each setting is stored as an (h, w) pair.
        self.kernel_size = to_2tuple(kernel_size)
        self.stride = to_2tuple(stride)
        self.dilation = to_2tuple(dilation)

    def get_pad_shape(self, input_shape):
        """Compute how many rows and columns of zeros are required.

        Args:
            input_shape (:obj:`torch.Size`): arrange as (H, W).

        Returns:
            Tuple[int]: The total padding along the H and W directions,
            each clamped to be non-negative.
        """

        def _pad_along(size, kernel, step, dilation):
            # Number of filter positions needed to reach the last element.
            num_steps = math.ceil(size / step)
            # Extent spanned by that many positions of the dilated kernel.
            covered = (num_steps - 1) * step + (kernel - 1) * dilation + 1
            return max(covered - size, 0)

        input_h, input_w = input_shape
        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self.stride
        pad_h = _pad_along(input_h, kernel_h, stride_h, self.dilation[0])
        pad_w = _pad_along(input_w, kernel_w, stride_w, self.dilation[1])
        return pad_h, pad_w

    def forward(self, x):
        """Pad `x` according to the configured mode.

        Args:
            x (Tensor): Input tensor has shape (B, C, H, W).

        Returns:
            Tensor: The padded tensor, or `x` itself when no padding is
            needed.
        """
        pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
        if not (pad_h > 0 or pad_w > 0):
            return x

        # F.pad takes the pads of the last dimension first:
        # [left, right, top, bottom].
        if self.padding == 'corner':
            return F.pad(x, [0, pad_w, 0, pad_h])
        if self.padding == 'same':
            left, top = pad_w // 2, pad_h // 2
            return F.pad(x, [left, pad_w - left, top, pad_h - top])
        return x
class PatchEmbed(BaseModule):
    """Image to Patch Embedding.

    A convolution layer projects the image to an embedding feature map,
    which is then flattened into a sequence of patch tokens.

    Args:
        in_channels (int): The num of input channels. Default: 3
        embed_dims (int): The dimensions of embedding. Default: 768
        conv_type (str): The type of convolution
            to generate patch embedding. Default: "Conv2d".
        kernel_size (int | tuple): The kernel_size of embedding conv.
            Default: 16.
        stride (int | tuple | None): The slide stride of embedding conv.
            When it is None, `kernel_size` is used instead. Default: 16.
        padding (int | tuple | string): The padding length of
            embedding conv. When it is a string, it means the mode
            of adaptive padding, support "same" and "corner" now.
            Default: "corner".
        dilation (int | tuple): The dilation rate of embedding conv.
            Default: 1.
        bias (bool): Bias of embed conv. Default: True.
        norm_cfg (dict, optional): Config dict for normalization layer.
            Default: None.
        input_size (int | tuple | None): The expected size of input. When
            given, `init_input_size` and `init_out_size` are computed
            from it at construction; otherwise both are None.
            Default: None.
        init_cfg (`mmcv.ConfigDict`, optional): The Config for
            initialization. Default: None.
    """

    def __init__(self,
                 in_channels=3,
                 embed_dims=768,
                 conv_type='Conv2d',
                 kernel_size=16,
                 stride=16,
                 padding='corner',
                 dilation=1,
                 bias=True,
                 norm_cfg=None,
                 input_size=None,
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)

        self.embed_dims = embed_dims

        # Normalise every spatial setting to an (h, w) pair.
        kernel_size = to_2tuple(kernel_size)
        stride = to_2tuple(kernel_size if stride is None else stride)
        dilation = to_2tuple(dilation)

        if isinstance(padding, str):
            # A string selects an adaptive padding mode; the padding is then
            # applied by a separate module and the conv itself pads nothing.
            self.adaptive_padding = AdaptivePadding(
                kernel_size=kernel_size,
                stride=stride,
                dilation=dilation,
                padding=padding)
            padding = 0
        else:
            self.adaptive_padding = None
        padding = to_2tuple(padding)

        conv_cfg = dict(type=conv_type)
        self.projection = build_conv_layer(
            conv_cfg,
            in_channels=in_channels,
            out_channels=embed_dims,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

        # build_norm_layer returns (name, layer); only the layer is kept.
        self.norm = (None if norm_cfg is None else build_norm_layer(
            norm_cfg, embed_dims)[1])

        if not input_size:
            self.init_input_size = None
            self.init_out_size = None
            return

        input_size = to_2tuple(input_size)
        # `init_out_size` would be used outside to
        # calculate the num_patches
        # e.g. when `use_abs_pos_embed` outside
        self.init_input_size = input_size
        if self.adaptive_padding:
            pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size)
            input_h, input_w = input_size
            input_size = (input_h + pad_h, input_w + pad_w)

        # Output-size formula from
        # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
        self.init_out_size = tuple(
            (input_size[axis] + 2 * padding[axis] - dilation[axis] *
             (kernel_size[axis] - 1) - 1) // stride[axis] + 1
            for axis in (0, 1))

    def forward(self, x):
        """
        Args:
            x (Tensor): Has shape (B, C, H, W). In most case, C is 3.

        Returns:
            tuple: Contains merged results and its spatial shape.

                - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
                - out_size (tuple[int]): Spatial shape of x, arrange as
                    (out_h, out_w).
        """
        if self.adaptive_padding:
            x = self.adaptive_padding(x)

        feat_map = self.projection(x)
        out_size = (feat_map.shape[2], feat_map.shape[3])
        # (B, C, out_h, out_w) -> (B, out_h * out_w, C)
        tokens = feat_map.flatten(2).transpose(1, 2)
        if self.norm is None:
            return tokens, out_size
        return self.norm(tokens), out_size
class PatchMerging(BaseModule):
    """Merge patch feature map.

    This layer groups feature map by kernel_size, and applies norm and
    linear layers to the grouped feature map (used in Swin Transformer).
    Our implementation uses `nn.Unfold` to
    merge patches, which is about 25% faster than the original
    implementation. However, we need to modify pretrained
    models for compatibility.

    Args:
        in_channels (int): The num of input channels.
        out_channels (int): The num of output channels.
        kernel_size (int | tuple, optional): the kernel size in the unfold
            layer. Defaults to 2.
        stride (int | tuple, optional): the stride of the sliding blocks in
            the unfold layer. Default: None. (Would be set as `kernel_size`)
        padding (int | tuple | string ): The padding length of
            the unfold layer. When it is a string, it means the mode
            of adaptive padding, support "same" and "corner" now.
            Default: "corner".
        dilation (int | tuple, optional): dilation parameter in the unfold
            layer. Default: 1.
        bias (bool, optional): Whether to add bias in linear layer or not.
            Defaults: False.
        norm_cfg (dict, optional): Config dict for normalization layer.
            Default: dict(type='LN').
        init_cfg (dict, optional): The extra config for initialization.
            Default: None.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=2,
                 stride=None,
                 padding='corner',
                 dilation=1,
                 bias=False,
                 norm_cfg=dict(type='LN'),
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        self.in_channels = in_channels
        self.out_channels = out_channels

        # A falsy stride (e.g. None) means "use the kernel size".
        stride = to_2tuple(stride or kernel_size)
        kernel_size = to_2tuple(kernel_size)
        dilation = to_2tuple(dilation)

        if isinstance(padding, str):
            # A string selects an adaptive padding mode; the padding is then
            # applied by a separate module and the unfold pads nothing.
            self.adaptive_padding = AdaptivePadding(
                kernel_size=kernel_size,
                stride=stride,
                dilation=dilation,
                padding=padding)
            padding = 0
        else:
            self.adaptive_padding = None
        padding = to_2tuple(padding)

        self.sampler = nn.Unfold(
            kernel_size=kernel_size,
            dilation=dilation,
            padding=padding,
            stride=stride)

        # Each unfolded column holds one kernel window over all channels.
        sample_dim = kernel_size[0] * kernel_size[1] * in_channels

        # build_norm_layer returns (name, layer); only the layer is kept.
        self.norm = (None if norm_cfg is None else build_norm_layer(
            norm_cfg, sample_dim)[1])

        self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)

    def forward(self, x, input_size):
        """
        Args:
            x (Tensor): Has shape (B, H*W, C_in).
            input_size (tuple[int]): The spatial shape of x, arrange as
                (H, W).

        Returns:
            tuple: Contains merged results and its spatial shape.

                - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
                - out_size (tuple[int]): Spatial shape of x, arrange as
                    (Merged_H, Merged_W).
        """
        B, L, C = x.shape
        assert isinstance(input_size, Sequence), \
            f'Expect input_size is `Sequence` but get {input_size}'

        H, W = input_size
        assert L == H * W, 'input feature has wrong size'

        # (B, H*W, C) -> (B, C, H, W)
        feat_map = x.view(B, H, W, C).permute([0, 3, 1, 2])

        if self.adaptive_padding:
            feat_map = self.adaptive_padding(feat_map)
            H, W = feat_map.shape[-2:]

        # Use nn.Unfold to merge patch. About 25% faster than original method,
        # but need to modify pretrained model for compatibility
        # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)
        merged = self.sampler(feat_map)

        # Sliding-window output-size formula, applied to H and then W with
        # the settings stored on the unfold layer.
        sampler = self.sampler
        output_size = tuple(
            (size + 2 * sampler.padding[axis] - sampler.dilation[axis] *
             (sampler.kernel_size[axis] - 1) - 1) // sampler.stride[axis] + 1
            for axis, size in enumerate((H, W)))

        merged = merged.transpose(1, 2)  # B, H/2*W/2, 4*C
        if self.norm:
            merged = self.norm(merged)
        merged = self.reduction(merged)
        return merged, output_size
@
ATTENTION
.
register_module
()
class
MultiheadAttention
(
BaseModule
):
"""A wrapper for ``torch.nn.MultiheadAttention``.
...
...
@@ -154,9 +501,9 @@ class MultiheadAttention(BaseModule):
Returns:
Tensor: forwarded results with shape
[num_queries, bs, embed_dims]
if self.batch_first is False, else
[bs, num_queries embed_dims].
[num_queries, bs, embed_dims]
if self.batch_first is False, else
[bs, num_queries embed_dims].
"""
if
key
is
None
:
...
...
tests/test_cnn/test_transformer.py
View file @
001f3f66
...
...
@@ -4,12 +4,470 @@ import pytest
import
torch
from
mmcv.cnn.bricks.drop
import
DropPath
from
mmcv.cnn.bricks.transformer
import
(
FFN
,
BaseTransformerLayer
,
MultiheadAttention
,
from
mmcv.cnn.bricks.transformer
import
(
FFN
,
AdaptivePadding
,
BaseTransformerLayer
,
MultiheadAttention
,
PatchEmbed
,
PatchMerging
,
TransformerLayerSequence
)
from
mmcv.runner
import
ModuleList
def test_adaptive_padding():
    """Check the padded spatial shape produced by both padding modes."""
    # Each row: kernel_size, stride, dilation, input (H, W), padded (H, W)
    shape_cases = [
        # padding to divisible by 16
        (16, 16, 1, (15, 17), (16, 32)),
        (16, 16, 1, (16, 17), (16, 32)),
        # padding to divisible by 2
        ((2, 2), (2, 2), (1, 1), (11, 13), (12, 14)),
        # no padding
        ((2, 2), (10, 10), (1, 1), (10, 13), (10, 13)),
        # all padding
        ((11, 11), (10, 10), (1, 1), (11, 13), (21, 21)),
        # kernel (4, 5) with dilation 2 actually spans (7, 9) ...
        ((4, 5), (3, 4), (2, 2), (11, 13), (16, 21)),
        # ... so it must pad exactly like a plain (7, 9) kernel
        ((7, 9), (3, 4), (1, 1), (11, 13), (16, 21)),
    ]

    for mode in ('same', 'corner'):
        padded_shapes = []
        for kernel_size, stride, dilation, in_hw, out_hw in shape_cases:
            layer = AdaptivePadding(
                kernel_size=kernel_size,
                stride=stride,
                dilation=dilation,
                padding=mode)
            padded = layer(torch.rand(1, 1, *in_hw))
            assert (padded.shape[2], padded.shape[3]) == out_hw
            padded_shapes.append(padded.shape)

        # the dilated (4, 5) kernel and the (7, 9) kernel give equal shapes
        dilation_shape, kernel79_shape = padded_shapes[-2:]
        assert kernel79_shape == dilation_shape

        # assert only support "same" "corner"
        with pytest.raises(AssertionError):
            AdaptivePadding(
                kernel_size=(7, 9),
                stride=(3, 4),
                dilation=(1, 1),
                padding=1)
def test_patch_embed():
    """Check PatchEmbed output shapes, reported sizes and `init_out_size`.

    Covers integer padding (with and without dilation, norm and
    `input_size`) and both adaptive padding modes.
    """
    B = 2
    C = 3
    embed_dims = 10

    # 3x3 kernel, stride 1, no padding, no dilation
    H = 3
    W = 4
    kernel_size = 3
    stride = 1
    dummy_input = torch.rand(B, C, H, W)
    patch_embed_1 = PatchEmbed(
        in_channels=C,
        embed_dims=embed_dims,
        kernel_size=kernel_size,
        stride=stride,
        padding=0,
        dilation=1,
        norm_cfg=None)

    x1, shape = patch_embed_1(dummy_input)
    # test out shape
    assert x1.shape == (2, 2, 10)
    # test outsize is correct
    assert shape == (1, 2)
    # test L = out_h * out_w
    assert shape[0] * shape[1] == x1.shape[1]

    # test dilation
    H = 10
    W = 10
    kernel_size = 5
    stride = 2
    dummy_input = torch.rand(B, C, H, W)
    patch_embed_2 = PatchEmbed(
        in_channels=C,
        embed_dims=embed_dims,
        kernel_size=kernel_size,
        stride=stride,
        padding=0,
        dilation=2,
        norm_cfg=None,
    )

    x2, shape = patch_embed_2(dummy_input)
    # test out shape
    assert x2.shape == (2, 1, 10)
    # test outsize is correct
    assert shape == (1, 1)
    # test L = out_h * out_w
    assert shape[0] * shape[1] == x2.shape[1]

    # test stride and norm
    input_size = (10, 10)
    dummy_input = torch.rand(B, C, H, W)
    patch_embed_3 = PatchEmbed(
        in_channels=C,
        embed_dims=embed_dims,
        kernel_size=kernel_size,
        stride=stride,
        padding=0,
        dilation=2,
        norm_cfg=dict(type='LN'),
        input_size=input_size)

    x3, shape = patch_embed_3(dummy_input)
    # test out shape
    assert x3.shape == (2, 1, 10)
    # test outsize is correct
    assert shape == (1, 1)
    # test L = out_h * out_w
    assert shape[0] * shape[1] == x3.shape[1]

    # test the init_out_size against the sliding-window formula; a 5x5
    # kernel with dilation 2 spans 2 * 4 + 1 pixels. Each axis is compared
    # with its own input length.
    assert patch_embed_3.init_out_size[0] == (input_size[0] - 2 * 4 -
                                              1) // 2 + 1
    assert patch_embed_3.init_out_size[1] == (input_size[1] - 2 * 4 -
                                              1) // 2 + 1

    # non-square input
    H = 11
    W = 12
    input_size = (H, W)
    dummy_input = torch.rand(B, C, H, W)
    patch_embed_4 = PatchEmbed(
        in_channels=C,
        embed_dims=embed_dims,
        kernel_size=kernel_size,
        stride=stride,
        padding=0,
        dilation=2,
        norm_cfg=dict(type='LN'),
        input_size=input_size)

    _, shape = patch_embed_4(dummy_input)
    # when input_size equal to real input
    # the out_size should be equal to `init_out_size`
    assert shape == patch_embed_4.init_out_size

    # test adap padding
    # Each row: input (H, W), kernel_size, stride, num patches, out (H, W)
    adaptive_cases = [
        # test stride is 1
        ((5, 5), (5, 5), (1, 1), 25, (5, 5)),
        # test kernel_size == stride
        ((5, 5), (5, 5), (5, 5), 1, (1, 1)),
        # test kernel_size == stride, input not divisible by the kernel
        ((6, 5), (5, 5), (5, 5), 2, (2, 1)),
        # test different kernel_size with different stride
        ((6, 5), (6, 2), (6, 2), 3, (1, 3)),
    ]
    in_c = 2
    embed_dims = 3
    for padding in ('same', 'corner'):
        for (input_size, kernel_size, stride, num_patches,
             expected_size) in adaptive_cases:
            x = torch.rand(B, in_c, *input_size)
            patch_embed = PatchEmbed(
                in_channels=in_c,
                embed_dims=embed_dims,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
                dilation=1,
                bias=False)

            x_out, out_size = patch_embed(x)
            assert x_out.size() == (B, num_patches, 3)
            assert out_size == expected_size
            assert x_out.size(1) == out_size[0] * out_size[1]
def test_patch_merging():
    """Check PatchMerging output shapes for int and adaptive padding."""
    # Test the model with int padding
    # Each row: in_c, out_c, kernel_size, stride, padding, dilation,
    #           expected tensor size, expected out_size
    int_padding_cases = [
        (3, 4, 3, 3, 1, 1, (1, 16, 4), (4, 4)),
        (4, 5, 6, 3, 2, 2, (1, 4, 5), (2, 2)),
    ]
    for (in_c, out_c, kernel_size, stride, padding, dilation, expected_shape,
         expected_size) in int_padding_cases:
        merger = PatchMerging(
            in_channels=in_c,
            out_channels=out_c,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=False)
        input_size = (10, 10)
        tokens = torch.rand(1, 100, in_c)
        x_out, out_size = merger(tokens, input_size)
        assert x_out.size() == expected_shape
        assert out_size == expected_size
        # assert out size is consistent with real output
        assert x_out.size(1) == out_size[0] * out_size[1]

    # Test with adaptive padding
    # Each row: input (H, W), kernel_size, stride, num patches, out (H, W)
    adaptive_cases = [
        # test stride is 1
        ((5, 5), (5, 5), (1, 1), 25, (5, 5)),
        # test kernel_size == stride
        ((5, 5), (5, 5), (5, 5), 1, (1, 1)),
        # test kernel_size == stride, input not divisible by the kernel
        ((6, 5), (5, 5), (5, 5), 2, (2, 1)),
        # test different kernel_size with different stride
        ((6, 5), (6, 2), (6, 2), 3, (1, 3)),
    ]
    in_c = 2
    out_c = 3
    B = 2
    for mode in ('same', 'corner'):
        for (input_size, kernel_size, stride, num_patches,
             expected_size) in adaptive_cases:
            seq_len = input_size[0] * input_size[1]
            tokens = torch.rand(B, seq_len, in_c)
            merger = PatchMerging(
                in_channels=in_c,
                out_channels=out_c,
                kernel_size=kernel_size,
                stride=stride,
                padding=mode,
                dilation=1,
                bias=False)
            x_out, out_size = merger(tokens, input_size)
            assert x_out.size() == (B, num_patches, 3)
            assert out_size == expected_size
            assert x_out.size(1) == out_size[0] * out_size[1]
def
test_multiheadattention
():
MultiheadAttention
(
embed_dims
=
5
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment