ModelZoo / InstructBLIP_pytorch / Commits

Commit c04f261a, authored Aug 22, 2024 by dongchy920

    InstructBLIP

Pipeline #1594 canceled with stages
Changes: 421 | Pipelines: 1
Showing 20 changed files with 2621 additions and 0 deletions (+2621, -0)
lavis/common/annotator/uniformer/mmcv/cnn/bricks/conv.py                               +44   -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/conv2d_adaptive_padding.py            +62   -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/conv_module.py                        +206  -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/conv_ws.py                            +148  -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/depthwise_separable_conv_module.py    +96   -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/drop.py                               +65   -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/generalized_attention.py              +412  -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/hsigmoid.py                           +34   -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/hswish.py                             +29   -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/non_local.py                          +306  -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/norm.py                               +144  -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/padding.py                            +36   -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/plugin.py                             +88   -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/registry.py                           +16   -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/scale.py                              +21   -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/swish.py                              +25   -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/transformer.py                        +595  -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/upsample.py                           +84   -0
lavis/common/annotator/uniformer/mmcv/cnn/bricks/wrappers.py                           +180  -0
lavis/common/annotator/uniformer/mmcv/cnn/builder.py                                   +30   -0
Too many changes to show. To preserve performance, only 421 of 421+ files are displayed.
lavis/common/annotator/uniformer/mmcv/cnn/bricks/conv.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
from torch import nn

from .registry import CONV_LAYERS

CONV_LAYERS.register_module('Conv1d', module=nn.Conv1d)
CONV_LAYERS.register_module('Conv2d', module=nn.Conv2d)
CONV_LAYERS.register_module('Conv3d', module=nn.Conv3d)
CONV_LAYERS.register_module('Conv', module=nn.Conv2d)


def build_conv_layer(cfg, *args, **kwargs):
    """Build convolution layer.

    Args:
        cfg (None or dict): The conv layer config, which should contain:
            - type (str): Layer type.
            - layer args: Args needed to instantiate an conv layer.
        args (argument list): Arguments passed to the `__init__`
            method of the corresponding conv layer.
        kwargs (keyword arguments): Keyword arguments passed to the `__init__`
            method of the corresponding conv layer.

    Returns:
        nn.Module: Created conv layer.
    """
    if cfg is None:
        cfg_ = dict(type='Conv2d')
    else:
        if not isinstance(cfg, dict):
            raise TypeError('cfg must be a dict')
        if 'type' not in cfg:
            raise KeyError('the cfg dict must contain the key "type"')
        cfg_ = cfg.copy()

    layer_type = cfg_.pop('type')
    if layer_type not in CONV_LAYERS:
        raise KeyError(f'Unrecognized norm type {layer_type}')
    else:
        conv_layer = CONV_LAYERS.get(layer_type)

    layer = conv_layer(*args, **kwargs, **cfg_)

    return layer
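A minimal usage sketch for build_conv_layer, assuming the bricks package is importable as annotator.uniformer.mmcv.cnn.bricks (the absolute import path used elsewhere in this commit); the cfg and tensor shapes are illustrative only.

import torch
from annotator.uniformer.mmcv.cnn.bricks.conv import build_conv_layer

# cfg=None falls back to a plain nn.Conv2d; a dict picks a registered type.
conv = build_conv_layer(
    dict(type='Conv2d'), in_channels=3, out_channels=16,
    kernel_size=3, padding=1)
out = conv(torch.randn(1, 3, 32, 32))  # -> torch.Size([1, 16, 32, 32])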
lavis/common/annotator/uniformer/mmcv/cnn/bricks/conv2d_adaptive_padding.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import math

from torch import nn
from torch.nn import functional as F

from .registry import CONV_LAYERS


@CONV_LAYERS.register_module()
class Conv2dAdaptivePadding(nn.Conv2d):
    """Implementation of 2D convolution in tensorflow with `padding` as "same",
    which applies padding to input (if needed) so that input image gets fully
    covered by filter and stride you specified. For stride 1, this will ensure
    that output image size is same as input. For stride of 2, output dimensions
    will be half, for example.

    Args:
        in_channels (int): Number of channels in the input image
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int or tuple, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of
            the input. Default: 0
        dilation (int or tuple, optional): Spacing between kernel elements.
            Default: 1
        groups (int, optional): Number of blocked connections from input
            channels to output channels. Default: 1
        bias (bool, optional): If ``True``, adds a learnable bias to the
            output. Default: ``True``
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True):
        super().__init__(in_channels, out_channels, kernel_size, stride, 0,
                         dilation, groups, bias)

    def forward(self, x):
        img_h, img_w = x.size()[-2:]
        kernel_h, kernel_w = self.weight.size()[-2:]
        stride_h, stride_w = self.stride
        output_h = math.ceil(img_h / stride_h)
        output_w = math.ceil(img_w / stride_w)
        pad_h = (
            max((output_h - 1) * self.stride[0] +
                (kernel_h - 1) * self.dilation[0] + 1 - img_h, 0))
        pad_w = (
            max((output_w - 1) * self.stride[1] +
                (kernel_w - 1) * self.dilation[1] + 1 - img_w, 0))
        if pad_h > 0 or pad_w > 0:
            x = F.pad(x, [
                pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
            ])
        return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
                        self.dilation, self.groups)
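A small check of the "same"-padding behaviour described above (a sketch only; the import path is assumed as in the previous example): with stride 2 the spatial size is ceil-divided regardless of kernel size.

import torch
from annotator.uniformer.mmcv.cnn.bricks.conv2d_adaptive_padding import \
    Conv2dAdaptivePadding

conv = Conv2dAdaptivePadding(3, 8, kernel_size=3, stride=2)
y = conv(torch.randn(1, 3, 33, 33))
print(y.shape)  # torch.Size([1, 8, 17, 17]), i.e. ceil(33 / 2)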
lavis/common/annotator/uniformer/mmcv/cnn/bricks/conv_module.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import warnings

import torch.nn as nn

from annotator.uniformer.mmcv.utils import _BatchNorm, _InstanceNorm
from ..utils import constant_init, kaiming_init
from .activation import build_activation_layer
from .conv import build_conv_layer
from .norm import build_norm_layer
from .padding import build_padding_layer
from .registry import PLUGIN_LAYERS


@PLUGIN_LAYERS.register_module()
class ConvModule(nn.Module):
    """A conv block that bundles conv/norm/activation layers.

    This block simplifies the usage of convolution layers, which are commonly
    used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
    It is based upon three build methods: `build_conv_layer()`,
    `build_norm_layer()` and `build_activation_layer()`.

    Besides, we add some additional features in this module.
    1. Automatically set `bias` of the conv layer.
    2. Spectral norm is supported.
    3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only
    supports zero and circular padding, and we add "reflect" padding mode.

    Args:
        in_channels (int): Number of channels in the input feature map.
            Same as that in ``nn._ConvNd``.
        out_channels (int): Number of channels produced by the convolution.
            Same as that in ``nn._ConvNd``.
        kernel_size (int | tuple[int]): Size of the convolving kernel.
            Same as that in ``nn._ConvNd``.
        stride (int | tuple[int]): Stride of the convolution.
            Same as that in ``nn._ConvNd``.
        padding (int | tuple[int]): Zero-padding added to both sides of
            the input. Same as that in ``nn._ConvNd``.
        dilation (int | tuple[int]): Spacing between kernel elements.
            Same as that in ``nn._ConvNd``.
        groups (int): Number of blocked connections from input channels to
            output channels. Same as that in ``nn._ConvNd``.
        bias (bool | str): If specified as `auto`, it will be decided by the
            norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise
            False. Default: "auto".
        conv_cfg (dict): Config dict for convolution layer. Default: None,
            which means using conv2d.
        norm_cfg (dict): Config dict for normalization layer. Default: None.
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='ReLU').
        inplace (bool): Whether to use inplace mode for activation.
            Default: True.
        with_spectral_norm (bool): Whether use spectral norm in conv module.
            Default: False.
        padding_mode (str): If the `padding_mode` has not been supported by
            current `Conv2d` in PyTorch, we will use our own padding layer
            instead. Currently, we support ['zeros', 'circular'] with official
            implementation and ['reflect'] with our own implementation.
            Default: 'zeros'.
        order (tuple[str]): The order of conv/norm/activation layers. It is a
            sequence of "conv", "norm" and "act". Common examples are
            ("conv", "norm", "act") and ("act", "conv", "norm").
            Default: ('conv', 'norm', 'act').
    """

    _abbr_ = 'conv_block'

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias='auto',
                 conv_cfg=None,
                 norm_cfg=None,
                 act_cfg=dict(type='ReLU'),
                 inplace=True,
                 with_spectral_norm=False,
                 padding_mode='zeros',
                 order=('conv', 'norm', 'act')):
        super(ConvModule, self).__init__()
        assert conv_cfg is None or isinstance(conv_cfg, dict)
        assert norm_cfg is None or isinstance(norm_cfg, dict)
        assert act_cfg is None or isinstance(act_cfg, dict)
        official_padding_mode = ['zeros', 'circular']
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.inplace = inplace
        self.with_spectral_norm = with_spectral_norm
        self.with_explicit_padding = padding_mode not in official_padding_mode
        self.order = order
        assert isinstance(self.order, tuple) and len(self.order) == 3
        assert set(order) == set(['conv', 'norm', 'act'])

        self.with_norm = norm_cfg is not None
        self.with_activation = act_cfg is not None
        # if the conv layer is before a norm layer, bias is unnecessary.
        if bias == 'auto':
            bias = not self.with_norm
        self.with_bias = bias

        if self.with_explicit_padding:
            pad_cfg = dict(type=padding_mode)
            self.padding_layer = build_padding_layer(pad_cfg, padding)

        # reset padding to 0 for conv module
        conv_padding = 0 if self.with_explicit_padding else padding
        # build convolution layer
        self.conv = build_conv_layer(
            conv_cfg,
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=conv_padding,
            dilation=dilation,
            groups=groups,
            bias=bias)
        # export the attributes of self.conv to a higher level for convenience
        self.in_channels = self.conv.in_channels
        self.out_channels = self.conv.out_channels
        self.kernel_size = self.conv.kernel_size
        self.stride = self.conv.stride
        self.padding = padding
        self.dilation = self.conv.dilation
        self.transposed = self.conv.transposed
        self.output_padding = self.conv.output_padding
        self.groups = self.conv.groups

        if self.with_spectral_norm:
            self.conv = nn.utils.spectral_norm(self.conv)

        # build normalization layers
        if self.with_norm:
            # norm layer is after conv layer
            if order.index('norm') > order.index('conv'):
                norm_channels = out_channels
            else:
                norm_channels = in_channels
            self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels)
            self.add_module(self.norm_name, norm)
            if self.with_bias:
                if isinstance(norm, (_BatchNorm, _InstanceNorm)):
                    warnings.warn(
                        'Unnecessary conv bias before batch/instance norm')
        else:
            self.norm_name = None

        # build activation layer
        if self.with_activation:
            act_cfg_ = act_cfg.copy()
            # nn.Tanh has no 'inplace' argument
            if act_cfg_['type'] not in [
                    'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish'
            ]:
                act_cfg_.setdefault('inplace', inplace)
            self.activate = build_activation_layer(act_cfg_)

        # Use msra init by default
        self.init_weights()

    @property
    def norm(self):
        if self.norm_name:
            return getattr(self, self.norm_name)
        else:
            return None

    def init_weights(self):
        # 1. It is mainly for customized conv layers with their own
        #    initialization manners by calling their own ``init_weights()``,
        #    and we do not want ConvModule to override the initialization.
        # 2. For customized conv layers without their own initialization
        #    manners (that is, they don't have their own ``init_weights()``)
        #    and PyTorch's conv layers, they will be initialized by
        #    this method with default ``kaiming_init``.
        # Note: For PyTorch's conv layers, they will be overwritten by our
        #    initialization implementation using default ``kaiming_init``.
        if not hasattr(self.conv, 'init_weights'):
            if self.with_activation and self.act_cfg['type'] == 'LeakyReLU':
                nonlinearity = 'leaky_relu'
                a = self.act_cfg.get('negative_slope', 0.01)
            else:
                nonlinearity = 'relu'
                a = 0
            kaiming_init(self.conv, a=a, nonlinearity=nonlinearity)
        if self.with_norm:
            constant_init(self.norm, 1, bias=0)

    def forward(self, x, activate=True, norm=True):
        for layer in self.order:
            if layer == 'conv':
                if self.with_explicit_padding:
                    x = self.padding_layer(x)
                x = self.conv(x)
            elif layer == 'norm' and norm and self.with_norm:
                x = self.norm(x)
            elif layer == 'act' and activate and self.with_activation:
                x = self.activate(x)
        return x
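A minimal sketch of the conv/norm/act bundle, assuming the full bricks package (including activation.py, which this commit imports but is not shown in the files above) is importable under the same path as before:

import torch
from annotator.uniformer.mmcv.cnn.bricks.conv_module import ConvModule

# Conv2d -> BatchNorm2d -> ReLU; bias='auto' disables the conv bias because
# a norm layer follows.
block = ConvModule(3, 16, 3, padding=1,
                   norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'))
out = block(torch.randn(2, 3, 32, 32))  # -> torch.Size([2, 16, 32, 32])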
lavis/common/annotator/uniformer/mmcv/cnn/bricks/conv_ws.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F

from .registry import CONV_LAYERS


def conv_ws_2d(input,
               weight,
               bias=None,
               stride=1,
               padding=0,
               dilation=1,
               groups=1,
               eps=1e-5):
    c_in = weight.size(0)
    weight_flat = weight.view(c_in, -1)
    mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1)
    std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1)
    weight = (weight - mean) / (std + eps)
    return F.conv2d(input, weight, bias, stride, padding, dilation, groups)


@CONV_LAYERS.register_module('ConvWS')
class ConvWS2d(nn.Conv2d):

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 eps=1e-5):
        super(ConvWS2d, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias)
        self.eps = eps

    def forward(self, x):
        return conv_ws_2d(x, self.weight, self.bias, self.stride,
                          self.padding, self.dilation, self.groups, self.eps)


@CONV_LAYERS.register_module(name='ConvAWS')
class ConvAWS2d(nn.Conv2d):
    """AWS (Adaptive Weight Standardization)

    This is a variant of Weight Standardization
    (https://arxiv.org/pdf/1903.10520.pdf)
    It is used in DetectoRS to avoid NaN
    (https://arxiv.org/pdf/2006.02334.pdf)

    Args:
        in_channels (int): Number of channels in the input image
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the conv kernel
        stride (int or tuple, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of
            the input. Default: 0
        dilation (int or tuple, optional): Spacing between kernel elements.
            Default: 1
        groups (int, optional): Number of blocked connections from input
            channels to output channels. Default: 1
        bias (bool, optional): If set True, adds a learnable bias to the
            output. Default: True
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True):
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias)
        self.register_buffer('weight_gamma',
                             torch.ones(self.out_channels, 1, 1, 1))
        self.register_buffer('weight_beta',
                             torch.zeros(self.out_channels, 1, 1, 1))

    def _get_weight(self, weight):
        weight_flat = weight.view(weight.size(0), -1)
        mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
        std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
        weight = (weight - mean) / std
        weight = self.weight_gamma * weight + self.weight_beta
        return weight

    def forward(self, x):
        weight = self._get_weight(self.weight)
        return F.conv2d(x, weight, self.bias, self.stride, self.padding,
                        self.dilation, self.groups)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata,
                              strict, missing_keys, unexpected_keys,
                              error_msgs):
        """Override default load function.

        AWS overrides the function _load_from_state_dict to recover
        weight_gamma and weight_beta if they are missing. If weight_gamma and
        weight_beta are found in the checkpoint, this function will return
        after super()._load_from_state_dict. Otherwise, it will compute the
        mean and std of the pretrained weights and store them in weight_beta
        and weight_gamma.
        """
        self.weight_gamma.data.fill_(-1)
        local_missing_keys = []
        super()._load_from_state_dict(state_dict, prefix, local_metadata,
                                      strict, local_missing_keys,
                                      unexpected_keys, error_msgs)
        if self.weight_gamma.data.mean() > 0:
            for k in local_missing_keys:
                missing_keys.append(k)
            return
        weight = self.weight.data
        weight_flat = weight.view(weight.size(0), -1)
        mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
        std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
        self.weight_beta.data.copy_(mean)
        self.weight_gamma.data.copy_(std)
        missing_gamma_beta = [
            k for k in local_missing_keys
            if k.endswith('weight_gamma') or k.endswith('weight_beta')
        ]
        for k in missing_gamma_beta:
            local_missing_keys.remove(k)
        for k in local_missing_keys:
            missing_keys.append(k)
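Weight standardization normalizes each output filter to zero mean and (eps-stabilized) unit std before the convolution is applied. A brief drop-in usage sketch, with the import path assumed as in the earlier examples:

import torch
from annotator.uniformer.mmcv.cnn.bricks.conv_ws import ConvWS2d, conv_ws_2d

conv = ConvWS2d(3, 16, 3, padding=1)
y = conv(torch.randn(1, 3, 32, 32))          # same interface as nn.Conv2d

# The functional form standardizes the given weight on every call.
w = torch.randn(16, 3, 3, 3)
y2 = conv_ws_2d(torch.randn(1, 3, 32, 32), w, stride=1, padding=1)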
lavis/common/annotator/uniformer/mmcv/cnn/bricks/depthwise_separable_conv_module.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn

from .conv_module import ConvModule


class DepthwiseSeparableConvModule(nn.Module):
    """Depthwise separable convolution module.

    See https://arxiv.org/pdf/1704.04861.pdf for details.

    This module can replace a ConvModule with the conv block replaced by two
    conv block: depthwise conv block and pointwise conv block. The depthwise
    conv block contains depthwise-conv/norm/activation layers. The pointwise
    conv block contains pointwise-conv/norm/activation layers. It should be
    noted that there will be norm/activation layer in the depthwise conv block
    if `norm_cfg` and `act_cfg` are specified.

    Args:
        in_channels (int): Number of channels in the input feature map.
            Same as that in ``nn._ConvNd``.
        out_channels (int): Number of channels produced by the convolution.
            Same as that in ``nn._ConvNd``.
        kernel_size (int | tuple[int]): Size of the convolving kernel.
            Same as that in ``nn._ConvNd``.
        stride (int | tuple[int]): Stride of the convolution.
            Same as that in ``nn._ConvNd``. Default: 1.
        padding (int | tuple[int]): Zero-padding added to both sides of
            the input. Same as that in ``nn._ConvNd``. Default: 0.
        dilation (int | tuple[int]): Spacing between kernel elements.
            Same as that in ``nn._ConvNd``. Default: 1.
        norm_cfg (dict): Default norm config for both depthwise ConvModule and
            pointwise ConvModule. Default: None.
        act_cfg (dict): Default activation config for both depthwise ConvModule
            and pointwise ConvModule. Default: dict(type='ReLU').
        dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is
            'default', it will be the same as `norm_cfg`. Default: 'default'.
        dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is
            'default', it will be the same as `act_cfg`. Default: 'default'.
        pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is
            'default', it will be the same as `norm_cfg`. Default: 'default'.
        pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is
            'default', it will be the same as `act_cfg`. Default: 'default'.
        kwargs (optional): Other shared arguments for depthwise and pointwise
            ConvModule. See ConvModule for ref.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 norm_cfg=None,
                 act_cfg=dict(type='ReLU'),
                 dw_norm_cfg='default',
                 dw_act_cfg='default',
                 pw_norm_cfg='default',
                 pw_act_cfg='default',
                 **kwargs):
        super(DepthwiseSeparableConvModule, self).__init__()
        assert 'groups' not in kwargs, 'groups should not be specified'

        # if norm/activation config of depthwise/pointwise ConvModule is not
        # specified, use default config.
        dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg
        dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg
        pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg
        pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg

        # depthwise convolution
        self.depthwise_conv = ConvModule(
            in_channels,
            in_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            norm_cfg=dw_norm_cfg,
            act_cfg=dw_act_cfg,
            **kwargs)

        self.pointwise_conv = ConvModule(
            in_channels,
            out_channels,
            1,
            norm_cfg=pw_norm_cfg,
            act_cfg=pw_act_cfg,
            **kwargs)

    def forward(self, x):
        x = self.depthwise_conv(x)
        x = self.pointwise_conv(x)
        return x
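A quick sketch of the depthwise + pointwise factorization above (illustrative shapes; import path assumed as before):

import torch
from annotator.uniformer.mmcv.cnn.bricks.depthwise_separable_conv_module import \
    DepthwiseSeparableConvModule

# 3x3 depthwise conv (groups=in_channels) followed by a 1x1 pointwise conv.
dsconv = DepthwiseSeparableConvModule(32, 64, 3, padding=1,
                                      norm_cfg=dict(type='BN'))
out = dsconv(torch.randn(1, 32, 56, 56))  # -> torch.Size([1, 64, 56, 56])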
lavis/common/annotator/uniformer/mmcv/cnn/bricks/drop.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn

from annotator.uniformer.mmcv import build_from_cfg
from .registry import DROPOUT_LAYERS


def drop_path(x, drop_prob=0., training=False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of
    residual blocks).

    We follow the implementation
    https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py  # noqa: E501
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    # handle tensors with different dimensions, not just 4D tensors.
    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(
        shape, dtype=x.dtype, device=x.device)
    output = x.div(keep_prob) * random_tensor.floor()
    return output


@DROPOUT_LAYERS.register_module()
class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of
    residual blocks).

    We follow the implementation
    https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py  # noqa: E501

    Args:
        drop_prob (float): Probability of the path to be zeroed. Default: 0.1
    """

    def __init__(self, drop_prob=0.1):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)


@DROPOUT_LAYERS.register_module()
class Dropout(nn.Dropout):
    """A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of
    ``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with
    ``DropPath``

    Args:
        drop_prob (float): Probability of the elements to be
            zeroed. Default: 0.5.
        inplace (bool): Do the operation inplace or not. Default: False.
    """

    def __init__(self, drop_prob=0.5, inplace=False):
        super().__init__(p=drop_prob, inplace=inplace)


def build_dropout(cfg, default_args=None):
    """Builder for drop out layers."""
    return build_from_cfg(cfg, DROPOUT_LAYERS, default_args)
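A short sketch of stochastic depth as implemented above: in training mode each sample's branch is zeroed with probability drop_prob and the survivors are rescaled by 1/keep_prob; at inference the module is the identity (import path assumed as before):

import torch
from annotator.uniformer.mmcv.cnn.bricks.drop import build_dropout

dp = build_dropout(dict(type='DropPath', drop_prob=0.2))  # same as DropPath(0.2)
dp.train()
x = torch.ones(8, 16, 4, 4)
y = dp(x)            # on average ~20% of the 8 samples are zeroed, rest scaled by 1/0.8
dp.eval()
assert torch.equal(dp(x), x)  # identity at inference time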
lavis/common/annotator/uniformer/mmcv/cnn/bricks/generalized_attention.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..utils import kaiming_init
from .registry import PLUGIN_LAYERS


@PLUGIN_LAYERS.register_module()
class GeneralizedAttention(nn.Module):
    """GeneralizedAttention module.

    See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks'
    (https://arxiv.org/abs/1711.07971) for details.

    Args:
        in_channels (int): Channels of the input feature map.
        spatial_range (int): The spatial range. -1 indicates no spatial range
            constraint. Default: -1.
        num_heads (int): The head number of empirical_attention module.
            Default: 9.
        position_embedding_dim (int): The position embedding dimension.
            Default: -1.
        position_magnitude (int): A multiplier acting on coord difference.
            Default: 1.
        kv_stride (int): The feature stride acting on key/value feature map.
            Default: 2.
        q_stride (int): The feature stride acting on query feature map.
            Default: 1.
        attention_type (str): A binary indicator string for indicating which
            items in generalized empirical_attention module are used.
            Default: '1111'.

            - '1000' indicates 'query and key content' (appr - appr) item,
            - '0100' indicates 'query content and relative position'
              (appr - position) item,
            - '0010' indicates 'key content only' (bias - appr) item,
            - '0001' indicates 'relative position only' (bias - position) item.
    """

    _abbr_ = 'gen_attention_block'

    def __init__(self,
                 in_channels,
                 spatial_range=-1,
                 num_heads=9,
                 position_embedding_dim=-1,
                 position_magnitude=1,
                 kv_stride=2,
                 q_stride=1,
                 attention_type='1111'):

        super(GeneralizedAttention, self).__init__()

        # hard range means local range for non-local operation
        self.position_embedding_dim = (
            position_embedding_dim
            if position_embedding_dim > 0 else in_channels)

        self.position_magnitude = position_magnitude
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.spatial_range = spatial_range
        self.kv_stride = kv_stride
        self.q_stride = q_stride
        self.attention_type = [bool(int(_)) for _ in attention_type]
        self.qk_embed_dim = in_channels // num_heads
        out_c = self.qk_embed_dim * num_heads

        if self.attention_type[0] or self.attention_type[1]:
            self.query_conv = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_c,
                kernel_size=1,
                bias=False)
            self.query_conv.kaiming_init = True

        if self.attention_type[0] or self.attention_type[2]:
            self.key_conv = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_c,
                kernel_size=1,
                bias=False)
            self.key_conv.kaiming_init = True

        self.v_dim = in_channels // num_heads
        self.value_conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=self.v_dim * num_heads,
            kernel_size=1,
            bias=False)
        self.value_conv.kaiming_init = True

        if self.attention_type[1] or self.attention_type[3]:
            self.appr_geom_fc_x = nn.Linear(
                self.position_embedding_dim // 2, out_c, bias=False)
            self.appr_geom_fc_x.kaiming_init = True

            self.appr_geom_fc_y = nn.Linear(
                self.position_embedding_dim // 2, out_c, bias=False)
            self.appr_geom_fc_y.kaiming_init = True

        if self.attention_type[2]:
            stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
            appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv
            self.appr_bias = nn.Parameter(appr_bias_value)

        if self.attention_type[3]:
            stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
            geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv
            self.geom_bias = nn.Parameter(geom_bias_value)

        self.proj_conv = nn.Conv2d(
            in_channels=self.v_dim * num_heads,
            out_channels=in_channels,
            kernel_size=1,
            bias=True)
        self.proj_conv.kaiming_init = True
        self.gamma = nn.Parameter(torch.zeros(1))

        if self.spatial_range >= 0:
            # only works when non local is after 3*3 conv
            if in_channels == 256:
                max_len = 84
            elif in_channels == 512:
                max_len = 42

            max_len_kv = int((max_len - 1.0) / self.kv_stride + 1)
            local_constraint_map = np.ones(
                (max_len, max_len, max_len_kv, max_len_kv), dtype=np.int)
            for iy in range(max_len):
                for ix in range(max_len):
                    local_constraint_map[
                        iy, ix,
                        max((iy - self.spatial_range) //
                            self.kv_stride, 0):min(
                                (iy + self.spatial_range + 1) //
                                self.kv_stride + 1, max_len),
                        max((ix - self.spatial_range) //
                            self.kv_stride, 0):min(
                                (ix + self.spatial_range + 1) //
                                self.kv_stride + 1, max_len)] = 0

            self.local_constraint_map = nn.Parameter(
                torch.from_numpy(local_constraint_map).byte(),
                requires_grad=False)

        if self.q_stride > 1:
            self.q_downsample = nn.AvgPool2d(
                kernel_size=1, stride=self.q_stride)
        else:
            self.q_downsample = None

        if self.kv_stride > 1:
            self.kv_downsample = nn.AvgPool2d(
                kernel_size=1, stride=self.kv_stride)
        else:
            self.kv_downsample = None

        self.init_weights()

    def get_position_embedding(self,
                               h,
                               w,
                               h_kv,
                               w_kv,
                               q_stride,
                               kv_stride,
                               device,
                               dtype,
                               feat_dim,
                               wave_length=1000):
        # the default type of Tensor is float32, leading to type mismatch
        # in fp16 mode. Cast it to support fp16 mode.
        h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype)
        h_idxs = h_idxs.view((h, 1)) * q_stride

        w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype)
        w_idxs = w_idxs.view((w, 1)) * q_stride

        h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to(
            device=device, dtype=dtype)
        h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride

        w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to(
            device=device, dtype=dtype)
        w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride

        # (h, h_kv, 1)
        h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0)
        h_diff *= self.position_magnitude

        # (w, w_kv, 1)
        w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0)
        w_diff *= self.position_magnitude

        feat_range = torch.arange(0, feat_dim / 4).to(
            device=device, dtype=dtype)

        dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype)
        dim_mat = dim_mat**((4. / feat_dim) * feat_range)
        dim_mat = dim_mat.view((1, 1, -1))

        embedding_x = torch.cat(
            ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2)

        embedding_y = torch.cat(
            ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2)

        return embedding_x, embedding_y

    def forward(self, x_input):
        num_heads = self.num_heads

        # use empirical_attention
        if self.q_downsample is not None:
            x_q = self.q_downsample(x_input)
        else:
            x_q = x_input
        n, _, h, w = x_q.shape

        if self.kv_downsample is not None:
            x_kv = self.kv_downsample(x_input)
        else:
            x_kv = x_input
        _, _, h_kv, w_kv = x_kv.shape

        if self.attention_type[0] or self.attention_type[1]:
            proj_query = self.query_conv(x_q).view(
                (n, num_heads, self.qk_embed_dim, h * w))
            proj_query = proj_query.permute(0, 1, 3, 2)

        if self.attention_type[0] or self.attention_type[2]:
            proj_key = self.key_conv(x_kv).view(
                (n, num_heads, self.qk_embed_dim, h_kv * w_kv))

        if self.attention_type[1] or self.attention_type[3]:
            position_embed_x, position_embed_y = self.get_position_embedding(
                h, w, h_kv, w_kv, self.q_stride, self.kv_stride,
                x_input.device, x_input.dtype, self.position_embedding_dim)
            # (n, num_heads, w, w_kv, dim)
            position_feat_x = self.appr_geom_fc_x(position_embed_x).\
                view(1, w, w_kv, num_heads, self.qk_embed_dim).\
                permute(0, 3, 1, 2, 4).\
                repeat(n, 1, 1, 1, 1)

            # (n, num_heads, h, h_kv, dim)
            position_feat_y = self.appr_geom_fc_y(position_embed_y).\
                view(1, h, h_kv, num_heads, self.qk_embed_dim).\
                permute(0, 3, 1, 2, 4).\
                repeat(n, 1, 1, 1, 1)

            position_feat_x /= math.sqrt(2)
            position_feat_y /= math.sqrt(2)

        # accelerate for saliency only
        if (np.sum(self.attention_type) == 1) and self.attention_type[2]:
            appr_bias = self.appr_bias.\
                view(1, num_heads, 1, self.qk_embed_dim).\
                repeat(n, 1, 1, 1)

            energy = torch.matmul(appr_bias, proj_key).\
                view(n, num_heads, 1, h_kv * w_kv)

            h = 1
            w = 1
        else:
            # (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for
            if not self.attention_type[0]:
                energy = torch.zeros(
                    n,
                    num_heads,
                    h,
                    w,
                    h_kv,
                    w_kv,
                    dtype=x_input.dtype,
                    device=x_input.device)

            # attention_type[0]: appr - appr
            # attention_type[1]: appr - position
            # attention_type[2]: bias - appr
            # attention_type[3]: bias - position
            if self.attention_type[0] or self.attention_type[2]:
                if self.attention_type[0] and self.attention_type[2]:
                    appr_bias = self.appr_bias.\
                        view(1, num_heads, 1, self.qk_embed_dim)
                    energy = torch.matmul(proj_query + appr_bias, proj_key).\
                        view(n, num_heads, h, w, h_kv, w_kv)

                elif self.attention_type[0]:
                    energy = torch.matmul(proj_query, proj_key).\
                        view(n, num_heads, h, w, h_kv, w_kv)

                elif self.attention_type[2]:
                    appr_bias = self.appr_bias.\
                        view(1, num_heads, 1, self.qk_embed_dim).\
                        repeat(n, 1, 1, 1)

                    energy += torch.matmul(appr_bias, proj_key).\
                        view(n, num_heads, 1, 1, h_kv, w_kv)

            if self.attention_type[1] or self.attention_type[3]:
                if self.attention_type[1] and self.attention_type[3]:
                    geom_bias = self.geom_bias.\
                        view(1, num_heads, 1, self.qk_embed_dim)

                    proj_query_reshape = (proj_query + geom_bias).\
                        view(n, num_heads, h, w, self.qk_embed_dim)

                    energy_x = torch.matmul(
                        proj_query_reshape.permute(0, 1, 3, 2, 4),
                        position_feat_x.permute(0, 1, 2, 4, 3))
                    energy_x = energy_x.\
                        permute(0, 1, 3, 2, 4).unsqueeze(4)

                    energy_y = torch.matmul(
                        proj_query_reshape,
                        position_feat_y.permute(0, 1, 2, 4, 3))
                    energy_y = energy_y.unsqueeze(5)

                    energy += energy_x + energy_y

                elif self.attention_type[1]:
                    proj_query_reshape = proj_query.\
                        view(n, num_heads, h, w, self.qk_embed_dim)
                    proj_query_reshape = proj_query_reshape.\
                        permute(0, 1, 3, 2, 4)
                    position_feat_x_reshape = position_feat_x.\
                        permute(0, 1, 2, 4, 3)
                    position_feat_y_reshape = position_feat_y.\
                        permute(0, 1, 2, 4, 3)

                    energy_x = torch.matmul(proj_query_reshape,
                                            position_feat_x_reshape)
                    energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4)

                    energy_y = torch.matmul(proj_query_reshape,
                                            position_feat_y_reshape)
                    energy_y = energy_y.unsqueeze(5)

                    energy += energy_x + energy_y

                elif self.attention_type[3]:
                    geom_bias = self.geom_bias.\
                        view(1, num_heads, self.qk_embed_dim, 1).\
                        repeat(n, 1, 1, 1)

                    position_feat_x_reshape = position_feat_x.\
                        view(n, num_heads, w * w_kv, self.qk_embed_dim)

                    position_feat_y_reshape = position_feat_y.\
                        view(n, num_heads, h * h_kv, self.qk_embed_dim)

                    energy_x = torch.matmul(position_feat_x_reshape, geom_bias)
                    energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv)

                    energy_y = torch.matmul(position_feat_y_reshape, geom_bias)
                    energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1)

                    energy += energy_x + energy_y

            energy = energy.view(n, num_heads, h * w, h_kv * w_kv)

        if self.spatial_range >= 0:
            cur_local_constraint_map = \
                self.local_constraint_map[:h, :w, :h_kv, :w_kv].\
                contiguous().\
                view(1, 1, h * w, h_kv * w_kv)

            energy = energy.masked_fill_(cur_local_constraint_map,
                                         float('-inf'))

        attention = F.softmax(energy, 3)

        proj_value = self.value_conv(x_kv)
        proj_value_reshape = proj_value.\
            view((n, num_heads, self.v_dim, h_kv * w_kv)).\
            permute(0, 1, 3, 2)

        out = torch.matmul(attention, proj_value_reshape).\
            permute(0, 1, 3, 2).\
            contiguous().\
            view(n, self.v_dim * self.num_heads, h, w)

        out = self.proj_conv(out)

        # output is downsampled, upsample back to input size
        if self.q_downsample is not None:
            out = F.interpolate(
                out,
                size=x_input.shape[2:],
                mode='bilinear',
                align_corners=False)

        out = self.gamma * out + x_input
        return out

    def init_weights(self):
        for m in self.modules():
            if hasattr(m, 'kaiming_init') and m.kaiming_init:
                kaiming_init(
                    m,
                    mode='fan_in',
                    nonlinearity='leaky_relu',
                    bias=0,
                    distribution='uniform',
                    a=1)
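A minimal sketch of plugging the attention block into a feature map; num_heads is chosen here to divide in_channels, and spatial_range is left at its default of -1 so no local constraint map is built (import path assumed as before, shapes illustrative only):

import torch
from annotator.uniformer.mmcv.cnn.bricks.generalized_attention import \
    GeneralizedAttention

attn = GeneralizedAttention(in_channels=256, num_heads=8,
                            attention_type='1111', kv_stride=2)
feat = torch.randn(1, 256, 32, 32)
out = attn(feat)                 # residual output, same shape as the input
assert out.shape == feat.shape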
lavis/common/annotator/uniformer/mmcv/cnn/bricks/hsigmoid.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn

from .registry import ACTIVATION_LAYERS


@ACTIVATION_LAYERS.register_module()
class HSigmoid(nn.Module):
    """Hard Sigmoid Module. Apply the hard sigmoid function:
    Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value)
    Default: Hsigmoid(x) = min(max((x + 1) / 2, 0), 1)

    Args:
        bias (float): Bias of the input feature map. Default: 1.0.
        divisor (float): Divisor of the input feature map. Default: 2.0.
        min_value (float): Lower bound value. Default: 0.0.
        max_value (float): Upper bound value. Default: 1.0.

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0):
        super(HSigmoid, self).__init__()
        self.bias = bias
        self.divisor = divisor
        assert self.divisor != 0
        self.min_value = min_value
        self.max_value = max_value

    def forward(self, x):
        x = (x + self.bias) / self.divisor

        return x.clamp_(self.min_value, self.max_value)
lavis/common/annotator/uniformer/mmcv/cnn/bricks/hswish.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn

from .registry import ACTIVATION_LAYERS


@ACTIVATION_LAYERS.register_module()
class HSwish(nn.Module):
    """Hard Swish Module.

    This module applies the hard swish function:

    .. math::
        Hswish(x) = x * ReLU6(x + 3) / 6

    Args:
        inplace (bool): can optionally do the operation in-place.
            Default: False.

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self, inplace=False):
        super(HSwish, self).__init__()
        self.act = nn.ReLU6(inplace)

    def forward(self, x):
        return x * self.act(x + 3) / 6
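Both HSigmoid and HSwish above are registered in ACTIVATION_LAYERS, so they can be picked via act_cfg in ConvModule or instantiated directly; a tiny numerical sketch (import path assumed as before):

import torch
from annotator.uniformer.mmcv.cnn.bricks.hsigmoid import HSigmoid
from annotator.uniformer.mmcv.cnn.bricks.hswish import HSwish

x = torch.tensor([-4.0, 0.0, 4.0])
print(HSigmoid()(x))  # approximately [0.0, 0.5, 1.0]
print(HSwish()(x))    # approximately [0.0, 0.0, 4.0]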
lavis/common/annotator/uniformer/mmcv/cnn/bricks/non_local.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta

import torch
import torch.nn as nn

from ..utils import constant_init, normal_init
from .conv_module import ConvModule
from .registry import PLUGIN_LAYERS


class _NonLocalNd(nn.Module, metaclass=ABCMeta):
    """Basic Non-local module.

    This module is proposed in
    "Non-local Neural Networks"
    Paper reference: https://arxiv.org/abs/1711.07971
    Code reference: https://github.com/AlexHex7/Non-local_pytorch

    Args:
        in_channels (int): Channels of the input feature map.
        reduction (int): Channel reduction ratio. Default: 2.
        use_scale (bool): Whether to scale pairwise_weight by
            `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`.
            Default: True.
        conv_cfg (None | dict): The config dict for convolution layers.
            If not specified, it will use `nn.Conv2d` for convolution layers.
            Default: None.
        norm_cfg (None | dict): The config dict for normalization layers.
            Default: None. (This parameter is only applicable to conv_out.)
        mode (str): Options are `gaussian`, `concatenation`,
            `embedded_gaussian` and `dot_product`. Default: embedded_gaussian.
    """

    def __init__(self,
                 in_channels,
                 reduction=2,
                 use_scale=True,
                 conv_cfg=None,
                 norm_cfg=None,
                 mode='embedded_gaussian',
                 **kwargs):
        super(_NonLocalNd, self).__init__()
        self.in_channels = in_channels
        self.reduction = reduction
        self.use_scale = use_scale
        self.inter_channels = max(in_channels // reduction, 1)
        self.mode = mode

        if mode not in [
                'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation'
        ]:
            raise ValueError("Mode should be in 'gaussian', 'concatenation', "
                             f"'embedded_gaussian' or 'dot_product', but got "
                             f'{mode} instead.')

        # g, theta, phi are defaulted as `nn.ConvNd`.
        # Here we use ConvModule for potential usage.
        self.g = ConvModule(
            self.in_channels,
            self.inter_channels,
            kernel_size=1,
            conv_cfg=conv_cfg,
            act_cfg=None)
        self.conv_out = ConvModule(
            self.inter_channels,
            self.in_channels,
            kernel_size=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None)

        if self.mode != 'gaussian':
            self.theta = ConvModule(
                self.in_channels,
                self.inter_channels,
                kernel_size=1,
                conv_cfg=conv_cfg,
                act_cfg=None)
            self.phi = ConvModule(
                self.in_channels,
                self.inter_channels,
                kernel_size=1,
                conv_cfg=conv_cfg,
                act_cfg=None)

        if self.mode == 'concatenation':
            self.concat_project = ConvModule(
                self.inter_channels * 2,
                1,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=False,
                act_cfg=dict(type='ReLU'))

        self.init_weights(**kwargs)

    def init_weights(self, std=0.01, zeros_init=True):
        if self.mode != 'gaussian':
            for m in [self.g, self.theta, self.phi]:
                normal_init(m.conv, std=std)
        else:
            normal_init(self.g.conv, std=std)
        if zeros_init:
            if self.conv_out.norm_cfg is None:
                constant_init(self.conv_out.conv, 0)
            else:
                constant_init(self.conv_out.norm, 0)
        else:
            if self.conv_out.norm_cfg is None:
                normal_init(self.conv_out.conv, std=std)
            else:
                normal_init(self.conv_out.norm, std=std)

    def gaussian(self, theta_x, phi_x):
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        pairwise_weight = torch.matmul(theta_x, phi_x)
        pairwise_weight = pairwise_weight.softmax(dim=-1)
        return pairwise_weight

    def embedded_gaussian(self, theta_x, phi_x):
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        pairwise_weight = torch.matmul(theta_x, phi_x)
        if self.use_scale:
            # theta_x.shape[-1] is `self.inter_channels`
            pairwise_weight /= theta_x.shape[-1]**0.5
        pairwise_weight = pairwise_weight.softmax(dim=-1)
        return pairwise_weight

    def dot_product(self, theta_x, phi_x):
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        pairwise_weight = torch.matmul(theta_x, phi_x)
        pairwise_weight /= pairwise_weight.shape[-1]
        return pairwise_weight

    def concatenation(self, theta_x, phi_x):
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        h = theta_x.size(2)
        w = phi_x.size(3)
        theta_x = theta_x.repeat(1, 1, 1, w)
        phi_x = phi_x.repeat(1, 1, h, 1)

        concat_feature = torch.cat([theta_x, phi_x], dim=1)
        pairwise_weight = self.concat_project(concat_feature)
        n, _, h, w = pairwise_weight.size()
        pairwise_weight = pairwise_weight.view(n, h, w)
        pairwise_weight /= pairwise_weight.shape[-1]

        return pairwise_weight

    def forward(self, x):
        # Assume `reduction = 1`, then `inter_channels = C`
        # or `inter_channels = C` when `mode="gaussian"`

        # NonLocal1d x: [N, C, H]
        # NonLocal2d x: [N, C, H, W]
        # NonLocal3d x: [N, C, T, H, W]
        n = x.size(0)

        # NonLocal1d g_x: [N, H, C]
        # NonLocal2d g_x: [N, HxW, C]
        # NonLocal3d g_x: [N, TxHxW, C]
        g_x = self.g(x).view(n, self.inter_channels, -1)
        g_x = g_x.permute(0, 2, 1)

        # NonLocal1d theta_x: [N, H, C], phi_x: [N, C, H]
        # NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW]
        # NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW]
        if self.mode == 'gaussian':
            theta_x = x.view(n, self.in_channels, -1)
            theta_x = theta_x.permute(0, 2, 1)
            if self.sub_sample:
                phi_x = self.phi(x).view(n, self.in_channels, -1)
            else:
                phi_x = x.view(n, self.in_channels, -1)
        elif self.mode == 'concatenation':
            theta_x = self.theta(x).view(n, self.inter_channels, -1, 1)
            phi_x = self.phi(x).view(n, self.inter_channels, 1, -1)
        else:
            theta_x = self.theta(x).view(n, self.inter_channels, -1)
            theta_x = theta_x.permute(0, 2, 1)
            phi_x = self.phi(x).view(n, self.inter_channels, -1)

        pairwise_func = getattr(self, self.mode)
        # NonLocal1d pairwise_weight: [N, H, H]
        # NonLocal2d pairwise_weight: [N, HxW, HxW]
        # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
        pairwise_weight = pairwise_func(theta_x, phi_x)

        # NonLocal1d y: [N, H, C]
        # NonLocal2d y: [N, HxW, C]
        # NonLocal3d y: [N, TxHxW, C]
        y = torch.matmul(pairwise_weight, g_x)
        # NonLocal1d y: [N, C, H]
        # NonLocal2d y: [N, C, H, W]
        # NonLocal3d y: [N, C, T, H, W]
        y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels,
                                                    *x.size()[2:])

        output = x + self.conv_out(y)

        return output


class NonLocal1d(_NonLocalNd):
    """1D Non-local module.

    Args:
        in_channels (int): Same as `NonLocalND`.
        sub_sample (bool): Whether to apply max pooling after pairwise
            function (Note that the `sub_sample` is applied on spatial only).
            Default: False.
        conv_cfg (None | dict): Same as `NonLocalND`.
            Default: dict(type='Conv1d').
    """

    def __init__(self,
                 in_channels,
                 sub_sample=False,
                 conv_cfg=dict(type='Conv1d'),
                 **kwargs):
        super(NonLocal1d, self).__init__(
            in_channels, conv_cfg=conv_cfg, **kwargs)

        self.sub_sample = sub_sample

        if sub_sample:
            max_pool_layer = nn.MaxPool1d(kernel_size=2)
            self.g = nn.Sequential(self.g, max_pool_layer)
            if self.mode != 'gaussian':
                self.phi = nn.Sequential(self.phi, max_pool_layer)
            else:
                self.phi = max_pool_layer


@PLUGIN_LAYERS.register_module()
class NonLocal2d(_NonLocalNd):
    """2D Non-local module.

    Args:
        in_channels (int): Same as `NonLocalND`.
        sub_sample (bool): Whether to apply max pooling after pairwise
            function (Note that the `sub_sample` is applied on spatial only).
            Default: False.
        conv_cfg (None | dict): Same as `NonLocalND`.
            Default: dict(type='Conv2d').
    """

    _abbr_ = 'nonlocal_block'

    def __init__(self,
                 in_channels,
                 sub_sample=False,
                 conv_cfg=dict(type='Conv2d'),
                 **kwargs):
        super(NonLocal2d, self).__init__(
            in_channels, conv_cfg=conv_cfg, **kwargs)

        self.sub_sample = sub_sample

        if sub_sample:
            max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
            self.g = nn.Sequential(self.g, max_pool_layer)
            if self.mode != 'gaussian':
                self.phi = nn.Sequential(self.phi, max_pool_layer)
            else:
                self.phi = max_pool_layer


class NonLocal3d(_NonLocalNd):
    """3D Non-local module.

    Args:
        in_channels (int): Same as `NonLocalND`.
        sub_sample (bool): Whether to apply max pooling after pairwise
            function (Note that the `sub_sample` is applied on spatial only).
            Default: False.
        conv_cfg (None | dict): Same as `NonLocalND`.
            Default: dict(type='Conv3d').
    """

    def __init__(self,
                 in_channels,
                 sub_sample=False,
                 conv_cfg=dict(type='Conv3d'),
                 **kwargs):
        super(NonLocal3d, self).__init__(
            in_channels, conv_cfg=conv_cfg, **kwargs)
        self.sub_sample = sub_sample

        if sub_sample:
            max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
            self.g = nn.Sequential(self.g, max_pool_layer)
            if self.mode != 'gaussian':
                self.phi = nn.Sequential(self.phi, max_pool_layer)
            else:
                self.phi = max_pool_layer
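A minimal sketch of the 2D non-local block in its default embedded_gaussian mode; the output is a residual refinement with the same shape as the input (import path assumed as before):

import torch
from annotator.uniformer.mmcv.cnn.bricks.non_local import NonLocal2d

block = NonLocal2d(in_channels=64, reduction=2, sub_sample=True)
x = torch.randn(2, 64, 20, 20)
out = block(x)                 # x + conv_out(attention-weighted values)
assert out.shape == x.shape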
lavis/common/annotator/uniformer/mmcv/cnn/bricks/norm.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import inspect

import torch.nn as nn

from annotator.uniformer.mmcv.utils import is_tuple_of
from annotator.uniformer.mmcv.utils.parrots_wrapper import (
    SyncBatchNorm, _BatchNorm, _InstanceNorm)
from .registry import NORM_LAYERS

NORM_LAYERS.register_module('BN', module=nn.BatchNorm2d)
NORM_LAYERS.register_module('BN1d', module=nn.BatchNorm1d)
NORM_LAYERS.register_module('BN2d', module=nn.BatchNorm2d)
NORM_LAYERS.register_module('BN3d', module=nn.BatchNorm3d)
NORM_LAYERS.register_module('SyncBN', module=SyncBatchNorm)
NORM_LAYERS.register_module('GN', module=nn.GroupNorm)
NORM_LAYERS.register_module('LN', module=nn.LayerNorm)
NORM_LAYERS.register_module('IN', module=nn.InstanceNorm2d)
NORM_LAYERS.register_module('IN1d', module=nn.InstanceNorm1d)
NORM_LAYERS.register_module('IN2d', module=nn.InstanceNorm2d)
NORM_LAYERS.register_module('IN3d', module=nn.InstanceNorm3d)


def infer_abbr(class_type):
    """Infer abbreviation from the class name.

    When we build a norm layer with `build_norm_layer()`, we want to preserve
    the norm type in variable names, e.g, self.bn1, self.gn. This method will
    infer the abbreviation to map class types to abbreviations.

    Rule 1: If the class has the property "_abbr_", return the property.
    Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or
    InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and
    "in" respectively.
    Rule 3: If the class name contains "batch", "group", "layer" or "instance",
    the abbreviation of this layer will be "bn", "gn", "ln" and "in"
    respectively.
    Rule 4: Otherwise, the abbreviation falls back to "norm".

    Args:
        class_type (type): The norm layer type.

    Returns:
        str: The inferred abbreviation.
    """
    if not inspect.isclass(class_type):
        raise TypeError(
            f'class_type must be a type, but got {type(class_type)}')
    if hasattr(class_type, '_abbr_'):
        return class_type._abbr_
    if issubclass(class_type, _InstanceNorm):
        # IN is a subclass of BN
        return 'in'
    elif issubclass(class_type, _BatchNorm):
        return 'bn'
    elif issubclass(class_type, nn.GroupNorm):
        return 'gn'
    elif issubclass(class_type, nn.LayerNorm):
        return 'ln'
    else:
        class_name = class_type.__name__.lower()
        if 'batch' in class_name:
            return 'bn'
        elif 'group' in class_name:
            return 'gn'
        elif 'layer' in class_name:
            return 'ln'
        elif 'instance' in class_name:
            return 'in'
        else:
            return 'norm_layer'


def build_norm_layer(cfg, num_features, postfix=''):
    """Build normalization layer.

    Args:
        cfg (dict): The norm layer config, which should contain:
            - type (str): Layer type.
            - layer args: Args needed to instantiate a norm layer.
            - requires_grad (bool, optional): Whether stop gradient updates.
        num_features (int): Number of input channels.
        postfix (int | str): The postfix to be appended into norm abbreviation
            to create named layer.

    Returns:
        (str, nn.Module): The first element is the layer name consisting of
            abbreviation and postfix, e.g., bn1, gn. The second element is the
            created norm layer.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')
    cfg_ = cfg.copy()

    layer_type = cfg_.pop('type')
    if layer_type not in NORM_LAYERS:
        raise KeyError(f'Unrecognized norm type {layer_type}')

    norm_layer = NORM_LAYERS.get(layer_type)
    abbr = infer_abbr(norm_layer)

    assert isinstance(postfix, (int, str))
    name = abbr + str(postfix)

    requires_grad = cfg_.pop('requires_grad', True)
    cfg_.setdefault('eps', 1e-5)
    if layer_type != 'GN':
        layer = norm_layer(num_features, **cfg_)
        if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
            layer._specify_ddp_gpu_num(1)
    else:
        assert 'num_groups' in cfg_
        layer = norm_layer(num_channels=num_features, **cfg_)

    for param in layer.parameters():
        param.requires_grad = requires_grad

    return name, layer


def is_norm(layer, exclude=None):
    """Check if a layer is a normalization layer.

    Args:
        layer (nn.Module): The layer to be checked.
        exclude (type | tuple[type]): Types to be excluded.

    Returns:
        bool: Whether the layer is a norm layer.
    """
    if exclude is not None:
        if not isinstance(exclude, tuple):
            exclude = (exclude, )
        if not is_tuple_of(exclude, type):
            raise TypeError(
                f'"exclude" must be either None or type or a tuple of types, '
                f'but got {type(exclude)}: {exclude}')

    if exclude and isinstance(layer, exclude):
        return False

    all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm)
    return isinstance(layer, all_norm_bases)
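A short sketch of build_norm_layer: it returns a (name, module) pair, where the name combines the inferred abbreviation with the postfix (import path assumed as before):

from annotator.uniformer.mmcv.cnn.bricks.norm import build_norm_layer

name, bn = build_norm_layer(dict(type='BN'), 64, postfix=1)
print(name)   # 'bn1'; bn is an nn.BatchNorm2d(64, eps=1e-05)

# GroupNorm takes num_channels instead of num_features, so num_groups must
# appear in the cfg.
name, gn = build_norm_layer(dict(type='GN', num_groups=8), 64)
print(name)   # 'gn'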
lavis/common/annotator/uniformer/mmcv/cnn/bricks/padding.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn

from .registry import PADDING_LAYERS

PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d)
PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d)
PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d)


def build_padding_layer(cfg, *args, **kwargs):
    """Build padding layer.

    Args:
        cfg (None or dict): The padding layer config, which should contain:
            - type (str): Layer type.
            - layer args: Args needed to instantiate a padding layer.

    Returns:
        nn.Module: Created padding layer.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')

    cfg_ = cfg.copy()
    padding_type = cfg_.pop('type')
    if padding_type not in PADDING_LAYERS:
        raise KeyError(f'Unrecognized padding type {padding_type}.')
    else:
        padding_layer = PADDING_LAYERS.get(padding_type)

    layer = padding_layer(*args, **kwargs, **cfg_)

    return layer
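A tiny sketch of build_padding_layer; positional args after the cfg are forwarded to the padding layer's constructor (import path assumed as before):

import torch
from annotator.uniformer.mmcv.cnn.bricks.padding import build_padding_layer

pad = build_padding_layer(dict(type='reflect'), 2)  # -> nn.ReflectionPad2d(2)
y = pad(torch.randn(1, 3, 8, 8))
print(y.shape)  # torch.Size([1, 3, 12, 12])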
lavis/common/annotator/uniformer/mmcv/cnn/bricks/plugin.py
0 → 100644
import inspect
import platform

from .registry import PLUGIN_LAYERS

if platform.system() == 'Windows':
    import regex as re
else:
    import re


def infer_abbr(class_type):
    """Infer abbreviation from the class name.

    This method will infer the abbreviation to map class types to
    abbreviations.

    Rule 1: If the class has the property "abbr", return the property.
    Rule 2: Otherwise, the abbreviation falls back to snake case of class
    name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``.

    Args:
        class_type (type): The norm layer type.

    Returns:
        str: The inferred abbreviation.
    """

    def camel2snack(word):
        """Convert camel case word into snack case.

        Modified from `inflection lib
        <https://inflection.readthedocs.io/en/latest/#inflection.underscore>`_.

        Example::

            >>> camel2snack("FancyBlock")
            'fancy_block'
        """

        word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word)
        word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word)
        word = word.replace('-', '_')
        return word.lower()

    if not inspect.isclass(class_type):
        raise TypeError(
            f'class_type must be a type, but got {type(class_type)}')
    if hasattr(class_type, '_abbr_'):
        return class_type._abbr_
    else:
        return camel2snack(class_type.__name__)


def build_plugin_layer(cfg, postfix='', **kwargs):
    """Build plugin layer.

    Args:
        cfg (None or dict): cfg should contain:
            type (str): identify plugin layer type.
            layer args: args needed to instantiate a plugin layer.
        postfix (int, str): appended into norm abbreviation to
            create named layer. Default: ''.

    Returns:
        tuple[str, nn.Module]:
            name (str): abbreviation + postfix
            layer (nn.Module): created plugin layer
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')
    cfg_ = cfg.copy()

    layer_type = cfg_.pop('type')
    if layer_type not in PLUGIN_LAYERS:
        raise KeyError(f'Unrecognized plugin type {layer_type}')

    plugin_layer = PLUGIN_LAYERS.get(layer_type)
    abbr = infer_abbr(plugin_layer)

    assert isinstance(postfix, (int, str))
    name = abbr + str(postfix)

    layer = plugin_layer(**kwargs, **cfg_)

    return name, layer
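A brief sketch of build_plugin_layer using a block that this commit registers in PLUGIN_LAYERS; the returned name is the inferred abbreviation plus the postfix (import path assumed as before):

from annotator.uniformer.mmcv.cnn.bricks.plugin import build_plugin_layer

# NonLocal2d defines _abbr_ = 'nonlocal_block', so that is used as the name.
name, layer = build_plugin_layer(dict(type='NonLocal2d', in_channels=64),
                                 postfix='_1')
print(name)  # 'nonlocal_block_1'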
lavis/common/annotator/uniformer/mmcv/cnn/bricks/registry.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
from
annotator.uniformer.mmcv.utils
import
Registry
CONV_LAYERS
=
Registry
(
'conv layer'
)
NORM_LAYERS
=
Registry
(
'norm layer'
)
ACTIVATION_LAYERS
=
Registry
(
'activation layer'
)
PADDING_LAYERS
=
Registry
(
'padding layer'
)
UPSAMPLE_LAYERS
=
Registry
(
'upsample layer'
)
PLUGIN_LAYERS
=
Registry
(
'plugin layer'
)
DROPOUT_LAYERS
=
Registry
(
'drop out layers'
)
POSITIONAL_ENCODING
=
Registry
(
'position encoding'
)
ATTENTION
=
Registry
(
'attention'
)
FEEDFORWARD_NETWORK
=
Registry
(
'feed-forward Network'
)
TRANSFORMER_LAYER
=
Registry
(
'transformerLayer'
)
TRANSFORMER_LAYER_SEQUENCE
=
Registry
(
'transformer-layers sequence'
)
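A sketch of how these registries are typically used: a brick class is registered once, then looked up (or built from a config dict) by name. ``MyAct`` is a hypothetical module, shown only to illustrate the pattern.

import torch.nn as nn
from annotator.uniformer.mmcv.cnn.bricks.registry import ACTIVATION_LAYERS

@ACTIVATION_LAYERS.register_module()
class MyAct(nn.Module):
    def forward(self, x):
        return x.clamp(min=0)

assert ACTIVATION_LAYERS.get('MyAct') is MyAct   # lookup by the registered name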
lavis/common/annotator/uniformer/mmcv/cnn/bricks/scale.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn


class Scale(nn.Module):
    """A learnable scale parameter.

    This layer scales the input by a learnable factor. It multiplies a
    learnable scale parameter of shape (1,) with input of any shape.

    Args:
        scale (float): Initial value of scale factor. Default: 1.0
    """

    def __init__(self, scale=1.0):
        super(Scale, self).__init__()
        self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))

    def forward(self, x):
        return x * self.scale

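A quick usage sketch (vendored import path assumed): one scalar parameter scales every element of the input, and gradients flow into ``self.scale`` during training.

import torch
from annotator.uniformer.mmcv.cnn.bricks.scale import Scale

s = Scale(scale=0.5)          # single learnable scalar, initialised to 0.5
y = s(torch.ones(2, 3))       # every element multiplied by the trainable factor
print(y)                      # tensor of 0.5s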
lavis/common/annotator/uniformer/mmcv/cnn/bricks/swish.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn

from .registry import ACTIVATION_LAYERS


@ACTIVATION_LAYERS.register_module()
class Swish(nn.Module):
    """Swish Module.

    This module applies the swish function:

    .. math::
        Swish(x) = x * Sigmoid(x)

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self):
        super(Swish, self).__init__()

    def forward(self, x):
        return x * torch.sigmoid(x)
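A minimal check of the activation itself (vendored import path assumed); because the class is registered in ``ACTIVATION_LAYERS``, it can also be built from ``dict(type='Swish')`` via the activation builder.

import torch
from annotator.uniformer.mmcv.cnn.bricks.swish import Swish

act = Swish()
x = torch.tensor([-1.0, 0.0, 1.0])
print(act(x))        # x * sigmoid(x): approximately [-0.2689, 0.0000, 0.7311]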
lavis/common/annotator/uniformer/mmcv/cnn/bricks/transformer.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import warnings

import torch
import torch.nn as nn

from annotator.uniformer.mmcv import ConfigDict, deprecated_api_warning
from annotator.uniformer.mmcv.cnn import (Linear, build_activation_layer,
                                          build_norm_layer)
from annotator.uniformer.mmcv.runner.base_module import (BaseModule,
                                                         ModuleList,
                                                         Sequential)
from annotator.uniformer.mmcv.utils import build_from_cfg
from .drop import build_dropout
from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING,
                       TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE)

# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file
try:
    from annotator.uniformer.mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention  # noqa F401
    warnings.warn(
        ImportWarning(
            '``MultiScaleDeformableAttention`` has been moved to '
            '``mmcv.ops.multi_scale_deform_attn``, please change original path '  # noqa E501
            '``from annotator.uniformer.mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` '  # noqa E501
            'to ``from annotator.uniformer.mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` '  # noqa E501
        ))

except ImportError:
    warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from '
                  '``mmcv.ops.multi_scale_deform_attn``, '
                  'You should install ``mmcv-full`` if you need this module. ')


def build_positional_encoding(cfg, default_args=None):
    """Builder for Position Encoding."""
    return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args)


def build_attention(cfg, default_args=None):
    """Builder for attention."""
    return build_from_cfg(cfg, ATTENTION, default_args)


def build_feedforward_network(cfg, default_args=None):
    """Builder for feed-forward network (FFN)."""
    return build_from_cfg(cfg, FEEDFORWARD_NETWORK, default_args)


def build_transformer_layer(cfg, default_args=None):
    """Builder for transformer layer."""
    return build_from_cfg(cfg, TRANSFORMER_LAYER, default_args)


def build_transformer_layer_sequence(cfg, default_args=None):
    """Builder for transformer encoder and transformer decoder."""
    return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args)


@ATTENTION.register_module()
class MultiheadAttention(BaseModule):
    """A wrapper for ``torch.nn.MultiheadAttention``.

    This module implements MultiheadAttention with identity connection,
    and positional encoding is also passed as input.

    Args:
        embed_dims (int): The embedding dimension.
        num_heads (int): Parallel attention heads.
        attn_drop (float): A Dropout layer on attn_output_weights.
            Default: 0.0.
        proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
            Default: 0.0.
        dropout_layer (obj:`ConfigDict`): The dropout_layer used
            when adding the shortcut.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): When it is True, Key, Query and Value are shape of
            (batch, n, embed_dim), otherwise (n, batch, embed_dim).
            Default to False.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 attn_drop=0.,
                 proj_drop=0.,
                 dropout_layer=dict(type='Dropout', drop_prob=0.),
                 init_cfg=None,
                 batch_first=False,
                 **kwargs):
        super(MultiheadAttention, self).__init__(init_cfg)
        if 'dropout' in kwargs:
            warnings.warn('The arguments `dropout` in MultiheadAttention '
                          'has been deprecated, now you can separately '
                          'set `attn_drop`(float), proj_drop(float), '
                          'and `dropout_layer`(dict) ')
            attn_drop = kwargs['dropout']
            dropout_layer['drop_prob'] = kwargs.pop('dropout')

        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.batch_first = batch_first

        self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
                                          **kwargs)

        self.proj_drop = nn.Dropout(proj_drop)
        self.dropout_layer = build_dropout(
            dropout_layer) if dropout_layer else nn.Identity()

    @deprecated_api_warning({'residual': 'identity'},
                            cls_name='MultiheadAttention')
    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_pos=None,
                attn_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `MultiheadAttention`.

        **kwargs allow passing a more general data flow when combining
        with other operations in `transformerlayer`.

        Args:
            query (Tensor): The input query with shape [num_queries, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_queries, embed_dims].
            key (Tensor): The key tensor with shape [num_keys, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_keys, embed_dims].
                If None, the ``query`` will be used. Defaults to None.
            value (Tensor): The value tensor with same shape as `key`.
                Same in `nn.MultiheadAttention.forward`. Defaults to None.
                If None, the `key` will be used.
            identity (Tensor): This tensor, with the same shape as x,
                will be used for the identity link.
                If None, `x` will be used. Defaults to None.
            query_pos (Tensor): The positional encoding for query, with
                the same shape as `x`. If not None, it will
                be added to `x` before forward function. Defaults to None.
            key_pos (Tensor): The positional encoding for `key`, with the
                same shape as `key`. Defaults to None. If not None, it will
                be added to `key` before forward function. If None, and
                `query_pos` has the same shape as `key`, then `query_pos`
                will be used for `key_pos`. Defaults to None.
            attn_mask (Tensor): ByteTensor mask with shape [num_queries,
                num_keys]. Same in `nn.MultiheadAttention.forward`.
                Defaults to None.
            key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
                Defaults to None.

        Returns:
            Tensor: forwarded results with shape
            [num_queries, bs, embed_dims]
            if self.batch_first is False, else
            [bs, num_queries, embed_dims].
        """

        if key is None:
            key = query
        if value is None:
            value = key
        if identity is None:
            identity = query
        if key_pos is None:
            if query_pos is not None:
                # use query_pos if key_pos is not available
                if query_pos.shape == key.shape:
                    key_pos = query_pos
                else:
                    warnings.warn(f'position encoding of key is '
                                  f'missing in {self.__class__.__name__}.')
        if query_pos is not None:
            query = query + query_pos
        if key_pos is not None:
            key = key + key_pos

        # Because the dataflow('key', 'query', 'value') of
        # ``torch.nn.MultiheadAttention`` is (num_query, batch,
        # embed_dims), We should adjust the shape of dataflow from
        # batch_first (batch, num_query, embed_dims) to num_query_first
        # (num_query ,batch, embed_dims), and recover ``attn_output``
        # from num_query_first to batch_first.
        if self.batch_first:
            query = query.transpose(0, 1)
            key = key.transpose(0, 1)
            value = value.transpose(0, 1)

        out = self.attn(
            query=query,
            key=key,
            value=value,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask)[0]

        if self.batch_first:
            out = out.transpose(0, 1)

        return identity + self.dropout_layer(self.proj_drop(out))


@FEEDFORWARD_NETWORK.register_module()
class FFN(BaseModule):
    """Implements feed-forward networks (FFNs) with identity connection.

    Args:
        embed_dims (int): The feature dimension. Same as
            `MultiheadAttention`. Defaults: 256.
        feedforward_channels (int): The hidden dimension of FFNs.
            Defaults: 1024.
        num_fcs (int, optional): The number of fully-connected layers in
            FFNs. Default: 2.
        act_cfg (dict, optional): The activation config for FFNs.
            Default: dict(type='ReLU')
        ffn_drop (float, optional): Probability of an element to be
            zeroed in FFN. Default 0.0.
        add_identity (bool, optional): Whether to add the
            identity connection. Default: `True`.
        dropout_layer (obj:`ConfigDict`): The dropout_layer used
            when adding the shortcut.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    @deprecated_api_warning(
        {
            'dropout': 'ffn_drop',
            'add_residual': 'add_identity'
        },
        cls_name='FFN')
    def __init__(self,
                 embed_dims=256,
                 feedforward_channels=1024,
                 num_fcs=2,
                 act_cfg=dict(type='ReLU', inplace=True),
                 ffn_drop=0.,
                 dropout_layer=None,
                 add_identity=True,
                 init_cfg=None,
                 **kwargs):
        super(FFN, self).__init__(init_cfg)
        assert num_fcs >= 2, 'num_fcs should be no less ' \
            f'than 2. got {num_fcs}.'
        self.embed_dims = embed_dims
        self.feedforward_channels = feedforward_channels
        self.num_fcs = num_fcs
        self.act_cfg = act_cfg
        self.activate = build_activation_layer(act_cfg)

        layers = []
        in_channels = embed_dims
        for _ in range(num_fcs - 1):
            layers.append(
                Sequential(
                    Linear(in_channels, feedforward_channels), self.activate,
                    nn.Dropout(ffn_drop)))
            in_channels = feedforward_channels
        layers.append(Linear(feedforward_channels, embed_dims))
        layers.append(nn.Dropout(ffn_drop))
        self.layers = Sequential(*layers)
        self.dropout_layer = build_dropout(
            dropout_layer) if dropout_layer else torch.nn.Identity()
        self.add_identity = add_identity

    @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN')
    def forward(self, x, identity=None):
        """Forward function for `FFN`.

        The function would add x to the output tensor if residue is None.
        """
        out = self.layers(x)
        if not self.add_identity:
            return self.dropout_layer(out)
        if identity is None:
            identity = x
        return identity + self.dropout_layer(out)


@TRANSFORMER_LAYER.register_module()
class BaseTransformerLayer(BaseModule):
    """Base `TransformerLayer` for vision transformer.

    It can be built from `mmcv.ConfigDict` and support more flexible
    customization, for example, using any number of `FFN or LN ` and
    use different kinds of `attention` by specifying a list of `ConfigDict`
    named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
    when you specifying `norm` as the first element of `operation_order`.
    More details about the `prenorm`: `On Layer Normalization in the
    Transformer Architecture <https://arxiv.org/abs/2002.04745>`_ .

    Args:
        attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
            Configs for `self_attention` or `cross_attention` modules,
            The order of the configs in the list should be consistent with
            corresponding attentions in operation_order.
            If it is a dict, all of the attention modules in operation_order
            will be built with this config. Default: None.
        ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
            Configs for FFN, The order of the configs in the list should be
            consistent with corresponding ffn in operation_order.
            If it is a dict, all of the attention modules in operation_order
            will be built with this config.
        operation_order (tuple[str]): The execution order of operation
            in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
            Support `prenorm` when you specifying first element as `norm`.
            Default: None.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='LN').
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): Key, Query and Value are shape
            of (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to False.
    """

    def __init__(self,
                 attn_cfgs=None,
                 ffn_cfgs=dict(
                     type='FFN',
                     embed_dims=256,
                     feedforward_channels=1024,
                     num_fcs=2,
                     ffn_drop=0.,
                     act_cfg=dict(type='ReLU', inplace=True),
                 ),
                 operation_order=None,
                 norm_cfg=dict(type='LN'),
                 init_cfg=None,
                 batch_first=False,
                 **kwargs):

        deprecated_args = dict(
            feedforward_channels='feedforward_channels',
            ffn_dropout='ffn_drop',
            ffn_num_fcs='num_fcs')
        for ori_name, new_name in deprecated_args.items():
            if ori_name in kwargs:
                warnings.warn(
                    f'The arguments `{ori_name}` in BaseTransformerLayer '
                    f'has been deprecated, now you should set `{new_name}` '
                    f'and other FFN related arguments '
                    f'to a dict named `ffn_cfgs`. ')
                ffn_cfgs[new_name] = kwargs[ori_name]

        super(BaseTransformerLayer, self).__init__(init_cfg)

        self.batch_first = batch_first

        assert set(operation_order) & set(
            ['self_attn', 'norm', 'ffn', 'cross_attn']) == \
            set(operation_order), f'The operation_order of' \
            f' {self.__class__.__name__} should ' \
            f'contains all four operation type ' \
            f"{['self_attn', 'norm', 'ffn', 'cross_attn']}"

        num_attn = operation_order.count('self_attn') + operation_order.count(
            'cross_attn')
        if isinstance(attn_cfgs, dict):
            attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
        else:
            assert num_attn == len(attn_cfgs), f'The length ' \
                f'of attn_cfg {num_attn} is ' \
                f'not consistent with the number of attention' \
                f'in operation_order {operation_order}.'

        self.num_attn = num_attn
        self.operation_order = operation_order
        self.norm_cfg = norm_cfg
        self.pre_norm = operation_order[0] == 'norm'
        self.attentions = ModuleList()

        index = 0
        for operation_name in operation_order:
            if operation_name in ['self_attn', 'cross_attn']:
                if 'batch_first' in attn_cfgs[index]:
                    assert self.batch_first == attn_cfgs[index]['batch_first']
                else:
                    attn_cfgs[index]['batch_first'] = self.batch_first
                attention = build_attention(attn_cfgs[index])
                # Some custom attentions used as `self_attn`
                # or `cross_attn` can have different behavior.
                attention.operation_name = operation_name
                self.attentions.append(attention)
                index += 1

        self.embed_dims = self.attentions[0].embed_dims

        self.ffns = ModuleList()
        num_ffns = operation_order.count('ffn')
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = ConfigDict(ffn_cfgs)
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
        assert len(ffn_cfgs) == num_ffns
        for ffn_index in range(num_ffns):
            if 'embed_dims' not in ffn_cfgs[ffn_index]:
                ffn_cfgs['embed_dims'] = self.embed_dims
            else:
                assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
            self.ffns.append(
                build_feedforward_network(ffn_cfgs[ffn_index],
                                          dict(type='FFN')))

        self.norms = ModuleList()
        num_norms = operation_order.count('norm')
        for _ in range(num_norms):
            self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])

    def forward(self,
                query,
                key=None,
                value=None,
                query_pos=None,
                key_pos=None,
                attn_masks=None,
                query_key_padding_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `TransformerDecoderLayer`.

        **kwargs contains some specific arguments of attentions.

        Args:
            query (Tensor): The input query with shape
                [num_queries, bs, embed_dims] if
                self.batch_first is False, else
                [bs, num_queries, embed_dims].
            key (Tensor): The key tensor with shape [num_keys, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_keys, embed_dims].
            value (Tensor): The value tensor with same shape as `key`.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`.
                Default: None.
            attn_masks (List[Tensor] | None): 2D Tensor used in
                calculation of corresponding attention. The length of
                it should equal to the number of `attention` in
                `operation_order`. Default: None.
            query_key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_queries]. Only used in `self_attn` layer.
                Defaults to None.
            key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_keys]. Default: None.

        Returns:
            Tensor: forwarded results with shape [num_queries, bs, embed_dims].
        """

        norm_index = 0
        attn_index = 0
        ffn_index = 0
        identity = query
        if attn_masks is None:
            attn_masks = [None for _ in range(self.num_attn)]
        elif isinstance(attn_masks, torch.Tensor):
            attn_masks = [
                copy.deepcopy(attn_masks) for _ in range(self.num_attn)
            ]
            warnings.warn(f'Use same attn_mask in all attentions in '
                          f'{self.__class__.__name__} ')
        else:
            assert len(attn_masks) == self.num_attn, f'The length of ' \
                f'attn_masks {len(attn_masks)} must be equal ' \
                f'to the number of attention in ' \
                f'operation_order {self.num_attn}'

        for layer in self.operation_order:
            if layer == 'self_attn':
                temp_key = temp_value = query
                query = self.attentions[attn_index](
                    query,
                    temp_key,
                    temp_value,
                    identity if self.pre_norm else None,
                    query_pos=query_pos,
                    key_pos=query_pos,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=query_key_padding_mask,
                    **kwargs)
                attn_index += 1
                identity = query

            elif layer == 'norm':
                query = self.norms[norm_index](query)
                norm_index += 1

            elif layer == 'cross_attn':
                query = self.attentions[attn_index](
                    query,
                    key,
                    value,
                    identity if self.pre_norm else None,
                    query_pos=query_pos,
                    key_pos=key_pos,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=key_padding_mask,
                    **kwargs)
                attn_index += 1
                identity = query

            elif layer == 'ffn':
                query = self.ffns[ffn_index](
                    query, identity if self.pre_norm else None)
                ffn_index += 1

        return query


@TRANSFORMER_LAYER_SEQUENCE.register_module()
class TransformerLayerSequence(BaseModule):
    """Base class for TransformerEncoder and TransformerDecoder in vision
    transformer.

    As base-class of Encoder and Decoder in vision transformer.
    Support customization such as specifying different kind
    of `transformer_layer` in `transformer_coder`.

    Args:
        transformerlayer (list[obj:`mmcv.ConfigDict`] |
            obj:`mmcv.ConfigDict`): Config of transformerlayer
            in TransformerCoder. If it is obj:`mmcv.ConfigDict`,
            it would be repeated `num_layer` times to a
            list[`mmcv.ConfigDict`]. Default: None.
        num_layers (int): The number of `TransformerLayer`. Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None):
        super(TransformerLayerSequence, self).__init__(init_cfg)
        if isinstance(transformerlayers, dict):
            transformerlayers = [
                copy.deepcopy(transformerlayers) for _ in range(num_layers)
            ]
        else:
            assert isinstance(transformerlayers, list) and \
                   len(transformerlayers) == num_layers
        self.num_layers = num_layers
        self.layers = ModuleList()
        for i in range(num_layers):
            self.layers.append(build_transformer_layer(transformerlayers[i]))
        self.embed_dims = self.layers[0].embed_dims
        self.pre_norm = self.layers[0].pre_norm

    def forward(self,
                query,
                key,
                value,
                query_pos=None,
                key_pos=None,
                attn_masks=None,
                query_key_padding_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `TransformerCoder`.

        Args:
            query (Tensor): Input query with shape
                `(num_queries, bs, embed_dims)`.
            key (Tensor): The key tensor with shape
                `(num_keys, bs, embed_dims)`.
            value (Tensor): The value tensor with shape
                `(num_keys, bs, embed_dims)`.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`.
                Default: None.
            attn_masks (List[Tensor], optional): Each element is 2D Tensor
                which is used in calculation of corresponding attention in
                operation_order. Default: None.
            query_key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_queries]. Only used in self-attention
                Default: None.
            key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_keys]. Default: None.

        Returns:
            Tensor: results with shape [num_queries, bs, embed_dims].
        """
        for layer in self.layers:
            query = layer(
                query,
                key,
                value,
                query_pos=query_pos,
                key_pos=key_pos,
                attn_masks=attn_masks,
                query_key_padding_mask=query_key_padding_mask,
                key_padding_mask=key_padding_mask,
                **kwargs)
        return query
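A config-driven sketch of one post-norm encoder-style layer, assuming the vendored modules import cleanly; shapes follow the default ``batch_first=False`` convention ``(num_queries, batch, embed_dims)``.

import torch
from annotator.uniformer.mmcv.cnn.bricks.transformer import BaseTransformerLayer

layer = BaseTransformerLayer(
    attn_cfgs=dict(type='MultiheadAttention', embed_dims=256, num_heads=8),
    ffn_cfgs=dict(type='FFN', embed_dims=256, feedforward_channels=1024),
    operation_order=('self_attn', 'norm', 'ffn', 'norm'))

query = torch.randn(100, 2, 256)   # (num_queries, batch, embed_dims)
out = layer(query)                 # same shape: (100, 2, 256)

Placing 'norm' first in ``operation_order`` would instead build the pre-norm variant described in the docstring.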
lavis/common/annotator/uniformer/mmcv/cnn/bricks/upsample.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn
import torch.nn.functional as F

from ..utils import xavier_init
from .registry import UPSAMPLE_LAYERS

UPSAMPLE_LAYERS.register_module('nearest', module=nn.Upsample)
UPSAMPLE_LAYERS.register_module('bilinear', module=nn.Upsample)


@UPSAMPLE_LAYERS.register_module(name='pixel_shuffle')
class PixelShufflePack(nn.Module):
    """Pixel Shuffle upsample layer.

    This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to
    achieve a simple upsampling with pixel shuffle.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        scale_factor (int): Upsample ratio.
        upsample_kernel (int): Kernel size of the conv layer to expand the
            channels.
    """

    def __init__(self, in_channels, out_channels, scale_factor,
                 upsample_kernel):
        super(PixelShufflePack, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.scale_factor = scale_factor
        self.upsample_kernel = upsample_kernel
        self.upsample_conv = nn.Conv2d(
            self.in_channels,
            self.out_channels * scale_factor * scale_factor,
            self.upsample_kernel,
            padding=(self.upsample_kernel - 1) // 2)
        self.init_weights()

    def init_weights(self):
        xavier_init(self.upsample_conv, distribution='uniform')

    def forward(self, x):
        x = self.upsample_conv(x)
        x = F.pixel_shuffle(x, self.scale_factor)
        return x


def build_upsample_layer(cfg, *args, **kwargs):
    """Build upsample layer.

    Args:
        cfg (dict): The upsample layer config, which should contain:
            - type (str): Layer type.
            - scale_factor (int): Upsample ratio, which is not applicable to
              deconv.
            - layer args: Args needed to instantiate a upsample layer.
        args (argument list): Arguments passed to the ``__init__``
            method of the corresponding conv layer.
        kwargs (keyword arguments): Keyword arguments passed to the
            ``__init__`` method of the corresponding conv layer.

    Returns:
        nn.Module: Created upsample layer.
    """
    if not isinstance(cfg, dict):
        raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
    if 'type' not in cfg:
        raise KeyError(
            f'the cfg dict must contain the key "type", but got {cfg}')
    cfg_ = cfg.copy()

    layer_type = cfg_.pop('type')
    if layer_type not in UPSAMPLE_LAYERS:
        raise KeyError(f'Unrecognized upsample type {layer_type}')
    else:
        upsample = UPSAMPLE_LAYERS.get(layer_type)

    if upsample is nn.Upsample:
        cfg_['mode'] = layer_type
    layer = upsample(*args, **kwargs, **cfg_)
    return layer
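A short usage sketch (vendored import path assumed): ``pixel_shuffle`` builds the conv-plus-shuffle block, while the ``nearest``/``bilinear`` names reuse ``nn.Upsample`` with the registered name passed through as the interpolation mode.

import torch
from annotator.uniformer.mmcv.cnn.bricks.upsample import build_upsample_layer

up = build_upsample_layer(
    dict(type='pixel_shuffle', in_channels=16, out_channels=16,
         scale_factor=2, upsample_kernel=3))
x = torch.randn(1, 16, 8, 8)
print(up(x).shape)                                         # torch.Size([1, 16, 16, 16])

bilinear = build_upsample_layer(dict(type='bilinear', scale_factor=2))
print(bilinear(x).shape)                                   # torch.Size([1, 16, 16, 16])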
lavis/common/annotator/uniformer/mmcv/cnn/bricks/wrappers.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py  # noqa: E501

Wrap some nn modules to support empty tensor input. Currently, these wrappers
are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask
heads are trained on only positive RoIs.
"""
import math

import torch
import torch.nn as nn
from torch.nn.modules.utils import _pair, _triple

from .registry import CONV_LAYERS, UPSAMPLE_LAYERS

if torch.__version__ == 'parrots':
    TORCH_VERSION = torch.__version__
else:
    # torch.__version__ could be 1.3.1+cu92, we only need the first two
    # for comparison
    TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2])


def obsolete_torch_version(torch_version, version_threshold):
    return torch_version == 'parrots' or torch_version <= version_threshold


class NewEmptyTensorOp(torch.autograd.Function):

    @staticmethod
    def forward(ctx, x, new_shape):
        ctx.shape = x.shape
        return x.new_empty(new_shape)

    @staticmethod
    def backward(ctx, grad):
        shape = ctx.shape
        return NewEmptyTensorOp.apply(grad, shape), None


@CONV_LAYERS.register_module('Conv', force=True)
class Conv2d(nn.Conv2d):

    def forward(self, x):
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size,
                                     self.padding, self.stride,
                                     self.dilation):
                o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1
                out_shape.append(o)
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
                # produce dummy gradient to avoid DDP warning.
                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
                return empty + dummy
            else:
                return empty

        return super().forward(x)


@CONV_LAYERS.register_module('Conv3d', force=True)
class Conv3d(nn.Conv3d):

    def forward(self, x):
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size,
                                     self.padding, self.stride,
                                     self.dilation):
                o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1
                out_shape.append(o)
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
                # produce dummy gradient to avoid DDP warning.
                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
                return empty + dummy
            else:
                return empty

        return super().forward(x)


@CONV_LAYERS.register_module()
@CONV_LAYERS.register_module('deconv')
@UPSAMPLE_LAYERS.register_module('deconv', force=True)
class ConvTranspose2d(nn.ConvTranspose2d):

    def forward(self, x):
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size,
                                         self.padding, self.stride,
                                         self.dilation, self.output_padding):
                out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
                # produce dummy gradient to avoid DDP warning.
                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
                return empty + dummy
            else:
                return empty

        return super().forward(x)


@CONV_LAYERS.register_module()
@CONV_LAYERS.register_module('deconv3d')
@UPSAMPLE_LAYERS.register_module('deconv3d', force=True)
class ConvTranspose3d(nn.ConvTranspose3d):

    def forward(self, x):
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
            out_shape = [x.shape[0], self.out_channels]
            for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size,
                                         self.padding, self.stride,
                                         self.dilation, self.output_padding):
                out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
                # produce dummy gradient to avoid DDP warning.
                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
                return empty + dummy
            else:
                return empty

        return super().forward(x)


class MaxPool2d(nn.MaxPool2d):

    def forward(self, x):
        # PyTorch 1.9 does not support empty tensor inference yet
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)):
            out_shape = list(x.shape[:2])
            for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size),
                                     _pair(self.padding), _pair(self.stride),
                                     _pair(self.dilation)):
                o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1
                o = math.ceil(o) if self.ceil_mode else math.floor(o)
                out_shape.append(o)
            empty = NewEmptyTensorOp.apply(x, out_shape)
            return empty

        return super().forward(x)


class MaxPool3d(nn.MaxPool3d):

    def forward(self, x):
        # PyTorch 1.9 does not support empty tensor inference yet
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)):
            out_shape = list(x.shape[:2])
            for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size),
                                     _triple(self.padding),
                                     _triple(self.stride),
                                     _triple(self.dilation)):
                o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1
                o = math.ceil(o) if self.ceil_mode else math.floor(o)
                out_shape.append(o)
            empty = NewEmptyTensorOp.apply(x, out_shape)
            return empty

        return super().forward(x)


class Linear(torch.nn.Linear):

    def forward(self, x):
        # empty tensor forward of Linear layer is supported in Pytorch 1.6
        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 5)):
            out_shape = [x.shape[0], self.out_features]
            empty = NewEmptyTensorOp.apply(x, out_shape)
            if self.training:
                # produce dummy gradient to avoid DDP warning.
                dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
                return empty + dummy
            else:
                return empty

        return super().forward(x)
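A sketch of the behaviour these wrappers provide: a batch with zero RoIs still yields a correctly shaped (empty) output. On old PyTorch versions the wrapper computes the output shape itself; on recent versions it simply falls through to the stock ``nn.Conv2d``, which handles empty tensors natively, so the result below is the same either way.

import torch
from annotator.uniformer.mmcv.cnn.bricks.wrappers import Conv2d

conv = Conv2d(3, 8, kernel_size=3, padding=1)
empty = torch.zeros(0, 3, 32, 32)     # e.g. no positive RoIs were sampled
print(conv(empty).shape)              # torch.Size([0, 8, 32, 32])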
lavis/common/annotator/uniformer/mmcv/cnn/builder.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
from ..runner import Sequential
from ..utils import Registry, build_from_cfg


def build_model_from_cfg(cfg, registry, default_args=None):
    """Build a PyTorch model from config dict(s). Different from
    ``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built.

    Args:
        cfg (dict, list[dict]): The config of modules, which is either a
            config dict or a list of config dicts. If cfg is a list, the
            built modules will be wrapped with ``nn.Sequential``.
        registry (:obj:`Registry`): A registry the module belongs to.
        default_args (dict, optional): Default arguments to build the module.
            Defaults to None.

    Returns:
        nn.Module: A built nn module.
    """
    if isinstance(cfg, list):
        modules = [
            build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg
        ]
        return Sequential(*modules)
    else:
        return build_from_cfg(cfg, registry, default_args)


MODELS = Registry('model', build_func=build_model_from_cfg)
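A sketch of the list-config case, calling the builder directly with the ``MODELS`` registry; ``ToyBlock`` is a hypothetical module used only to illustrate the pattern.

import torch.nn as nn
from annotator.uniformer.mmcv.cnn.builder import MODELS, build_model_from_cfg

@MODELS.register_module()
class ToyBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, 3, padding=1)

    def forward(self, x):
        return self.conv(x)

# A list of config dicts is wrapped into a Sequential of built modules.
model = build_model_from_cfg(
    [dict(type='ToyBlock', channels=8), dict(type='ToyBlock', channels=8)],
    MODELS)
print(model)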