Commit c04f261a authored by dongchy920

InstructBLIP

# Copyright (c) OpenMMLab. All rights reserved.
from torch import nn
from .registry import CONV_LAYERS
CONV_LAYERS.register_module('Conv1d', module=nn.Conv1d)
CONV_LAYERS.register_module('Conv2d', module=nn.Conv2d)
CONV_LAYERS.register_module('Conv3d', module=nn.Conv3d)
CONV_LAYERS.register_module('Conv', module=nn.Conv2d)
def build_conv_layer(cfg, *args, **kwargs):
"""Build convolution layer.
Args:
cfg (None or dict): The conv layer config, which should contain:
- type (str): Layer type.
- layer args: Args needed to instantiate a conv layer.
args (argument list): Arguments passed to the `__init__`
method of the corresponding conv layer.
kwargs (keyword arguments): Keyword arguments passed to the `__init__`
method of the corresponding conv layer.
Returns:
nn.Module: Created conv layer.
"""
if cfg is None:
cfg_ = dict(type='Conv2d')
else:
if not isinstance(cfg, dict):
raise TypeError('cfg must be a dict')
if 'type' not in cfg:
raise KeyError('the cfg dict must contain the key "type"')
cfg_ = cfg.copy()
layer_type = cfg_.pop('type')
if layer_type not in CONV_LAYERS:
raise KeyError(f'Unrecognized conv type {layer_type}')
else:
conv_layer = CONV_LAYERS.get(layer_type)
layer = conv_layer(*args, **kwargs, **cfg_)
return layer
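# A minimal usage sketch (assuming ``torch`` is installed): build a conv layer
# from a config dict via the CONV_LAYERS registry and run a dummy forward pass.
#     >>> import torch
#     >>> conv = build_conv_layer(dict(type='Conv2d'), 3, 16, 3, padding=1)
#     >>> conv(torch.rand(1, 3, 8, 8)).shape
#     torch.Size([1, 16, 8, 8])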
# Copyright (c) OpenMMLab. All rights reserved.
import math
from torch import nn
from torch.nn import functional as F
from .registry import CONV_LAYERS
@CONV_LAYERS.register_module()
class Conv2dAdaptivePadding(nn.Conv2d):
"""Implementation of 2D convolution in tensorflow with `padding` as "same",
which applies padding to input (if needed) so that input image gets fully
covered by filter and stride you specified. For stride 1, this will ensure
that output image size is same as input. For stride of 2, output dimensions
will be half, for example.
Args:
in_channels (int): Number of channels in the input image
out_channels (int): Number of channels produced by the convolution
kernel_size (int or tuple): Size of the convolving kernel
stride (int or tuple, optional): Stride of the convolution. Default: 1
padding (int or tuple, optional): Zero-padding added to both sides of
the input. Default: 0
dilation (int or tuple, optional): Spacing between kernel elements.
Default: 1
groups (int, optional): Number of blocked connections from input
channels to output channels. Default: 1
bias (bool, optional): If ``True``, adds a learnable bias to the
output. Default: ``True``
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True):
super().__init__(in_channels, out_channels, kernel_size, stride, 0,
dilation, groups, bias)
def forward(self, x):
img_h, img_w = x.size()[-2:]
kernel_h, kernel_w = self.weight.size()[-2:]
stride_h, stride_w = self.stride
output_h = math.ceil(img_h / stride_h)
output_w = math.ceil(img_w / stride_w)
pad_h = (
max((output_h - 1) * self.stride[0] +
(kernel_h - 1) * self.dilation[0] + 1 - img_h, 0))
pad_w = (
max((output_w - 1) * self.stride[1] +
(kernel_w - 1) * self.dilation[1] + 1 - img_w, 0))
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [
pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
])
return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
self.dilation, self.groups)
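# A quick sketch of the "same"-padding behaviour described above (assuming
# ``torch`` is installed): with stride 2, a 7x7 input maps to ceil(7/2) = 4.
#     >>> import torch
#     >>> conv = Conv2dAdaptivePadding(3, 8, kernel_size=3, stride=2)
#     >>> conv(torch.rand(1, 3, 7, 7)).shape
#     torch.Size([1, 8, 4, 4])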
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
import torch.nn as nn
from annotator.uniformer.mmcv.utils import _BatchNorm, _InstanceNorm
from ..utils import constant_init, kaiming_init
from .activation import build_activation_layer
from .conv import build_conv_layer
from .norm import build_norm_layer
from .padding import build_padding_layer
from .registry import PLUGIN_LAYERS
@PLUGIN_LAYERS.register_module()
class ConvModule(nn.Module):
"""A conv block that bundles conv/norm/activation layers.
This block simplifies the usage of convolution layers, which are commonly
used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
It is based upon three build methods: `build_conv_layer()`,
`build_norm_layer()` and `build_activation_layer()`.
Besides, we add some additional features in this module.
1. Automatically set `bias` of the conv layer.
2. Spectral norm is supported.
3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only
supported zero and circular padding, and we add a "reflect" padding mode.
Args:
in_channels (int): Number of channels in the input feature map.
Same as that in ``nn._ConvNd``.
out_channels (int): Number of channels produced by the convolution.
Same as that in ``nn._ConvNd``.
kernel_size (int | tuple[int]): Size of the convolving kernel.
Same as that in ``nn._ConvNd``.
stride (int | tuple[int]): Stride of the convolution.
Same as that in ``nn._ConvNd``.
padding (int | tuple[int]): Zero-padding added to both sides of
the input. Same as that in ``nn._ConvNd``.
dilation (int | tuple[int]): Spacing between kernel elements.
Same as that in ``nn._ConvNd``.
groups (int): Number of blocked connections from input channels to
output channels. Same as that in ``nn._ConvNd``.
bias (bool | str): If specified as `auto`, it will be decided by the
norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise
False. Default: "auto".
conv_cfg (dict): Config dict for convolution layer. Default: None,
which means using conv2d.
norm_cfg (dict): Config dict for normalization layer. Default: None.
act_cfg (dict): Config dict for activation layer.
Default: dict(type='ReLU').
inplace (bool): Whether to use inplace mode for activation.
Default: True.
with_spectral_norm (bool): Whether to use spectral norm in the conv module.
Default: False.
padding_mode (str): If the `padding_mode` is not supported by the
current `Conv2d` in PyTorch, we will use our own padding layer
instead. Currently, we support ['zeros', 'circular'] with the official
implementation and ['reflect'] with our own implementation.
Default: 'zeros'.
order (tuple[str]): The order of conv/norm/activation layers. It is a
sequence of "conv", "norm" and "act". Common examples are
("conv", "norm", "act") and ("act", "conv", "norm").
Default: ('conv', 'norm', 'act').
"""
_abbr_ = 'conv_block'
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias='auto',
conv_cfg=None,
norm_cfg=None,
act_cfg=dict(type='ReLU'),
inplace=True,
with_spectral_norm=False,
padding_mode='zeros',
order=('conv', 'norm', 'act')):
super(ConvModule, self).__init__()
assert conv_cfg is None or isinstance(conv_cfg, dict)
assert norm_cfg is None or isinstance(norm_cfg, dict)
assert act_cfg is None or isinstance(act_cfg, dict)
official_padding_mode = ['zeros', 'circular']
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg
self.inplace = inplace
self.with_spectral_norm = with_spectral_norm
self.with_explicit_padding = padding_mode not in official_padding_mode
self.order = order
assert isinstance(self.order, tuple) and len(self.order) == 3
assert set(order) == set(['conv', 'norm', 'act'])
self.with_norm = norm_cfg is not None
self.with_activation = act_cfg is not None
# if the conv layer is before a norm layer, bias is unnecessary.
if bias == 'auto':
bias = not self.with_norm
self.with_bias = bias
if self.with_explicit_padding:
pad_cfg = dict(type=padding_mode)
self.padding_layer = build_padding_layer(pad_cfg, padding)
# reset padding to 0 for conv module
conv_padding = 0 if self.with_explicit_padding else padding
# build convolution layer
self.conv = build_conv_layer(
conv_cfg,
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=conv_padding,
dilation=dilation,
groups=groups,
bias=bias)
# export the attributes of self.conv to a higher level for convenience
self.in_channels = self.conv.in_channels
self.out_channels = self.conv.out_channels
self.kernel_size = self.conv.kernel_size
self.stride = self.conv.stride
self.padding = padding
self.dilation = self.conv.dilation
self.transposed = self.conv.transposed
self.output_padding = self.conv.output_padding
self.groups = self.conv.groups
if self.with_spectral_norm:
self.conv = nn.utils.spectral_norm(self.conv)
# build normalization layers
if self.with_norm:
# norm layer is after conv layer
if order.index('norm') > order.index('conv'):
norm_channels = out_channels
else:
norm_channels = in_channels
self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels)
self.add_module(self.norm_name, norm)
if self.with_bias:
if isinstance(norm, (_BatchNorm, _InstanceNorm)):
warnings.warn(
'Unnecessary conv bias before batch/instance norm')
else:
self.norm_name = None
# build activation layer
if self.with_activation:
act_cfg_ = act_cfg.copy()
# nn.Tanh has no 'inplace' argument
if act_cfg_['type'] not in [
'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish'
]:
act_cfg_.setdefault('inplace', inplace)
self.activate = build_activation_layer(act_cfg_)
# Use msra init by default
self.init_weights()
@property
def norm(self):
if self.norm_name:
return getattr(self, self.norm_name)
else:
return None
def init_weights(self):
# 1. It is mainly for customized conv layers with their own
# initialization manners by calling their own ``init_weights()``,
# and we do not want ConvModule to override the initialization.
# 2. For customized conv layers without their own initialization
# manners (that is, they don't have their own ``init_weights()``)
# and PyTorch's conv layers, they will be initialized by
# this method with default ``kaiming_init``.
# Note: For PyTorch's conv layers, they will be overwritten by our
# initialization implementation using default ``kaiming_init``.
if not hasattr(self.conv, 'init_weights'):
if self.with_activation and self.act_cfg['type'] == 'LeakyReLU':
nonlinearity = 'leaky_relu'
a = self.act_cfg.get('negative_slope', 0.01)
else:
nonlinearity = 'relu'
a = 0
kaiming_init(self.conv, a=a, nonlinearity=nonlinearity)
if self.with_norm:
constant_init(self.norm, 1, bias=0)
def forward(self, x, activate=True, norm=True):
for layer in self.order:
if layer == 'conv':
if self.with_explicit_padding:
x = self.padding_layer(x)
x = self.conv(x)
elif layer == 'norm' and norm and self.with_norm:
x = self.norm(x)
elif layer == 'act' and activate and self.with_activation:
x = self.activate(x)
return x
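# A minimal usage sketch (assuming ``torch`` is installed): conv + BN + ReLU in
# the default ('conv', 'norm', 'act') order; since a norm layer is present,
# ``bias='auto'`` resolves to False.
#     >>> import torch
#     >>> m = ConvModule(3, 16, 3, padding=1, norm_cfg=dict(type='BN'))
#     >>> m(torch.rand(1, 3, 8, 8)).shape
#     torch.Size([1, 16, 8, 8])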
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F
from .registry import CONV_LAYERS
def conv_ws_2d(input,
weight,
bias=None,
stride=1,
padding=0,
dilation=1,
groups=1,
eps=1e-5):
c_in = weight.size(0)
weight_flat = weight.view(c_in, -1)
mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1)
std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1)
weight = (weight - mean) / (std + eps)
return F.conv2d(input, weight, bias, stride, padding, dilation, groups)
@CONV_LAYERS.register_module('ConvWS')
class ConvWS2d(nn.Conv2d):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
eps=1e-5):
super(ConvWS2d, self).__init__(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias)
self.eps = eps
def forward(self, x):
return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding,
self.dilation, self.groups, self.eps)
@CONV_LAYERS.register_module(name='ConvAWS')
class ConvAWS2d(nn.Conv2d):
"""AWS (Adaptive Weight Standardization)
This is a variant of Weight Standardization
(https://arxiv.org/pdf/1903.10520.pdf)
It is used in DetectoRS to avoid NaN
(https://arxiv.org/pdf/2006.02334.pdf)
Args:
in_channels (int): Number of channels in the input image
out_channels (int): Number of channels produced by the convolution
kernel_size (int or tuple): Size of the conv kernel
stride (int or tuple, optional): Stride of the convolution. Default: 1
padding (int or tuple, optional): Zero-padding added to both sides of
the input. Default: 0
dilation (int or tuple, optional): Spacing between kernel elements.
Default: 1
groups (int, optional): Number of blocked connections from input
channels to output channels. Default: 1
bias (bool, optional): If set True, adds a learnable bias to the
output. Default: True
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True):
super().__init__(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias)
self.register_buffer('weight_gamma',
torch.ones(self.out_channels, 1, 1, 1))
self.register_buffer('weight_beta',
torch.zeros(self.out_channels, 1, 1, 1))
def _get_weight(self, weight):
weight_flat = weight.view(weight.size(0), -1)
mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
weight = (weight - mean) / std
weight = self.weight_gamma * weight + self.weight_beta
return weight
def forward(self, x):
weight = self._get_weight(self.weight)
return F.conv2d(x, weight, self.bias, self.stride, self.padding,
self.dilation, self.groups)
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
"""Override default load function.
AWS overrides the function _load_from_state_dict to recover
weight_gamma and weight_beta if they are missing. If weight_gamma and
weight_beta are found in the checkpoint, this function will return
after super()._load_from_state_dict. Otherwise, it will compute the
mean and std of the pretrained weights and store them in weight_beta
and weight_gamma.
"""
self.weight_gamma.data.fill_(-1)
local_missing_keys = []
super()._load_from_state_dict(state_dict, prefix, local_metadata,
strict, local_missing_keys,
unexpected_keys, error_msgs)
if self.weight_gamma.data.mean() > 0:
for k in local_missing_keys:
missing_keys.append(k)
return
weight = self.weight.data
weight_flat = weight.view(weight.size(0), -1)
mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
self.weight_beta.data.copy_(mean)
self.weight_gamma.data.copy_(std)
missing_gamma_beta = [
k for k in local_missing_keys
if k.endswith('weight_gamma') or k.endswith('weight_beta')
]
for k in missing_gamma_beta:
local_missing_keys.remove(k)
for k in local_missing_keys:
missing_keys.append(k)
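# A minimal usage sketch for the two weight-standardized variants above
# (assuming ``torch`` is installed); both behave like a drop-in nn.Conv2d at
# call time and only differ in how the weights are standardized.
#     >>> import torch
#     >>> x = torch.rand(2, 3, 8, 8)
#     >>> ConvWS2d(3, 8, 3, padding=1)(x).shape
#     torch.Size([2, 8, 8, 8])
#     >>> ConvAWS2d(3, 8, 3, padding=1)(x).shape
#     torch.Size([2, 8, 8, 8])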
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn
from .conv_module import ConvModule
class DepthwiseSeparableConvModule(nn.Module):
"""Depthwise separable convolution module.
See https://arxiv.org/pdf/1704.04861.pdf for details.
This module replaces the single conv block of a ConvModule with two
conv blocks: a depthwise conv block and a pointwise conv block. The
depthwise conv block contains depthwise-conv/norm/activation layers, and
the pointwise conv block contains pointwise-conv/norm/activation layers.
Note that norm/activation layers are also added to the depthwise conv
block if `norm_cfg` and `act_cfg` are specified.
Args:
in_channels (int): Number of channels in the input feature map.
Same as that in ``nn._ConvNd``.
out_channels (int): Number of channels produced by the convolution.
Same as that in ``nn._ConvNd``.
kernel_size (int | tuple[int]): Size of the convolving kernel.
Same as that in ``nn._ConvNd``.
stride (int | tuple[int]): Stride of the convolution.
Same as that in ``nn._ConvNd``. Default: 1.
padding (int | tuple[int]): Zero-padding added to both sides of
the input. Same as that in ``nn._ConvNd``. Default: 0.
dilation (int | tuple[int]): Spacing between kernel elements.
Same as that in ``nn._ConvNd``. Default: 1.
norm_cfg (dict): Default norm config for both depthwise ConvModule and
pointwise ConvModule. Default: None.
act_cfg (dict): Default activation config for both depthwise ConvModule
and pointwise ConvModule. Default: dict(type='ReLU').
dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is
'default', it will be the same as `norm_cfg`. Default: 'default'.
dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is
'default', it will be the same as `act_cfg`. Default: 'default'.
pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is
'default', it will be the same as `norm_cfg`. Default: 'default'.
pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is
'default', it will be the same as `act_cfg`. Default: 'default'.
kwargs (optional): Other shared arguments for depthwise and pointwise
ConvModule. See ConvModule for ref.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
norm_cfg=None,
act_cfg=dict(type='ReLU'),
dw_norm_cfg='default',
dw_act_cfg='default',
pw_norm_cfg='default',
pw_act_cfg='default',
**kwargs):
super(DepthwiseSeparableConvModule, self).__init__()
assert 'groups' not in kwargs, 'groups should not be specified'
# if norm/activation config of depthwise/pointwise ConvModule is not
# specified, use default config.
dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg
dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg
pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg
pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg
# depthwise convolution
self.depthwise_conv = ConvModule(
in_channels,
in_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=in_channels,
norm_cfg=dw_norm_cfg,
act_cfg=dw_act_cfg,
**kwargs)
self.pointwise_conv = ConvModule(
in_channels,
out_channels,
1,
norm_cfg=pw_norm_cfg,
act_cfg=pw_act_cfg,
**kwargs)
def forward(self, x):
x = self.depthwise_conv(x)
x = self.pointwise_conv(x)
return x
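# A minimal usage sketch (assuming ``torch`` is installed): a 3x3 depthwise
# conv (groups=in_channels) followed by a 1x1 pointwise conv.
#     >>> import torch
#     >>> m = DepthwiseSeparableConvModule(3, 16, 3, padding=1)
#     >>> m(torch.rand(1, 3, 8, 8)).shape
#     torch.Size([1, 16, 8, 8])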
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
from annotator.uniformer.mmcv import build_from_cfg
from .registry import DROPOUT_LAYERS
def drop_path(x, drop_prob=0., training=False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of
residual blocks).
We follow the implementation
https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
"""
if drop_prob == 0. or not training:
return x
keep_prob = 1 - drop_prob
# handle tensors with different dimensions, not just 4D tensors.
shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
random_tensor = keep_prob + torch.rand(
shape, dtype=x.dtype, device=x.device)
output = x.div(keep_prob) * random_tensor.floor()
return output
@DROPOUT_LAYERS.register_module()
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of
residual blocks).
We follow the implementation
https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
Args:
drop_prob (float): Probability of the path to be zeroed. Default: 0.1
"""
def __init__(self, drop_prob=0.1):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
@DROPOUT_LAYERS.register_module()
class Dropout(nn.Dropout):
"""A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of
``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with
``DropPath``
Args:
drop_prob (float): Probability of the elements to be
zeroed. Default: 0.5.
inplace (bool): Do the operation inplace or not. Default: False.
"""
def __init__(self, drop_prob=0.5, inplace=False):
super().__init__(p=drop_prob, inplace=inplace)
def build_dropout(cfg, default_args=None):
"""Builder for drop out layers."""
return build_from_cfg(cfg, DROPOUT_LAYERS, default_args)
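# A minimal usage sketch (assuming ``torch`` is installed): build DropPath from
# a config dict. In eval mode it is an identity; in training mode it zeroes
# whole samples with probability ``drop_prob`` and rescales the rest.
#     >>> import torch
#     >>> drop = build_dropout(dict(type='DropPath', drop_prob=0.2))
#     >>> x = torch.rand(4, 16)
#     >>> torch.equal(drop.eval()(x), x)
#     True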
# Copyright (c) OpenMMLab. All rights reserved.
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from ..utils import kaiming_init
from .registry import PLUGIN_LAYERS
@PLUGIN_LAYERS.register_module()
class GeneralizedAttention(nn.Module):
"""GeneralizedAttention module.
See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks'
(https://arxiv.org/abs/1904.05873) for details.
Args:
in_channels (int): Channels of the input feature map.
spatial_range (int): The spatial range. -1 indicates no spatial range
constraint. Default: -1.
num_heads (int): The head number of empirical_attention module.
Default: 9.
position_embedding_dim (int): The position embedding dimension.
Default: -1.
position_magnitude (int): A multiplier acting on coord difference.
Default: 1.
kv_stride (int): The feature stride acting on key/value feature map.
Default: 2.
q_stride (int): The feature stride acting on query feature map.
Default: 1.
attention_type (str): A binary indicator string for indicating which
items in generalized empirical_attention module are used.
Default: '1111'.
- '1000' indicates 'query and key content' (appr - appr) item,
- '0100' indicates 'query content and relative position'
(appr - position) item,
- '0010' indicates 'key content only' (bias - appr) item,
- '0001' indicates 'relative position only' (bias - position) item.
"""
_abbr_ = 'gen_attention_block'
def __init__(self,
in_channels,
spatial_range=-1,
num_heads=9,
position_embedding_dim=-1,
position_magnitude=1,
kv_stride=2,
q_stride=1,
attention_type='1111'):
super(GeneralizedAttention, self).__init__()
# hard range means local range for non-local operation
self.position_embedding_dim = (
position_embedding_dim
if position_embedding_dim > 0 else in_channels)
self.position_magnitude = position_magnitude
self.num_heads = num_heads
self.in_channels = in_channels
self.spatial_range = spatial_range
self.kv_stride = kv_stride
self.q_stride = q_stride
self.attention_type = [bool(int(_)) for _ in attention_type]
self.qk_embed_dim = in_channels // num_heads
out_c = self.qk_embed_dim * num_heads
if self.attention_type[0] or self.attention_type[1]:
self.query_conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_c,
kernel_size=1,
bias=False)
self.query_conv.kaiming_init = True
if self.attention_type[0] or self.attention_type[2]:
self.key_conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_c,
kernel_size=1,
bias=False)
self.key_conv.kaiming_init = True
self.v_dim = in_channels // num_heads
self.value_conv = nn.Conv2d(
in_channels=in_channels,
out_channels=self.v_dim * num_heads,
kernel_size=1,
bias=False)
self.value_conv.kaiming_init = True
if self.attention_type[1] or self.attention_type[3]:
self.appr_geom_fc_x = nn.Linear(
self.position_embedding_dim // 2, out_c, bias=False)
self.appr_geom_fc_x.kaiming_init = True
self.appr_geom_fc_y = nn.Linear(
self.position_embedding_dim // 2, out_c, bias=False)
self.appr_geom_fc_y.kaiming_init = True
if self.attention_type[2]:
stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv
self.appr_bias = nn.Parameter(appr_bias_value)
if self.attention_type[3]:
stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv
self.geom_bias = nn.Parameter(geom_bias_value)
self.proj_conv = nn.Conv2d(
in_channels=self.v_dim * num_heads,
out_channels=in_channels,
kernel_size=1,
bias=True)
self.proj_conv.kaiming_init = True
self.gamma = nn.Parameter(torch.zeros(1))
if self.spatial_range >= 0:
# only works when non local is after 3*3 conv
if in_channels == 256:
max_len = 84
elif in_channels == 512:
max_len = 42
max_len_kv = int((max_len - 1.0) / self.kv_stride + 1)
local_constraint_map = np.ones(
(max_len, max_len, max_len_kv, max_len_kv), dtype=int)
for iy in range(max_len):
for ix in range(max_len):
local_constraint_map[
iy, ix,
max((iy - self.spatial_range) //
self.kv_stride, 0):min((iy + self.spatial_range +
1) // self.kv_stride +
1, max_len),
max((ix - self.spatial_range) //
self.kv_stride, 0):min((ix + self.spatial_range +
1) // self.kv_stride +
1, max_len)] = 0
self.local_constraint_map = nn.Parameter(
torch.from_numpy(local_constraint_map).byte(),
requires_grad=False)
if self.q_stride > 1:
self.q_downsample = nn.AvgPool2d(
kernel_size=1, stride=self.q_stride)
else:
self.q_downsample = None
if self.kv_stride > 1:
self.kv_downsample = nn.AvgPool2d(
kernel_size=1, stride=self.kv_stride)
else:
self.kv_downsample = None
self.init_weights()
def get_position_embedding(self,
h,
w,
h_kv,
w_kv,
q_stride,
kv_stride,
device,
dtype,
feat_dim,
wave_length=1000):
# the default type of Tensor is float32, leading to type mismatch
# in fp16 mode. Cast it to support fp16 mode.
h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype)
h_idxs = h_idxs.view((h, 1)) * q_stride
w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype)
w_idxs = w_idxs.view((w, 1)) * q_stride
h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to(
device=device, dtype=dtype)
h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride
w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to(
device=device, dtype=dtype)
w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride
# (h, h_kv, 1)
h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0)
h_diff *= self.position_magnitude
# (w, w_kv, 1)
w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0)
w_diff *= self.position_magnitude
feat_range = torch.arange(0, feat_dim / 4).to(
device=device, dtype=dtype)
dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype)
dim_mat = dim_mat**((4. / feat_dim) * feat_range)
dim_mat = dim_mat.view((1, 1, -1))
embedding_x = torch.cat(
((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2)
embedding_y = torch.cat(
((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2)
return embedding_x, embedding_y
def forward(self, x_input):
num_heads = self.num_heads
# use empirical_attention
if self.q_downsample is not None:
x_q = self.q_downsample(x_input)
else:
x_q = x_input
n, _, h, w = x_q.shape
if self.kv_downsample is not None:
x_kv = self.kv_downsample(x_input)
else:
x_kv = x_input
_, _, h_kv, w_kv = x_kv.shape
if self.attention_type[0] or self.attention_type[1]:
proj_query = self.query_conv(x_q).view(
(n, num_heads, self.qk_embed_dim, h * w))
proj_query = proj_query.permute(0, 1, 3, 2)
if self.attention_type[0] or self.attention_type[2]:
proj_key = self.key_conv(x_kv).view(
(n, num_heads, self.qk_embed_dim, h_kv * w_kv))
if self.attention_type[1] or self.attention_type[3]:
position_embed_x, position_embed_y = self.get_position_embedding(
h, w, h_kv, w_kv, self.q_stride, self.kv_stride,
x_input.device, x_input.dtype, self.position_embedding_dim)
# (n, num_heads, w, w_kv, dim)
position_feat_x = self.appr_geom_fc_x(position_embed_x).\
view(1, w, w_kv, num_heads, self.qk_embed_dim).\
permute(0, 3, 1, 2, 4).\
repeat(n, 1, 1, 1, 1)
# (n, num_heads, h, h_kv, dim)
position_feat_y = self.appr_geom_fc_y(position_embed_y).\
view(1, h, h_kv, num_heads, self.qk_embed_dim).\
permute(0, 3, 1, 2, 4).\
repeat(n, 1, 1, 1, 1)
position_feat_x /= math.sqrt(2)
position_feat_y /= math.sqrt(2)
# accelerate for saliency only
if (np.sum(self.attention_type) == 1) and self.attention_type[2]:
appr_bias = self.appr_bias.\
view(1, num_heads, 1, self.qk_embed_dim).\
repeat(n, 1, 1, 1)
energy = torch.matmul(appr_bias, proj_key).\
view(n, num_heads, 1, h_kv * w_kv)
h = 1
w = 1
else:
# (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for
if not self.attention_type[0]:
energy = torch.zeros(
n,
num_heads,
h,
w,
h_kv,
w_kv,
dtype=x_input.dtype,
device=x_input.device)
# attention_type[0]: appr - appr
# attention_type[1]: appr - position
# attention_type[2]: bias - appr
# attention_type[3]: bias - position
if self.attention_type[0] or self.attention_type[2]:
if self.attention_type[0] and self.attention_type[2]:
appr_bias = self.appr_bias.\
view(1, num_heads, 1, self.qk_embed_dim)
energy = torch.matmul(proj_query + appr_bias, proj_key).\
view(n, num_heads, h, w, h_kv, w_kv)
elif self.attention_type[0]:
energy = torch.matmul(proj_query, proj_key).\
view(n, num_heads, h, w, h_kv, w_kv)
elif self.attention_type[2]:
appr_bias = self.appr_bias.\
view(1, num_heads, 1, self.qk_embed_dim).\
repeat(n, 1, 1, 1)
energy += torch.matmul(appr_bias, proj_key).\
view(n, num_heads, 1, 1, h_kv, w_kv)
if self.attention_type[1] or self.attention_type[3]:
if self.attention_type[1] and self.attention_type[3]:
geom_bias = self.geom_bias.\
view(1, num_heads, 1, self.qk_embed_dim)
proj_query_reshape = (proj_query + geom_bias).\
view(n, num_heads, h, w, self.qk_embed_dim)
energy_x = torch.matmul(
proj_query_reshape.permute(0, 1, 3, 2, 4),
position_feat_x.permute(0, 1, 2, 4, 3))
energy_x = energy_x.\
permute(0, 1, 3, 2, 4).unsqueeze(4)
energy_y = torch.matmul(
proj_query_reshape,
position_feat_y.permute(0, 1, 2, 4, 3))
energy_y = energy_y.unsqueeze(5)
energy += energy_x + energy_y
elif self.attention_type[1]:
proj_query_reshape = proj_query.\
view(n, num_heads, h, w, self.qk_embed_dim)
proj_query_reshape = proj_query_reshape.\
permute(0, 1, 3, 2, 4)
position_feat_x_reshape = position_feat_x.\
permute(0, 1, 2, 4, 3)
position_feat_y_reshape = position_feat_y.\
permute(0, 1, 2, 4, 3)
energy_x = torch.matmul(proj_query_reshape,
position_feat_x_reshape)
energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4)
energy_y = torch.matmul(proj_query_reshape,
position_feat_y_reshape)
energy_y = energy_y.unsqueeze(5)
energy += energy_x + energy_y
elif self.attention_type[3]:
geom_bias = self.geom_bias.\
view(1, num_heads, self.qk_embed_dim, 1).\
repeat(n, 1, 1, 1)
position_feat_x_reshape = position_feat_x.\
view(n, num_heads, w*w_kv, self.qk_embed_dim)
position_feat_y_reshape = position_feat_y.\
view(n, num_heads, h * h_kv, self.qk_embed_dim)
energy_x = torch.matmul(position_feat_x_reshape, geom_bias)
energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv)
energy_y = torch.matmul(position_feat_y_reshape, geom_bias)
energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1)
energy += energy_x + energy_y
energy = energy.view(n, num_heads, h * w, h_kv * w_kv)
if self.spatial_range >= 0:
cur_local_constraint_map = \
self.local_constraint_map[:h, :w, :h_kv, :w_kv].\
contiguous().\
view(1, 1, h*w, h_kv*w_kv)
energy = energy.masked_fill_(cur_local_constraint_map,
float('-inf'))
attention = F.softmax(energy, 3)
proj_value = self.value_conv(x_kv)
proj_value_reshape = proj_value.\
view((n, num_heads, self.v_dim, h_kv * w_kv)).\
permute(0, 1, 3, 2)
out = torch.matmul(attention, proj_value_reshape).\
permute(0, 1, 3, 2).\
contiguous().\
view(n, self.v_dim * self.num_heads, h, w)
out = self.proj_conv(out)
# output is downsampled, upsample back to input size
if self.q_downsample is not None:
out = F.interpolate(
out,
size=x_input.shape[2:],
mode='bilinear',
align_corners=False)
out = self.gamma * out + x_input
return out
def init_weights(self):
for m in self.modules():
if hasattr(m, 'kaiming_init') and m.kaiming_init:
kaiming_init(
m,
mode='fan_in',
nonlinearity='leaky_relu',
bias=0,
distribution='uniform',
a=1)
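# A minimal usage sketch (assuming ``torch`` is installed): the default
# attention_type '1111' on a 256-channel feature map; the residual
# ``gamma * out + x_input`` keeps the input shape.
#     >>> import torch
#     >>> attn = GeneralizedAttention(in_channels=256)
#     >>> attn(torch.rand(1, 256, 20, 20)).shape
#     torch.Size([1, 256, 20, 20])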
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn
from .registry import ACTIVATION_LAYERS
@ACTIVATION_LAYERS.register_module()
class HSigmoid(nn.Module):
"""Hard Sigmoid Module. Apply the hard sigmoid function:
Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value)
Default: Hsigmoid(x) = min(max((x + 1) / 2, 0), 1)
Args:
bias (float): Bias of the input feature map. Default: 1.0.
divisor (float): Divisor of the input feature map. Default: 2.0.
min_value (float): Lower bound value. Default: 0.0.
max_value (float): Upper bound value. Default: 1.0.
Returns:
Tensor: The output tensor.
"""
def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0):
super(HSigmoid, self).__init__()
self.bias = bias
self.divisor = divisor
assert self.divisor != 0
self.min_value = min_value
self.max_value = max_value
def forward(self, x):
x = (x + self.bias) / self.divisor
return x.clamp_(self.min_value, self.max_value)
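# A quick numeric check of the formula above (assuming ``torch`` is installed):
# with the defaults, HSigmoid(x) = clamp((x + 1) / 2, 0, 1).
#     >>> import torch
#     >>> HSigmoid()(torch.tensor([-3., 0., 3.]))
#     tensor([0.0000, 0.5000, 1.0000])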
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn
from .registry import ACTIVATION_LAYERS
@ACTIVATION_LAYERS.register_module()
class HSwish(nn.Module):
"""Hard Swish Module.
This module applies the hard swish function:
.. math::
Hswish(x) = x * ReLU6(x + 3) / 6
Args:
inplace (bool): can optionally do the operation in-place.
Default: False.
Returns:
Tensor: The output tensor.
"""
def __init__(self, inplace=False):
super(HSwish, self).__init__()
self.act = nn.ReLU6(inplace)
def forward(self, x):
return x * self.act(x + 3) / 6
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta
import torch
import torch.nn as nn
from ..utils import constant_init, normal_init
from .conv_module import ConvModule
from .registry import PLUGIN_LAYERS
class _NonLocalNd(nn.Module, metaclass=ABCMeta):
"""Basic Non-local module.
This module is proposed in
"Non-local Neural Networks"
Paper reference: https://arxiv.org/abs/1711.07971
Code reference: https://github.com/AlexHex7/Non-local_pytorch
Args:
in_channels (int): Channels of the input feature map.
reduction (int): Channel reduction ratio. Default: 2.
use_scale (bool): Whether to scale pairwise_weight by
`1/sqrt(inter_channels)` when the mode is `embedded_gaussian`.
Default: True.
conv_cfg (None | dict): The config dict for convolution layers.
If not specified, it will use `nn.Conv2d` for convolution layers.
Default: None.
norm_cfg (None | dict): The config dict for normalization layers.
Default: None. (This parameter is only applicable to conv_out.)
mode (str): Options are `gaussian`, `concatenation`,
`embedded_gaussian` and `dot_product`. Default: embedded_gaussian.
"""
def __init__(self,
in_channels,
reduction=2,
use_scale=True,
conv_cfg=None,
norm_cfg=None,
mode='embedded_gaussian',
**kwargs):
super(_NonLocalNd, self).__init__()
self.in_channels = in_channels
self.reduction = reduction
self.use_scale = use_scale
self.inter_channels = max(in_channels // reduction, 1)
self.mode = mode
if mode not in [
'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation'
]:
raise ValueError("Mode should be in 'gaussian', 'concatenation', "
f"'embedded_gaussian' or 'dot_product', but got "
f'{mode} instead.')
# g, theta, phi are defaulted as `nn.ConvNd`.
# Here we use ConvModule for potential usage.
self.g = ConvModule(
self.in_channels,
self.inter_channels,
kernel_size=1,
conv_cfg=conv_cfg,
act_cfg=None)
self.conv_out = ConvModule(
self.inter_channels,
self.in_channels,
kernel_size=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=None)
if self.mode != 'gaussian':
self.theta = ConvModule(
self.in_channels,
self.inter_channels,
kernel_size=1,
conv_cfg=conv_cfg,
act_cfg=None)
self.phi = ConvModule(
self.in_channels,
self.inter_channels,
kernel_size=1,
conv_cfg=conv_cfg,
act_cfg=None)
if self.mode == 'concatenation':
self.concat_project = ConvModule(
self.inter_channels * 2,
1,
kernel_size=1,
stride=1,
padding=0,
bias=False,
act_cfg=dict(type='ReLU'))
self.init_weights(**kwargs)
def init_weights(self, std=0.01, zeros_init=True):
if self.mode != 'gaussian':
for m in [self.g, self.theta, self.phi]:
normal_init(m.conv, std=std)
else:
normal_init(self.g.conv, std=std)
if zeros_init:
if self.conv_out.norm_cfg is None:
constant_init(self.conv_out.conv, 0)
else:
constant_init(self.conv_out.norm, 0)
else:
if self.conv_out.norm_cfg is None:
normal_init(self.conv_out.conv, std=std)
else:
normal_init(self.conv_out.norm, std=std)
def gaussian(self, theta_x, phi_x):
# NonLocal1d pairwise_weight: [N, H, H]
# NonLocal2d pairwise_weight: [N, HxW, HxW]
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
pairwise_weight = torch.matmul(theta_x, phi_x)
pairwise_weight = pairwise_weight.softmax(dim=-1)
return pairwise_weight
def embedded_gaussian(self, theta_x, phi_x):
# NonLocal1d pairwise_weight: [N, H, H]
# NonLocal2d pairwise_weight: [N, HxW, HxW]
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
pairwise_weight = torch.matmul(theta_x, phi_x)
if self.use_scale:
# theta_x.shape[-1] is `self.inter_channels`
pairwise_weight /= theta_x.shape[-1]**0.5
pairwise_weight = pairwise_weight.softmax(dim=-1)
return pairwise_weight
def dot_product(self, theta_x, phi_x):
# NonLocal1d pairwise_weight: [N, H, H]
# NonLocal2d pairwise_weight: [N, HxW, HxW]
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
pairwise_weight = torch.matmul(theta_x, phi_x)
pairwise_weight /= pairwise_weight.shape[-1]
return pairwise_weight
def concatenation(self, theta_x, phi_x):
# NonLocal1d pairwise_weight: [N, H, H]
# NonLocal2d pairwise_weight: [N, HxW, HxW]
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
h = theta_x.size(2)
w = phi_x.size(3)
theta_x = theta_x.repeat(1, 1, 1, w)
phi_x = phi_x.repeat(1, 1, h, 1)
concat_feature = torch.cat([theta_x, phi_x], dim=1)
pairwise_weight = self.concat_project(concat_feature)
n, _, h, w = pairwise_weight.size()
pairwise_weight = pairwise_weight.view(n, h, w)
pairwise_weight /= pairwise_weight.shape[-1]
return pairwise_weight
def forward(self, x):
# Assume `reduction = 1`, then `inter_channels = C`
# or `inter_channels = C` when `mode="gaussian"`
# NonLocal1d x: [N, C, H]
# NonLocal2d x: [N, C, H, W]
# NonLocal3d x: [N, C, T, H, W]
n = x.size(0)
# NonLocal1d g_x: [N, H, C]
# NonLocal2d g_x: [N, HxW, C]
# NonLocal3d g_x: [N, TxHxW, C]
g_x = self.g(x).view(n, self.inter_channels, -1)
g_x = g_x.permute(0, 2, 1)
# NonLocal1d theta_x: [N, H, C], phi_x: [N, C, H]
# NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW]
# NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW]
if self.mode == 'gaussian':
theta_x = x.view(n, self.in_channels, -1)
theta_x = theta_x.permute(0, 2, 1)
if self.sub_sample:
phi_x = self.phi(x).view(n, self.in_channels, -1)
else:
phi_x = x.view(n, self.in_channels, -1)
elif self.mode == 'concatenation':
theta_x = self.theta(x).view(n, self.inter_channels, -1, 1)
phi_x = self.phi(x).view(n, self.inter_channels, 1, -1)
else:
theta_x = self.theta(x).view(n, self.inter_channels, -1)
theta_x = theta_x.permute(0, 2, 1)
phi_x = self.phi(x).view(n, self.inter_channels, -1)
pairwise_func = getattr(self, self.mode)
# NonLocal1d pairwise_weight: [N, H, H]
# NonLocal2d pairwise_weight: [N, HxW, HxW]
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
pairwise_weight = pairwise_func(theta_x, phi_x)
# NonLocal1d y: [N, H, C]
# NonLocal2d y: [N, HxW, C]
# NonLocal3d y: [N, TxHxW, C]
y = torch.matmul(pairwise_weight, g_x)
# NonLocal1d y: [N, C, H]
# NonLocal2d y: [N, C, H, W]
# NonLocal3d y: [N, C, T, H, W]
y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels,
*x.size()[2:])
output = x + self.conv_out(y)
return output
class NonLocal1d(_NonLocalNd):
"""1D Non-local module.
Args:
in_channels (int): Same as `NonLocalND`.
sub_sample (bool): Whether to apply max pooling after pairwise
function (Note that the `sub_sample` is applied on spatial only).
Default: False.
conv_cfg (None | dict): Same as `NonLocalND`.
Default: dict(type='Conv1d').
"""
def __init__(self,
in_channels,
sub_sample=False,
conv_cfg=dict(type='Conv1d'),
**kwargs):
super(NonLocal1d, self).__init__(
in_channels, conv_cfg=conv_cfg, **kwargs)
self.sub_sample = sub_sample
if sub_sample:
max_pool_layer = nn.MaxPool1d(kernel_size=2)
self.g = nn.Sequential(self.g, max_pool_layer)
if self.mode != 'gaussian':
self.phi = nn.Sequential(self.phi, max_pool_layer)
else:
self.phi = max_pool_layer
@PLUGIN_LAYERS.register_module()
class NonLocal2d(_NonLocalNd):
"""2D Non-local module.
Args:
in_channels (int): Same as `NonLocalND`.
sub_sample (bool): Whether to apply max pooling after pairwise
function (Note that the `sub_sample` is applied on spatial only).
Default: False.
conv_cfg (None | dict): Same as `NonLocalND`.
Default: dict(type='Conv2d').
"""
_abbr_ = 'nonlocal_block'
def __init__(self,
in_channels,
sub_sample=False,
conv_cfg=dict(type='Conv2d'),
**kwargs):
super(NonLocal2d, self).__init__(
in_channels, conv_cfg=conv_cfg, **kwargs)
self.sub_sample = sub_sample
if sub_sample:
max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
self.g = nn.Sequential(self.g, max_pool_layer)
if self.mode != 'gaussian':
self.phi = nn.Sequential(self.phi, max_pool_layer)
else:
self.phi = max_pool_layer
class NonLocal3d(_NonLocalNd):
"""3D Non-local module.
Args:
in_channels (int): Same as `NonLocalND`.
sub_sample (bool): Whether to apply max pooling after pairwise
function (Note that the `sub_sample` is applied on spatial only).
Default: False.
conv_cfg (None | dict): Same as `NonLocalND`.
Default: dict(type='Conv3d').
"""
def __init__(self,
in_channels,
sub_sample=False,
conv_cfg=dict(type='Conv3d'),
**kwargs):
super(NonLocal3d, self).__init__(
in_channels, conv_cfg=conv_cfg, **kwargs)
self.sub_sample = sub_sample
if sub_sample:
max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
self.g = nn.Sequential(self.g, max_pool_layer)
if self.mode != 'gaussian':
self.phi = nn.Sequential(self.phi, max_pool_layer)
else:
self.phi = max_pool_layer
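# A minimal usage sketch (assuming ``torch`` is installed): the 2D variant with
# the default 'embedded_gaussian' mode; the residual connection keeps the
# input shape.
#     >>> import torch
#     >>> block = NonLocal2d(in_channels=16)
#     >>> block(torch.rand(2, 16, 14, 14)).shape
#     torch.Size([2, 16, 14, 14])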
# Copyright (c) OpenMMLab. All rights reserved.
import inspect
import torch.nn as nn
from annotator.uniformer.mmcv.utils import is_tuple_of
from annotator.uniformer.mmcv.utils.parrots_wrapper import SyncBatchNorm, _BatchNorm, _InstanceNorm
from .registry import NORM_LAYERS
NORM_LAYERS.register_module('BN', module=nn.BatchNorm2d)
NORM_LAYERS.register_module('BN1d', module=nn.BatchNorm1d)
NORM_LAYERS.register_module('BN2d', module=nn.BatchNorm2d)
NORM_LAYERS.register_module('BN3d', module=nn.BatchNorm3d)
NORM_LAYERS.register_module('SyncBN', module=SyncBatchNorm)
NORM_LAYERS.register_module('GN', module=nn.GroupNorm)
NORM_LAYERS.register_module('LN', module=nn.LayerNorm)
NORM_LAYERS.register_module('IN', module=nn.InstanceNorm2d)
NORM_LAYERS.register_module('IN1d', module=nn.InstanceNorm1d)
NORM_LAYERS.register_module('IN2d', module=nn.InstanceNorm2d)
NORM_LAYERS.register_module('IN3d', module=nn.InstanceNorm3d)
def infer_abbr(class_type):
"""Infer abbreviation from the class name.
When we build a norm layer with `build_norm_layer()`, we want to preserve
the norm type in variable names, e.g., self.bn1, self.gn. This method will
infer the abbreviation to map class types to abbreviations.
Rule 1: If the class has the property "_abbr_", return the property.
Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or
InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and
"in" respectively.
Rule 3: If the class name contains "batch", "group", "layer" or "instance",
the abbreviation of this layer will be "bn", "gn", "ln" and "in"
respectively.
Rule 4: Otherwise, the abbreviation falls back to "norm_layer".
Args:
class_type (type): The norm layer type.
Returns:
str: The inferred abbreviation.
"""
if not inspect.isclass(class_type):
raise TypeError(
f'class_type must be a type, but got {type(class_type)}')
if hasattr(class_type, '_abbr_'):
return class_type._abbr_
if issubclass(class_type, _InstanceNorm): # IN is a subclass of BN
return 'in'
elif issubclass(class_type, _BatchNorm):
return 'bn'
elif issubclass(class_type, nn.GroupNorm):
return 'gn'
elif issubclass(class_type, nn.LayerNorm):
return 'ln'
else:
class_name = class_type.__name__.lower()
if 'batch' in class_name:
return 'bn'
elif 'group' in class_name:
return 'gn'
elif 'layer' in class_name:
return 'ln'
elif 'instance' in class_name:
return 'in'
else:
return 'norm_layer'
def build_norm_layer(cfg, num_features, postfix=''):
"""Build normalization layer.
Args:
cfg (dict): The norm layer config, which should contain:
- type (str): Layer type.
- layer args: Args needed to instantiate a norm layer.
- requires_grad (bool, optional): Whether the layer's parameters require gradient updates. Default: True.
num_features (int): Number of input channels.
postfix (int | str): The postfix to be appended into norm abbreviation
to create named layer.
Returns:
(str, nn.Module): The first element is the layer name consisting of
abbreviation and postfix, e.g., bn1, gn. The second element is the
created norm layer.
"""
if not isinstance(cfg, dict):
raise TypeError('cfg must be a dict')
if 'type' not in cfg:
raise KeyError('the cfg dict must contain the key "type"')
cfg_ = cfg.copy()
layer_type = cfg_.pop('type')
if layer_type not in NORM_LAYERS:
raise KeyError(f'Unrecognized norm type {layer_type}')
norm_layer = NORM_LAYERS.get(layer_type)
abbr = infer_abbr(norm_layer)
assert isinstance(postfix, (int, str))
name = abbr + str(postfix)
requires_grad = cfg_.pop('requires_grad', True)
cfg_.setdefault('eps', 1e-5)
if layer_type != 'GN':
layer = norm_layer(num_features, **cfg_)
if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
layer._specify_ddp_gpu_num(1)
else:
assert 'num_groups' in cfg_
layer = norm_layer(num_channels=num_features, **cfg_)
for param in layer.parameters():
param.requires_grad = requires_grad
return name, layer
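# A minimal usage sketch: the returned name is the inferred abbreviation plus
# the postfix, e.g. 'bn1' for a BatchNorm config and 'gn' for GroupNorm.
#     >>> build_norm_layer(dict(type='BN'), 64, postfix=1)[0]
#     'bn1'
#     >>> build_norm_layer(dict(type='GN', num_groups=8), 64)[0]
#     'gn'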
def is_norm(layer, exclude=None):
"""Check if a layer is a normalization layer.
Args:
layer (nn.Module): The layer to be checked.
exclude (type | tuple[type]): Types to be excluded.
Returns:
bool: Whether the layer is a norm layer.
"""
if exclude is not None:
if not isinstance(exclude, tuple):
exclude = (exclude, )
if not is_tuple_of(exclude, type):
raise TypeError(
f'"exclude" must be either None or type or a tuple of types, '
f'but got {type(exclude)}: {exclude}')
if exclude and isinstance(layer, exclude):
return False
all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm)
return isinstance(layer, all_norm_bases)
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn
from .registry import PADDING_LAYERS
PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d)
PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d)
PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d)
def build_padding_layer(cfg, *args, **kwargs):
"""Build padding layer.
Args:
cfg (None or dict): The padding layer config, which should contain:
- type (str): Layer type.
- layer args: Args needed to instantiate a padding layer.
Returns:
nn.Module: Created padding layer.
"""
if not isinstance(cfg, dict):
raise TypeError('cfg must be a dict')
if 'type' not in cfg:
raise KeyError('the cfg dict must contain the key "type"')
cfg_ = cfg.copy()
padding_type = cfg_.pop('type')
if padding_type not in PADDING_LAYERS:
raise KeyError(f'Unrecognized padding type {padding_type}.')
else:
padding_layer = PADDING_LAYERS.get(padding_type)
layer = padding_layer(*args, **kwargs, **cfg_)
return layer
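# A minimal usage sketch: the extra positional argument is forwarded to the
# underlying padding layer, here nn.ReflectionPad2d(1).
#     >>> build_padding_layer(dict(type='reflect'), 1)
#     ReflectionPad2d((1, 1, 1, 1))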
import inspect
import platform
from .registry import PLUGIN_LAYERS
if platform.system() == 'Windows':
import regex as re
else:
import re
def infer_abbr(class_type):
"""Infer abbreviation from the class name.
This method will infer the abbreviation to map class types to
abbreviations.
Rule 1: If the class has the property "_abbr_", return the property.
Rule 2: Otherwise, the abbreviation falls back to snake case of class
name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``.
Args:
class_type (type): The norm layer type.
Returns:
str: The inferred abbreviation.
"""
def camel2snack(word):
"""Convert camel case word into snack case.
Modified from `inflection lib
<https://inflection.readthedocs.io/en/latest/#inflection.underscore>`_.
Example::
>>> camel2snack("FancyBlock")
'fancy_block'
"""
word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word)
word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word)
word = word.replace('-', '_')
return word.lower()
if not inspect.isclass(class_type):
raise TypeError(
f'class_type must be a type, but got {type(class_type)}')
if hasattr(class_type, '_abbr_'):
return class_type._abbr_
else:
return camel2snack(class_type.__name__)
def build_plugin_layer(cfg, postfix='', **kwargs):
"""Build plugin layer.
Args:
cfg (None or dict): cfg should contain:
type (str): identify plugin layer type.
layer args: args needed to instantiate a plugin layer.
postfix (int, str): appended into norm abbreviation to
create named layer. Default: ''.
Returns:
tuple[str, nn.Module]:
name (str): abbreviation + postfix
layer (nn.Module): created plugin layer
"""
if not isinstance(cfg, dict):
raise TypeError('cfg must be a dict')
if 'type' not in cfg:
raise KeyError('the cfg dict must contain the key "type"')
cfg_ = cfg.copy()
layer_type = cfg_.pop('type')
if layer_type not in PLUGIN_LAYERS:
raise KeyError(f'Unrecognized plugin type {layer_type}')
plugin_layer = PLUGIN_LAYERS.get(layer_type)
abbr = infer_abbr(plugin_layer)
assert isinstance(postfix, (int, str))
name = abbr + str(postfix)
layer = plugin_layer(**kwargs, **cfg_)
return name, layer
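# A minimal usage sketch: plugin layers are looked up in PLUGIN_LAYERS and the
# returned name is the snake_case abbreviation plus the postfix; ConvModule
# above registers itself with ``_abbr_ = 'conv_block'``.
#     >>> name, layer = build_plugin_layer(
#     ...     dict(type='ConvModule', in_channels=3, out_channels=8,
#     ...          kernel_size=3, padding=1), postfix=1)
#     >>> name
#     'conv_block1'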
# Copyright (c) OpenMMLab. All rights reserved.
from annotator.uniformer.mmcv.utils import Registry
CONV_LAYERS = Registry('conv layer')
NORM_LAYERS = Registry('norm layer')
ACTIVATION_LAYERS = Registry('activation layer')
PADDING_LAYERS = Registry('padding layer')
UPSAMPLE_LAYERS = Registry('upsample layer')
PLUGIN_LAYERS = Registry('plugin layer')
DROPOUT_LAYERS = Registry('drop out layers')
POSITIONAL_ENCODING = Registry('position encoding')
ATTENTION = Registry('attention')
FEEDFORWARD_NETWORK = Registry('feed-forward Network')
TRANSFORMER_LAYER = Registry('transformerLayer')
TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence')
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
class Scale(nn.Module):
"""A learnable scale parameter.
This layer scales the input by a learnable factor. It multiplies a
learnable scale parameter of shape (1,) with input of any shape.
Args:
scale (float): Initial value of scale factor. Default: 1.0
"""
def __init__(self, scale=1.0):
super(Scale, self).__init__()
self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))
def forward(self, x):
return x * self.scale
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
from .registry import ACTIVATION_LAYERS
@ACTIVATION_LAYERS.register_module()
class Swish(nn.Module):
"""Swish Module.
This module applies the swish function:
.. math::
Swish(x) = x * Sigmoid(x)
Returns:
Tensor: The output tensor.
"""
def __init__(self):
super(Swish, self).__init__()
def forward(self, x):
return x * torch.sigmoid(x)
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import warnings
import torch
import torch.nn as nn
from annotator.uniformer.mmcv import ConfigDict, deprecated_api_warning
from annotator.uniformer.mmcv.cnn import Linear, build_activation_layer, build_norm_layer
from annotator.uniformer.mmcv.runner.base_module import BaseModule, ModuleList, Sequential
from annotator.uniformer.mmcv.utils import build_from_cfg
from .drop import build_dropout
from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING,
TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE)
# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file
try:
from annotator.uniformer.mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401
warnings.warn(
ImportWarning(
'``MultiScaleDeformableAttention`` has been moved to '
'``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501
'``from annotator.uniformer.mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501
'to ``from annotator.uniformer.mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501
))
except ImportError:
warnings.warn('Failed to import ``MultiheadAttention`` from '
'``mmcv.ops.multi_scale_deform_attn``. '
'You should install ``mmcv-full`` if you need this module. ')
def build_positional_encoding(cfg, default_args=None):
"""Builder for Position Encoding."""
return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args)
def build_attention(cfg, default_args=None):
"""Builder for attention."""
return build_from_cfg(cfg, ATTENTION, default_args)
def build_feedforward_network(cfg, default_args=None):
"""Builder for feed-forward network (FFN)."""
return build_from_cfg(cfg, FEEDFORWARD_NETWORK, default_args)
def build_transformer_layer(cfg, default_args=None):
"""Builder for transformer layer."""
return build_from_cfg(cfg, TRANSFORMER_LAYER, default_args)
def build_transformer_layer_sequence(cfg, default_args=None):
"""Builder for transformer encoder and transformer decoder."""
return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args)
@ATTENTION.register_module()
class MultiheadAttention(BaseModule):
"""A wrapper for ``torch.nn.MultiheadAttention``.
This module implements MultiheadAttention with identity connection,
and positional encoding is also passed as input.
Args:
embed_dims (int): The embedding dimension.
num_heads (int): Parallel attention heads.
attn_drop (float): A Dropout layer on attn_output_weights.
Default: 0.0.
proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
Default: 0.0.
dropout_layer (obj:`ConfigDict`): The dropout_layer used
when adding the shortcut.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
batch_first (bool): When it is True, Key, Query and Value are of shape
(batch, n, embed_dim); otherwise (n, batch, embed_dim).
Default: False.
"""
def __init__(self,
embed_dims,
num_heads,
attn_drop=0.,
proj_drop=0.,
dropout_layer=dict(type='Dropout', drop_prob=0.),
init_cfg=None,
batch_first=False,
**kwargs):
super(MultiheadAttention, self).__init__(init_cfg)
if 'dropout' in kwargs:
warnings.warn('The argument `dropout` in MultiheadAttention '
'has been deprecated; now you can separately '
'set `attn_drop` (float), `proj_drop` (float), '
'and `dropout_layer` (dict). ')
attn_drop = kwargs['dropout']
dropout_layer['drop_prob'] = kwargs.pop('dropout')
self.embed_dims = embed_dims
self.num_heads = num_heads
self.batch_first = batch_first
self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
**kwargs)
self.proj_drop = nn.Dropout(proj_drop)
self.dropout_layer = build_dropout(
dropout_layer) if dropout_layer else nn.Identity()
@deprecated_api_warning({'residual': 'identity'},
cls_name='MultiheadAttention')
def forward(self,
query,
key=None,
value=None,
identity=None,
query_pos=None,
key_pos=None,
attn_mask=None,
key_padding_mask=None,
**kwargs):
"""Forward function for `MultiheadAttention`.
**kwargs allow passing a more general data flow when combining
with other operations in `transformerlayer`.
Args:
query (Tensor): The input query with shape [num_queries, bs,
embed_dims] if self.batch_first is False, else
[bs, num_queries, embed_dims].
key (Tensor): The key tensor with shape [num_keys, bs,
embed_dims] if self.batch_first is False, else
[bs, num_keys, embed_dims] .
If None, the ``query`` will be used. Defaults to None.
value (Tensor): The value tensor with same shape as `key`.
Same in `nn.MultiheadAttention.forward`. Defaults to None.
If None, the `key` will be used.
identity (Tensor): This tensor, with the same shape as x,
will be used for the identity link.
If None, `x` will be used. Defaults to None.
query_pos (Tensor): The positional encoding for query, with
the same shape as `x`. If not None, it will
be added to `x` before forward function. Defaults to None.
key_pos (Tensor): The positional encoding for `key`, with the
same shape as `key`. Defaults to None. If not None, it will
be added to `key` before forward function. If None, and
`query_pos` has the same shape as `key`, then `query_pos`
will be used for `key_pos`. Defaults to None.
attn_mask (Tensor): ByteTensor mask with shape [num_queries,
num_keys]. Same in `nn.MultiheadAttention.forward`.
Defaults to None.
key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
Defaults to None.
Returns:
Tensor: forwarded results with shape
[num_queries, bs, embed_dims]
if self.batch_first is False, else
[bs, num_queries, embed_dims].
"""
if key is None:
key = query
if value is None:
value = key
if identity is None:
identity = query
if key_pos is None:
if query_pos is not None:
# use query_pos if key_pos is not available
if query_pos.shape == key.shape:
key_pos = query_pos
else:
warnings.warn(f'position encoding of key is '
f'missing in {self.__class__.__name__}.')
if query_pos is not None:
query = query + query_pos
if key_pos is not None:
key = key + key_pos
# Because the dataflow ('key', 'query', 'value') of
# ``torch.nn.MultiheadAttention`` is (num_query, batch,
# embed_dims), we should adjust the shape of dataflow from
# batch_first (batch, num_query, embed_dims) to num_query_first
# (num_query, batch, embed_dims), and recover ``attn_output``
# from num_query_first to batch_first.
if self.batch_first:
query = query.transpose(0, 1)
key = key.transpose(0, 1)
value = value.transpose(0, 1)
out = self.attn(
query=query,
key=key,
value=value,
attn_mask=attn_mask,
key_padding_mask=key_padding_mask)[0]
if self.batch_first:
out = out.transpose(0, 1)
return identity + self.dropout_layer(self.proj_drop(out))
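# A minimal usage sketch (assuming ``torch`` is installed): self-attention over
# 10 queries in the default (num_queries, batch, embed_dims) layout; the
# result is identity + dropout(proj_drop(attn_out)).
#     >>> import torch
#     >>> self_attn = MultiheadAttention(embed_dims=64, num_heads=4)
#     >>> q = torch.rand(10, 2, 64)
#     >>> self_attn(q).shape
#     torch.Size([10, 2, 64])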
@FEEDFORWARD_NETWORK.register_module()
class FFN(BaseModule):
"""Implements feed-forward networks (FFNs) with identity connection.
Args:
embed_dims (int): The feature dimension. Same as
`MultiheadAttention`. Default: 256.
feedforward_channels (int): The hidden dimension of FFNs.
Default: 1024.
num_fcs (int, optional): The number of fully-connected layers in
FFNs. Default: 2.
act_cfg (dict, optional): The activation config for FFNs.
Default: dict(type='ReLU')
ffn_drop (float, optional): Probability of an element to be
zeroed in FFN. Default 0.0.
add_identity (bool, optional): Whether to add the
identity connection. Default: `True`.
dropout_layer (obj:`ConfigDict`): The dropout_layer used
when adding the shortcut.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
"""
@deprecated_api_warning(
{
'dropout': 'ffn_drop',
'add_residual': 'add_identity'
},
cls_name='FFN')
def __init__(self,
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.,
dropout_layer=None,
add_identity=True,
init_cfg=None,
**kwargs):
super(FFN, self).__init__(init_cfg)
assert num_fcs >= 2, 'num_fcs should be no less ' \
f'than 2, but got {num_fcs}.'
self.embed_dims = embed_dims
self.feedforward_channels = feedforward_channels
self.num_fcs = num_fcs
self.act_cfg = act_cfg
self.activate = build_activation_layer(act_cfg)
layers = []
in_channels = embed_dims
for _ in range(num_fcs - 1):
layers.append(
Sequential(
Linear(in_channels, feedforward_channels), self.activate,
nn.Dropout(ffn_drop)))
in_channels = feedforward_channels
layers.append(Linear(feedforward_channels, embed_dims))
layers.append(nn.Dropout(ffn_drop))
self.layers = Sequential(*layers)
self.dropout_layer = build_dropout(
dropout_layer) if dropout_layer else torch.nn.Identity()
self.add_identity = add_identity
@deprecated_api_warning({'residual': 'identity'}, cls_name='FFN')
def forward(self, x, identity=None):
"""Forward function for `FFN`.
The function adds `x` to the output tensor if `identity` is None.
"""
out = self.layers(x)
if not self.add_identity:
return self.dropout_layer(out)
if identity is None:
identity = x
return identity + self.dropout_layer(out)
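# A minimal usage sketch (hypothetical) for the FFN above; the dimensions and
# input shape are assumed toy values and `_example_ffn` exists only for
# illustration.
def _example_ffn():
    import torch

    ffn = FFN(embed_dims=256, feedforward_channels=1024, num_fcs=2,
              ffn_drop=0.1)
    x = torch.rand(2, 100, 256)  # (bs, num_queries, embed_dims)
    out = ffn(x)                 # identity link added since add_identity=True
    assert out.shape == x.shape
    return out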
@TRANSFORMER_LAYER.register_module()
class BaseTransformerLayer(BaseModule):
"""Base `TransformerLayer` for vision transformer.
It can be built from an `mmcv.ConfigDict` and supports more flexible
customization, for example, using any number of `FFN` or `LN` layers
and using different kinds of `attention` by specifying a list of
`ConfigDict` named `attn_cfgs`. It is worth mentioning that it supports
`prenorm` when you specify `norm` as the first element of
`operation_order`. More details about `prenorm`: `On Layer Normalization
in the Transformer Architecture <https://arxiv.org/abs/2002.04745>`_ .
Args:
attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None):
Configs for `self_attention` or `cross_attention` modules.
The order of the configs in the list should be consistent with
the corresponding attentions in operation_order.
If it is a dict, all of the attention modules in operation_order
will be built with this config. Default: None.
ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None):
Configs for FFN. The order of the configs in the list should be
consistent with the corresponding ffn in operation_order.
If it is a dict, all of the FFN modules in operation_order
will be built with this config.
operation_order (tuple[str]): The execution order of operations
in the transformer, such as ('self_attn', 'norm', 'ffn', 'norm').
`prenorm` is supported when the first element is 'norm'.
Default: None.
norm_cfg (dict): Config dict for the normalization layer.
Default: dict(type='LN').
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
batch_first (bool): If True, key, query and value have shape
(batch, n, embed_dim); otherwise (n, batch, embed_dim).
Default: False.
"""
def __init__(self,
attn_cfgs=None,
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.,
act_cfg=dict(type='ReLU', inplace=True),
),
operation_order=None,
norm_cfg=dict(type='LN'),
init_cfg=None,
batch_first=False,
**kwargs):
deprecated_args = dict(
feedforward_channels='feedforward_channels',
ffn_dropout='ffn_drop',
ffn_num_fcs='num_fcs')
for ori_name, new_name in deprecated_args.items():
if ori_name in kwargs:
warnings.warn(
f'The argument `{ori_name}` of BaseTransformerLayer '
f'has been deprecated; now you should set `{new_name}` '
f'and other FFN related arguments '
f'in a dict named `ffn_cfgs`. ')
ffn_cfgs[new_name] = kwargs[ori_name]
super(BaseTransformerLayer, self).__init__(init_cfg)
self.batch_first = batch_first
assert set(operation_order) & set(
['self_attn', 'norm', 'ffn', 'cross_attn']) == \
set(operation_order), f'The operation_order of' \
f' {self.__class__.__name__} should only ' \
f'contain operations from ' \
f"{['self_attn', 'norm', 'ffn', 'cross_attn']}"
num_attn = operation_order.count('self_attn') + operation_order.count(
'cross_attn')
if isinstance(attn_cfgs, dict):
attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
else:
assert num_attn == len(attn_cfgs), f'The length ' \
f'of attn_cfgs {len(attn_cfgs)} is ' \
f'not consistent with the number of attentions ' \
f'{num_attn} in operation_order {operation_order}.'
self.num_attn = num_attn
self.operation_order = operation_order
self.norm_cfg = norm_cfg
self.pre_norm = operation_order[0] == 'norm'
self.attentions = ModuleList()
index = 0
for operation_name in operation_order:
if operation_name in ['self_attn', 'cross_attn']:
if 'batch_first' in attn_cfgs[index]:
assert self.batch_first == attn_cfgs[index]['batch_first']
else:
attn_cfgs[index]['batch_first'] = self.batch_first
attention = build_attention(attn_cfgs[index])
# Some custom attentions used as `self_attn`
# or `cross_attn` can have different behavior.
attention.operation_name = operation_name
self.attentions.append(attention)
index += 1
self.embed_dims = self.attentions[0].embed_dims
self.ffns = ModuleList()
num_ffns = operation_order.count('ffn')
if isinstance(ffn_cfgs, dict):
ffn_cfgs = ConfigDict(ffn_cfgs)
if isinstance(ffn_cfgs, dict):
ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
assert len(ffn_cfgs) == num_ffns
for ffn_index in range(num_ffns):
if 'embed_dims' not in ffn_cfgs[ffn_index]:
ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
else:
assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
self.ffns.append(
build_feedforward_network(ffn_cfgs[ffn_index],
dict(type='FFN')))
self.norms = ModuleList()
num_norms = operation_order.count('norm')
for _ in range(num_norms):
self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
def forward(self,
query,
key=None,
value=None,
query_pos=None,
key_pos=None,
attn_masks=None,
query_key_padding_mask=None,
key_padding_mask=None,
**kwargs):
"""Forward function for `TransformerDecoderLayer`.
**kwargs contains some specific arguments of attentions.
Args:
query (Tensor): The input query with shape
[num_queries, bs, embed_dims] if
self.batch_first is False, else
[bs, num_queries, embed_dims].
key (Tensor): The key tensor with shape [num_keys, bs,
embed_dims] if self.batch_first is False, else
[bs, num_keys, embed_dims].
value (Tensor): The value tensor with same shape as `key`.
query_pos (Tensor): The positional encoding for `query`.
Default: None.
key_pos (Tensor): The positional encoding for `key`.
Default: None.
attn_masks (List[Tensor] | None): 2D Tensors used in the
calculation of the corresponding attentions. The length of
the list should be equal to the number of attentions in
`operation_order`. Default: None.
query_key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_queries]. Only used in `self_attn` layers.
Defaults to None.
key_padding_mask (Tensor): ByteTensor for `key`, with
shape [bs, num_keys]. Default: None.
Returns:
Tensor: forwarded results with shape [num_queries, bs, embed_dims].
"""
norm_index = 0
attn_index = 0
ffn_index = 0
identity = query
if attn_masks is None:
attn_masks = [None for _ in range(self.num_attn)]
elif isinstance(attn_masks, torch.Tensor):
attn_masks = [
copy.deepcopy(attn_masks) for _ in range(self.num_attn)
]
warnings.warn(f'Use the same attn_mask for all attentions in '
f'{self.__class__.__name__} ')
else:
assert len(attn_masks) == self.num_attn, f'The length of ' \
f'attn_masks {len(attn_masks)} must be equal ' \
f'to the number of attentions in ' \
f'operation_order {self.num_attn}'
for layer in self.operation_order:
if layer == 'self_attn':
temp_key = temp_value = query
query = self.attentions[attn_index](
query,
temp_key,
temp_value,
identity if self.pre_norm else None,
query_pos=query_pos,
key_pos=query_pos,
attn_mask=attn_masks[attn_index],
key_padding_mask=query_key_padding_mask,
**kwargs)
attn_index += 1
identity = query
elif layer == 'norm':
query = self.norms[norm_index](query)
norm_index += 1
elif layer == 'cross_attn':
query = self.attentions[attn_index](
query,
key,
value,
identity if self.pre_norm else None,
query_pos=query_pos,
key_pos=key_pos,
attn_mask=attn_masks[attn_index],
key_padding_mask=key_padding_mask,
**kwargs)
attn_index += 1
identity = query
elif layer == 'ffn':
query = self.ffns[ffn_index](
query, identity if self.pre_norm else None)
ffn_index += 1
return query
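# A minimal configuration sketch (hypothetical) for the class above: it builds
# a post-norm layer with one self-attention and one FFN. It assumes the
# MultiheadAttention and FFN defined in this file are registered under these
# type names; all dimensions and shapes are illustrative toy values.
def _example_base_transformer_layer():
    import torch

    layer = BaseTransformerLayer(
        attn_cfgs=dict(type='MultiheadAttention', embed_dims=256, num_heads=8),
        ffn_cfgs=dict(type='FFN', embed_dims=256, feedforward_channels=1024),
        operation_order=('self_attn', 'norm', 'ffn', 'norm'),
        batch_first=False)
    query = torch.rand(100, 2, 256)  # (num_queries, bs, embed_dims)
    # key/value are only needed for 'cross_attn'; with 'self_attn' they
    # default to the query inside forward().
    out = layer(query)
    assert out.shape == query.shape
    return out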
@TRANSFORMER_LAYER_SEQUENCE.register_module()
class TransformerLayerSequence(BaseModule):
"""Base class for TransformerEncoder and TransformerDecoder in vision
transformer.
It supports customization such as specifying different kinds
of `transformer_layer` in `transformer_coder`.
Args:
transformerlayers (list[obj:`mmcv.ConfigDict`] |
obj:`mmcv.ConfigDict`): Config of the transformer layers
in TransformerCoder. If it is an obj:`mmcv.ConfigDict`,
it will be repeated `num_layers` times to form a
list[`mmcv.ConfigDict`]. Default: None.
num_layers (int): The number of `TransformerLayer`. Default: None.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
"""
def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None):
super(TransformerLayerSequence, self).__init__(init_cfg)
if isinstance(transformerlayers, dict):
transformerlayers = [
copy.deepcopy(transformerlayers) for _ in range(num_layers)
]
else:
assert isinstance(transformerlayers, list) and \
len(transformerlayers) == num_layers
self.num_layers = num_layers
self.layers = ModuleList()
for i in range(num_layers):
self.layers.append(build_transformer_layer(transformerlayers[i]))
self.embed_dims = self.layers[0].embed_dims
self.pre_norm = self.layers[0].pre_norm
def forward(self,
query,
key,
value,
query_pos=None,
key_pos=None,
attn_masks=None,
query_key_padding_mask=None,
key_padding_mask=None,
**kwargs):
"""Forward function for `TransformerCoder`.
Args:
query (Tensor): Input query with shape
`(num_queries, bs, embed_dims)`.
key (Tensor): The key tensor with shape
`(num_keys, bs, embed_dims)`.
value (Tensor): The value tensor with shape
`(num_keys, bs, embed_dims)`.
query_pos (Tensor): The positional encoding for `query`.
Default: None.
key_pos (Tensor): The positional encoding for `key`.
Default: None.
attn_masks (List[Tensor], optional): Each element is 2D Tensor
which is used in calculation of corresponding attention in
operation_order. Default: None.
query_key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_queries]. Only used in self-attention.
Default: None.
key_padding_mask (Tensor): ByteTensor for `key`, with
shape [bs, num_keys]. Default: None.
Returns:
Tensor: results with shape [num_queries, bs, embed_dims].
"""
for layer in self.layers:
query = layer(
query,
key,
value,
query_pos=query_pos,
key_pos=key_pos,
attn_masks=attn_masks,
query_key_padding_mask=query_key_padding_mask,
key_padding_mask=key_padding_mask,
**kwargs)
return query
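# A minimal encoder sketch (hypothetical): the same layer config is stacked
# num_layers times with TransformerLayerSequence. The nested type names assume
# the registrations above; dimensions and shapes are illustrative toy values.
def _example_transformer_layer_sequence():
    import torch

    encoder = TransformerLayerSequence(
        transformerlayers=dict(
            type='BaseTransformerLayer',
            attn_cfgs=dict(
                type='MultiheadAttention', embed_dims=256, num_heads=8),
            ffn_cfgs=dict(
                type='FFN', embed_dims=256, feedforward_channels=1024),
            operation_order=('self_attn', 'norm', 'ffn', 'norm')),
        num_layers=3)
    query = torch.rand(100, 2, 256)  # (num_queries, bs, embed_dims)
    # key/value are unused here because the layers contain no 'cross_attn'.
    out = encoder(query, key=None, value=None)
    assert out.shape == query.shape
    return out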
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn
import torch.nn.functional as F
from ..utils import xavier_init
from .registry import UPSAMPLE_LAYERS
UPSAMPLE_LAYERS.register_module('nearest', module=nn.Upsample)
UPSAMPLE_LAYERS.register_module('bilinear', module=nn.Upsample)
@UPSAMPLE_LAYERS.register_module(name='pixel_shuffle')
class PixelShufflePack(nn.Module):
"""Pixel Shuffle upsample layer.
This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to
achieve a simple upsampling with pixel shuffle.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
scale_factor (int): Upsample ratio.
upsample_kernel (int): Kernel size of the conv layer to expand the
channels.
"""
def __init__(self, in_channels, out_channels, scale_factor,
upsample_kernel):
super(PixelShufflePack, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.scale_factor = scale_factor
self.upsample_kernel = upsample_kernel
self.upsample_conv = nn.Conv2d(
self.in_channels,
self.out_channels * scale_factor * scale_factor,
self.upsample_kernel,
padding=(self.upsample_kernel - 1) // 2)
self.init_weights()
def init_weights(self):
xavier_init(self.upsample_conv, distribution='uniform')
def forward(self, x):
x = self.upsample_conv(x)
x = F.pixel_shuffle(x, self.scale_factor)
return x
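# A minimal usage sketch (hypothetical) for PixelShufflePack: the conv expands
# channels by scale_factor**2 and F.pixel_shuffle folds them back into a 2x
# larger feature map. Channel counts and spatial sizes are assumed toy values.
def _example_pixel_shuffle_pack():
    import torch

    up = PixelShufflePack(
        in_channels=64, out_channels=32, scale_factor=2, upsample_kernel=3)
    x = torch.rand(1, 64, 16, 16)
    y = up(x)
    assert y.shape == (1, 32, 32, 32)  # out_channels kept, H and W doubled
    return y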
def build_upsample_layer(cfg, *args, **kwargs):
"""Build upsample layer.
Args:
cfg (dict): The upsample layer config, which should contain:
- type (str): Layer type.
- scale_factor (int): Upsample ratio, which is not applicable to
deconv.
- layer args: Args needed to instantiate an upsample layer.
args (argument list): Arguments passed to the ``__init__``
method of the corresponding upsample layer.
kwargs (keyword arguments): Keyword arguments passed to the
``__init__`` method of the corresponding upsample layer.
Returns:
nn.Module: Created upsample layer.
"""
if not isinstance(cfg, dict):
raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
if 'type' not in cfg:
raise KeyError(
f'the cfg dict must contain the key "type", but got {cfg}')
cfg_ = cfg.copy()
layer_type = cfg_.pop('type')
if layer_type not in UPSAMPLE_LAYERS:
raise KeyError(f'Unrecognized upsample type {layer_type}')
else:
upsample = UPSAMPLE_LAYERS.get(layer_type)
if upsample is nn.Upsample:
cfg_['mode'] = layer_type
layer = upsample(*args, **kwargs, **cfg_)
return layer
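# A minimal builder sketch (hypothetical): the same cfg-driven entry point
# covers nn.Upsample (where the type name doubles as the interpolation mode)
# and the registered PixelShufflePack. The concrete cfg values are
# illustrative only.
def _example_build_upsample_layer():
    bilinear = build_upsample_layer(
        dict(type='bilinear', scale_factor=2, align_corners=False))
    pixel_shuffle = build_upsample_layer(
        dict(type='pixel_shuffle', in_channels=64, out_channels=64,
             scale_factor=2, upsample_kernel=3))
    return bilinear, pixel_shuffle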
# Copyright (c) OpenMMLab. All rights reserved.
r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py # noqa: E501
Wrap some nn modules to support empty tensor input. Currently, these wrappers
are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask
heads are trained on only positive RoIs.
"""
import math
import torch
import torch.nn as nn
from torch.nn.modules.utils import _pair, _triple
from .registry import CONV_LAYERS, UPSAMPLE_LAYERS
if torch.__version__ == 'parrots':
TORCH_VERSION = torch.__version__
else:
# torch.__version__ could be 1.3.1+cu92, we only need the first two
# for comparison
TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2])
def obsolete_torch_version(torch_version, version_threshold):
return torch_version == 'parrots' or torch_version <= version_threshold
class NewEmptyTensorOp(torch.autograd.Function):
@staticmethod
def forward(ctx, x, new_shape):
ctx.shape = x.shape
return x.new_empty(new_shape)
@staticmethod
def backward(ctx, grad):
shape = ctx.shape
return NewEmptyTensorOp.apply(grad, shape), None
@CONV_LAYERS.register_module('Conv', force=True)
class Conv2d(nn.Conv2d):
def forward(self, x):
if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
out_shape = [x.shape[0], self.out_channels]
for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size,
self.padding, self.stride, self.dilation):
o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1
out_shape.append(o)
empty = NewEmptyTensorOp.apply(x, out_shape)
if self.training:
# produce dummy gradient to avoid DDP warning.
dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
return empty + dummy
else:
return empty
return super().forward(x)
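# A minimal sketch (hypothetical) of the empty-tensor path above: with zero
# samples the output shape is computed analytically per spatial dim as
# o = (i + 2*p - (d*(k-1)+1)) // s + 1, and an empty tensor of that shape is
# returned instead of running the convolution. Layer parameters and the input
# size are assumed toy values.
def _example_empty_conv2d():
    import torch

    conv = Conv2d(16, 32, kernel_size=3, stride=2, padding=1)
    x = torch.rand(0, 16, 14, 14)  # zero samples, e.g. no positive RoIs
    y = conv(x)
    # (14 + 2*1 - (1*(3-1)+1)) // 2 + 1 = 7 on both spatial dims
    assert y.shape == (0, 32, 7, 7)
    return y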
@CONV_LAYERS.register_module('Conv3d', force=True)
class Conv3d(nn.Conv3d):
def forward(self, x):
if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
out_shape = [x.shape[0], self.out_channels]
for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size,
self.padding, self.stride, self.dilation):
o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1
out_shape.append(o)
empty = NewEmptyTensorOp.apply(x, out_shape)
if self.training:
# produce dummy gradient to avoid DDP warning.
dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
return empty + dummy
else:
return empty
return super().forward(x)
@CONV_LAYERS.register_module()
@CONV_LAYERS.register_module('deconv')
@UPSAMPLE_LAYERS.register_module('deconv', force=True)
class ConvTranspose2d(nn.ConvTranspose2d):
def forward(self, x):
if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
out_shape = [x.shape[0], self.out_channels]
for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size,
self.padding, self.stride,
self.dilation, self.output_padding):
out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)
empty = NewEmptyTensorOp.apply(x, out_shape)
if self.training:
# produce dummy gradient to avoid DDP warning.
dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
return empty + dummy
else:
return empty
return super().forward(x)
@CONV_LAYERS.register_module()
@CONV_LAYERS.register_module('deconv3d')
@UPSAMPLE_LAYERS.register_module('deconv3d', force=True)
class ConvTranspose3d(nn.ConvTranspose3d):
def forward(self, x):
if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
out_shape = [x.shape[0], self.out_channels]
for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size,
self.padding, self.stride,
self.dilation, self.output_padding):
out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)
empty = NewEmptyTensorOp.apply(x, out_shape)
if self.training:
# produce dummy gradient to avoid DDP warning.
dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
return empty + dummy
else:
return empty
return super().forward(x)
class MaxPool2d(nn.MaxPool2d):
def forward(self, x):
# PyTorch 1.9 does not support empty tensor inference yet
if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)):
out_shape = list(x.shape[:2])
for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size),
_pair(self.padding), _pair(self.stride),
_pair(self.dilation)):
o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1
o = math.ceil(o) if self.ceil_mode else math.floor(o)
out_shape.append(o)
empty = NewEmptyTensorOp.apply(x, out_shape)
return empty
return super().forward(x)
class MaxPool3d(nn.MaxPool3d):
def forward(self, x):
# PyTorch 1.9 does not support empty tensor inference yet
if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)):
out_shape = list(x.shape[:2])
for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size),
_triple(self.padding),
_triple(self.stride),
_triple(self.dilation)):
o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1
o = math.ceil(o) if self.ceil_mode else math.floor(o)
out_shape.append(o)
empty = NewEmptyTensorOp.apply(x, out_shape)
return empty
return super().forward(x)
class Linear(torch.nn.Linear):
def forward(self, x):
# empty tensor forward of Linear layer is supported since PyTorch 1.6
if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 5)):
out_shape = [x.shape[0], self.out_features]
empty = NewEmptyTensorOp.apply(x, out_shape)
if self.training:
# produce dummy gradient to avoid DDP warning.
dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
return empty + dummy
else:
return empty
return super().forward(x)
# Copyright (c) OpenMMLab. All rights reserved.
from ..runner import Sequential
from ..utils import Registry, build_from_cfg
def build_model_from_cfg(cfg, registry, default_args=None):
"""Build a PyTorch model from config dict(s). Different from
``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built.
Args:
cfg (dict, list[dict]): The config of modules; it is either a config
dict or a list of config dicts. If cfg is a list, the built
modules will be wrapped with ``nn.Sequential``.
registry (:obj:`Registry`): A registry the module belongs to.
default_args (dict, optional): Default arguments to build the module.
Defaults to None.
Returns:
nn.Module: A built nn module.
"""
if isinstance(cfg, list):
modules = [
build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg
]
return Sequential(*modules)
else:
return build_from_cfg(cfg, registry, default_args)
MODELS = Registry('model', build_func=build_model_from_cfg)
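# A minimal sketch (hypothetical) of build_model_from_cfg: a single dict
# builds one module, while a list of dicts builds an ``nn.Sequential``. The
# ToyBlock class and the 'toy_model' registry exist only for this
# illustration.
def _example_build_model_from_cfg():
    import torch.nn as nn

    TOY_MODELS = Registry('toy_model', build_func=build_model_from_cfg)

    @TOY_MODELS.register_module()
    class ToyBlock(nn.Module):

        def __init__(self, channels):
            super().__init__()
            self.conv = nn.Conv2d(channels, channels, 3, padding=1)

        def forward(self, x):
            return self.conv(x)

    single = build_model_from_cfg(dict(type='ToyBlock', channels=8),
                                  TOY_MODELS)
    stacked = build_model_from_cfg(
        [dict(type='ToyBlock', channels=8),
         dict(type='ToyBlock', channels=8)], TOY_MODELS)
    return single, stacked  # stacked is a Sequential of two ToyBlocks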