Commit 85529f35 authored by unknown's avatar unknown
Browse files

添加openmmlab测试用例 (Add OpenMMLab test cases)

parent b21b0c01
import torch.nn as nn
import torch.utils.checkpoint as cp
from mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer,
constant_init)
from mmcv.utils.parrots_wrapper import _BatchNorm
from ..builder import BACKBONES
from .base_backbone import BaseBackbone
class BasicBlock(nn.Module):
    """BasicBlock for ResNet.

    Computes ``relu(norm2(conv2(relu(norm1(conv1(x))))) + identity)``,
    where ``identity`` is ``x`` or ``downsample(x)``.

    Args:
        in_channels (int): Input channels of this block.
        out_channels (int): Output channels of this block.
        expansion (int): The ratio of ``out_channels/mid_channels`` where
            ``mid_channels`` is the output channels of conv1. This is a
            reserved argument in BasicBlock and should always be 1. Default: 1.
        stride (int): stride of the block. Default: 1
        dilation (int): dilation of convolution. Default: 1
        downsample (nn.Module, optional): downsample operation on identity
            branch. Default: None.
        style (str): `pytorch` or `caffe`. It is unused and reserved for
            unified API with Bottleneck.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed.
        conv_cfg (dict, optional): dictionary to construct and config conv
            layer. Default: None
        norm_cfg (dict): dictionary to construct and config norm layer.
            Default: dict(type='BN')
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 expansion=1,
                 stride=1,
                 dilation=1,
                 downsample=None,
                 style='pytorch',
                 with_cp=False,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN')):
        super(BasicBlock, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.expansion = expansion
        # BasicBlock has no channel bottleneck, so expansion must be 1.
        assert self.expansion == 1
        assert out_channels % expansion == 0
        self.mid_channels = out_channels // expansion
        self.stride = stride
        self.dilation = dilation
        self.style = style
        self.with_cp = with_cp
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        # build_norm_layer returns a (name, module) pair; the module is
        # registered below via add_module under the generated name so that
        # state-dict keys match, and exposed through the norm1/norm2
        # properties.
        self.norm1_name, norm1 = build_norm_layer(
            norm_cfg, self.mid_channels, postfix=1)
        self.norm2_name, norm2 = build_norm_layer(
            norm_cfg, out_channels, postfix=2)
        self.conv1 = build_conv_layer(
            conv_cfg,
            in_channels,
            self.mid_channels,
            3,
            stride=stride,
            padding=dilation,
            dilation=dilation,
            bias=False)
        self.add_module(self.norm1_name, norm1)
        self.conv2 = build_conv_layer(
            conv_cfg,
            self.mid_channels,
            out_channels,
            3,
            padding=1,
            bias=False)
        self.add_module(self.norm2_name, norm2)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    @property
    def norm1(self):
        """nn.Module: the normalization layer after conv1."""
        return getattr(self, self.norm1_name)

    @property
    def norm2(self):
        """nn.Module: the normalization layer after conv2."""
        return getattr(self, self.norm2_name)

    def forward(self, x):
        """Forward pass with optional gradient checkpointing."""

        def _inner_forward(x):
            identity = x
            out = self.conv1(x)
            out = self.norm1(out)
            out = self.relu(out)
            out = self.conv2(out)
            out = self.norm2(out)
            if self.downsample is not None:
                identity = self.downsample(x)
            out += identity
            return out

        # Checkpointing trades compute for memory; it is only useful (and
        # only valid) when gradients are required.
        if self.with_cp and x.requires_grad:
            out = cp.checkpoint(_inner_forward, x)
        else:
            out = _inner_forward(x)
        out = self.relu(out)
        return out
class Bottleneck(nn.Module):
    """Bottleneck block for ResNet.

    A 1x1 (reduce) -> 3x3 -> 1x1 (expand) residual block:
    ``relu(norm3(conv3(...)) + identity)``.

    Args:
        in_channels (int): Input channels of this block.
        out_channels (int): Output channels of this block.
        expansion (int): The ratio of ``out_channels/mid_channels`` where
            ``mid_channels`` is the input/output channels of conv2. Default: 4.
        stride (int): stride of the block. Default: 1
        dilation (int): dilation of convolution. Default: 1
        downsample (nn.Module, optional): downsample operation on identity
            branch. Default: None.
        style (str): ``"pytorch"`` or ``"caffe"``. If set to "pytorch", the
            stride-two layer is the 3x3 conv layer, otherwise the stride-two
            layer is the first 1x1 conv layer. Default: "pytorch".
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed.
        conv_cfg (dict, optional): dictionary to construct and config conv
            layer. Default: None
        norm_cfg (dict): dictionary to construct and config norm layer.
            Default: dict(type='BN')
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 expansion=4,
                 stride=1,
                 dilation=1,
                 downsample=None,
                 style='pytorch',
                 with_cp=False,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN')):
        super(Bottleneck, self).__init__()
        assert style in ['pytorch', 'caffe']
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.expansion = expansion
        assert out_channels % expansion == 0
        # Width of the middle 3x3 conv.
        self.mid_channels = out_channels // expansion
        self.stride = stride
        self.dilation = dilation
        self.style = style
        self.with_cp = with_cp
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        # 'pytorch' puts the spatial stride on the 3x3 conv; 'caffe' puts
        # it on the first 1x1 conv.
        if self.style == 'pytorch':
            self.conv1_stride = 1
            self.conv2_stride = stride
        else:
            self.conv1_stride = stride
            self.conv2_stride = 1
        # Norm layers are registered under names generated by
        # build_norm_layer (via add_module below) and exposed through the
        # norm1/norm2/norm3 properties.
        self.norm1_name, norm1 = build_norm_layer(
            norm_cfg, self.mid_channels, postfix=1)
        self.norm2_name, norm2 = build_norm_layer(
            norm_cfg, self.mid_channels, postfix=2)
        self.norm3_name, norm3 = build_norm_layer(
            norm_cfg, out_channels, postfix=3)
        self.conv1 = build_conv_layer(
            conv_cfg,
            in_channels,
            self.mid_channels,
            kernel_size=1,
            stride=self.conv1_stride,
            bias=False)
        self.add_module(self.norm1_name, norm1)
        self.conv2 = build_conv_layer(
            conv_cfg,
            self.mid_channels,
            self.mid_channels,
            kernel_size=3,
            stride=self.conv2_stride,
            padding=dilation,
            dilation=dilation,
            bias=False)
        self.add_module(self.norm2_name, norm2)
        self.conv3 = build_conv_layer(
            conv_cfg,
            self.mid_channels,
            out_channels,
            kernel_size=1,
            bias=False)
        self.add_module(self.norm3_name, norm3)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    @property
    def norm1(self):
        """nn.Module: the normalization layer after conv1."""
        return getattr(self, self.norm1_name)

    @property
    def norm2(self):
        """nn.Module: the normalization layer after conv2."""
        return getattr(self, self.norm2_name)

    @property
    def norm3(self):
        """nn.Module: the normalization layer after conv3."""
        return getattr(self, self.norm3_name)

    def forward(self, x):
        """Forward pass with optional gradient checkpointing."""

        def _inner_forward(x):
            identity = x
            out = self.conv1(x)
            out = self.norm1(out)
            out = self.relu(out)
            out = self.conv2(out)
            out = self.norm2(out)
            out = self.relu(out)
            out = self.conv3(out)
            out = self.norm3(out)
            if self.downsample is not None:
                identity = self.downsample(x)
            out += identity
            return out

        if self.with_cp and x.requires_grad:
            out = cp.checkpoint(_inner_forward, x)
        else:
            out = _inner_forward(x)
        out = self.relu(out)
        return out
def get_expansion(block, expansion=None):
    """Get the expansion of a residual block.

    The expansion is resolved in the following order:

    1. If ``expansion`` is given, just return it.
    2. If ``block`` has the attribute ``expansion``, return
       ``block.expansion``.
    3. Return a default based on the block type:
       1 for ``BasicBlock`` and 4 for ``Bottleneck``.

    Args:
        block (class): The block class.
        expansion (int | None): The given expansion ratio.

    Returns:
        int: The expansion of the block.

    Raises:
        TypeError: If ``expansion`` is neither an int nor None, or if no
            expansion can be inferred from ``block``.
    """
    if expansion is not None:
        if not isinstance(expansion, int):
            raise TypeError('expansion must be an integer or None')
        assert expansion > 0
        return expansion
    if hasattr(block, 'expansion'):
        return block.expansion
    if issubclass(block, BasicBlock):
        return 1
    if issubclass(block, Bottleneck):
        return 4
    raise TypeError(f'expansion is not specified for {block.__name__}')
class ResLayer(nn.Sequential):
    """ResLayer to build ResNet style backbone.

    Args:
        block (nn.Module): Residual block used to build ResLayer.
        num_blocks (int): Number of blocks.
        in_channels (int): Input channels of this block.
        out_channels (int): Output channels of this block.
        expansion (int, optional): The expansion for BasicBlock/Bottleneck.
            If not specified, it will firstly be obtained via
            ``block.expansion``. If the block has no attribute "expansion",
            the following default values will be used: 1 for BasicBlock and
            4 for Bottleneck. Default: None.
        stride (int): stride of the first block. Default: 1.
        avg_down (bool): Use AvgPool instead of stride conv when
            downsampling in the bottleneck. Default: False
        conv_cfg (dict, optional): dictionary to construct and config conv
            layer. Default: None
        norm_cfg (dict): dictionary to construct and config norm layer.
            Default: dict(type='BN')
        **kwargs: Extra keyword arguments forwarded to every block
            (e.g. ``dilation``, ``style``, ``with_cp``).
    """

    def __init__(self,
                 block,
                 num_blocks,
                 in_channels,
                 out_channels,
                 expansion=None,
                 stride=1,
                 avg_down=False,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 **kwargs):
        # NOTE(review): these attributes are assigned before
        # nn.Sequential.__init__ is called (at the end of this method);
        # plain non-module values appear to pass through
        # nn.Module.__setattr__ safely — confirm before reordering.
        self.block = block
        self.expansion = get_expansion(block, expansion)
        # A downsample branch is needed whenever the identity's shape would
        # differ from the block output (spatial stride or channel change).
        downsample = None
        if stride != 1 or in_channels != out_channels:
            downsample = []
            conv_stride = stride
            if avg_down and stride != 1:
                # ResNet-D style: average-pool for the spatial reduction
                # and keep the 1x1 conv at stride 1.
                conv_stride = 1
                downsample.append(
                    nn.AvgPool2d(
                        kernel_size=stride,
                        stride=stride,
                        ceil_mode=True,
                        count_include_pad=False))
            downsample.extend([
                build_conv_layer(
                    conv_cfg,
                    in_channels,
                    out_channels,
                    kernel_size=1,
                    stride=conv_stride,
                    bias=False),
                build_norm_layer(norm_cfg, out_channels)[1]
            ])
            downsample = nn.Sequential(*downsample)
        layers = []
        # Only the first block strides and downsamples; the remaining
        # blocks preserve shape.
        layers.append(
            block(
                in_channels=in_channels,
                out_channels=out_channels,
                expansion=self.expansion,
                stride=stride,
                downsample=downsample,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                **kwargs))
        in_channels = out_channels
        for i in range(1, num_blocks):
            layers.append(
                block(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    expansion=self.expansion,
                    stride=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    **kwargs))
        super(ResLayer, self).__init__(*layers)
@BACKBONES.register_module()
class ResNet(BaseBackbone):
    """ResNet backbone.

    Please refer to the `paper <https://arxiv.org/abs/1512.03385>`_ for
    details.

    Args:
        depth (int): Network depth, from {18, 34, 50, 101, 152}.
        in_channels (int): Number of input image channels. Default: 3.
        stem_channels (int): Output channels of the stem layer. Default: 64.
        base_channels (int): Middle channels of the first stage. Default: 64.
        expansion (int | None): Block expansion; resolved via
            ``get_expansion`` when None. Default: None.
        num_stages (int): Stages of the network. Default: 4.
        strides (Sequence[int]): Strides of the first block of each stage.
            Default: ``(1, 2, 2, 2)``.
        dilations (Sequence[int]): Dilation of each stage.
            Default: ``(1, 1, 1, 1)``.
        out_indices (Sequence[int]): Output from which stages. If only one
            stage is specified, a single tensor (feature map) is returned,
            otherwise multiple stages are specified, a tuple of tensors will
            be returned. Default: ``(3, )``.
        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
            layer is the 3x3 conv layer, otherwise the stride-two layer is
            the first 1x1 conv layer.
        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv.
            Default: False.
        avg_down (bool): Use AvgPool instead of stride conv when
            downsampling in the bottleneck. Default: False.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters. Default: -1.
        conv_cfg (dict | None): The config dict for conv layers. Default: None.
        norm_cfg (dict): The config dict for norm layers.
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Default: False.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False.
        zero_init_residual (bool): Whether to use zero init for last norm layer
            in resblocks to let them behave as identity. Default: True.
        init_cfg (list[dict]): Initialization config forwarded to
            BaseBackbone.

    Example:
        >>> from mmcls.models import ResNet
        >>> import torch
        >>> self = ResNet(depth=18, out_indices=(0, 1, 2, 3))
        >>> self.eval()
        >>> inputs = torch.rand(1, 3, 32, 32)
        >>> level_outputs = self.forward(inputs)
        >>> for level_out in level_outputs:
        ...     print(tuple(level_out.shape))
        (1, 64, 8, 8)
        (1, 128, 4, 4)
        (1, 256, 2, 2)
        (1, 512, 1, 1)
    """

    # Map: depth -> (block class, number of blocks per stage).
    arch_settings = {
        18: (BasicBlock, (2, 2, 2, 2)),
        34: (BasicBlock, (3, 4, 6, 3)),
        50: (Bottleneck, (3, 4, 6, 3)),
        101: (Bottleneck, (3, 4, 23, 3)),
        152: (Bottleneck, (3, 8, 36, 3))
    }

    def __init__(self,
                 depth,
                 in_channels=3,
                 stem_channels=64,
                 base_channels=64,
                 expansion=None,
                 num_stages=4,
                 strides=(1, 2, 2, 2),
                 dilations=(1, 1, 1, 1),
                 out_indices=(3, ),
                 style='pytorch',
                 deep_stem=False,
                 avg_down=False,
                 frozen_stages=-1,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN', requires_grad=True),
                 norm_eval=False,
                 with_cp=False,
                 zero_init_residual=True,
                 init_cfg=[
                     dict(type='Kaiming', layer=['Conv2d']),
                     dict(
                         type='Constant',
                         val=1,
                         layer=['_BatchNorm', 'GroupNorm'])
                 ]):
        super(ResNet, self).__init__(init_cfg)
        if depth not in self.arch_settings:
            raise KeyError(f'invalid depth {depth} for resnet')
        self.depth = depth
        self.stem_channels = stem_channels
        self.base_channels = base_channels
        self.num_stages = num_stages
        assert num_stages >= 1 and num_stages <= 4
        self.strides = strides
        self.dilations = dilations
        assert len(strides) == len(dilations) == num_stages
        self.out_indices = out_indices
        assert max(out_indices) < num_stages
        self.style = style
        self.deep_stem = deep_stem
        self.avg_down = avg_down
        self.frozen_stages = frozen_stages
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.with_cp = with_cp
        self.norm_eval = norm_eval
        self.zero_init_residual = zero_init_residual
        self.block, stage_blocks = self.arch_settings[depth]
        self.stage_blocks = stage_blocks[:num_stages]
        self.expansion = get_expansion(self.block, expansion)
        self._make_stem_layer(in_channels, stem_channels)
        # Stages are registered as submodules 'layer1'..'layerN';
        # res_layers keeps their names for iteration in forward().
        self.res_layers = []
        _in_channels = stem_channels
        _out_channels = base_channels * self.expansion
        for i, num_blocks in enumerate(self.stage_blocks):
            stride = strides[i]
            dilation = dilations[i]
            res_layer = self.make_res_layer(
                block=self.block,
                num_blocks=num_blocks,
                in_channels=_in_channels,
                out_channels=_out_channels,
                expansion=self.expansion,
                stride=stride,
                dilation=dilation,
                style=self.style,
                avg_down=self.avg_down,
                with_cp=with_cp,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg)
            _in_channels = _out_channels
            # Channel width doubles at every stage.
            _out_channels *= 2
            layer_name = f'layer{i + 1}'
            self.add_module(layer_name, res_layer)
            self.res_layers.append(layer_name)
        self._freeze_stages()
        # Channel dimension of the final stage's output.
        self.feat_dim = res_layer[-1].out_channels

    def make_res_layer(self, **kwargs):
        """Build one stage; subclasses override this to inject extra
        per-block arguments."""
        return ResLayer(**kwargs)

    @property
    def norm1(self):
        """nn.Module: the normalization layer after conv1 (non-deep stem)."""
        return getattr(self, self.norm1_name)

    def _make_stem_layer(self, in_channels, stem_channels):
        """Build the stem: three stacked 3x3 ConvModules when ``deep_stem``,
        otherwise a single 7x7 conv + norm + ReLU. Both variants are
        followed by a stride-2 max pool (applied in ``forward``)."""
        if self.deep_stem:
            self.stem = nn.Sequential(
                ConvModule(
                    in_channels,
                    stem_channels // 2,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg,
                    inplace=True),
                ConvModule(
                    stem_channels // 2,
                    stem_channels // 2,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg,
                    inplace=True),
                ConvModule(
                    stem_channels // 2,
                    stem_channels,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg,
                    inplace=True))
        else:
            self.conv1 = build_conv_layer(
                self.conv_cfg,
                in_channels,
                stem_channels,
                kernel_size=7,
                stride=2,
                padding=3,
                bias=False)
            self.norm1_name, norm1 = build_norm_layer(
                self.norm_cfg, stem_channels, postfix=1)
            self.add_module(self.norm1_name, norm1)
            self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

    def _freeze_stages(self):
        """Set the stem and the first ``frozen_stages`` stages to eval mode
        and stop their gradients."""
        if self.frozen_stages >= 0:
            if self.deep_stem:
                self.stem.eval()
                for param in self.stem.parameters():
                    param.requires_grad = False
            else:
                self.norm1.eval()
                for m in [self.conv1, self.norm1]:
                    for param in m.parameters():
                        param.requires_grad = False
        for i in range(1, self.frozen_stages + 1):
            m = getattr(self, f'layer{i}')
            m.eval()
            for param in m.parameters():
                param.requires_grad = False

    def init_weights(self):
        """Initialize weights; optionally zero-init the last norm layer of
        every residual block so each block starts as an identity mapping."""
        super(ResNet, self).init_weights()
        if self.zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    constant_init(m.norm3, 0)
                elif isinstance(m, BasicBlock):
                    constant_init(m.norm2, 0)

    def forward(self, x):
        """Run the stem and all stages; return the feature maps selected by
        ``out_indices`` (a single tensor when only one index is given)."""
        if self.deep_stem:
            x = self.stem(x)
        else:
            x = self.conv1(x)
            x = self.norm1(x)
            x = self.relu(x)
        x = self.maxpool(x)
        outs = []
        for i, layer_name in enumerate(self.res_layers):
            res_layer = getattr(self, layer_name)
            x = res_layer(x)
            if i in self.out_indices:
                outs.append(x)
        if len(outs) == 1:
            return outs[0]
        else:
            return tuple(outs)

    def train(self, mode=True):
        """Switch train/eval mode, re-applying stage freezing and, when
        ``norm_eval`` is set, forcing BatchNorm layers back to eval mode."""
        super(ResNet, self).train(mode)
        self._freeze_stages()
        if mode and self.norm_eval:
            for m in self.modules():
                # eval() here only matters for BatchNorm: it freezes the
                # running statistics.
                if isinstance(m, _BatchNorm):
                    m.eval()
@BACKBONES.register_module()
class ResNetV1d(ResNet):
    """ResNetV1d variant described in `Bag of Tricks.
    <https://arxiv.org/pdf/1812.01187.pdf>`_.

    Differences from the default ResNet (ResNetV1b): the stem's single 7x7
    conv is replaced by three stacked 3x3 convs (``deep_stem``), and each
    downsampling shortcut uses a 2x2 stride-2 average pool followed by a
    stride-1 1x1 conv (``avg_down``) instead of a strided conv.
    """

    def __init__(self, **kwargs):
        super(ResNetV1d, self).__init__(deep_stem=True, avg_down=True, **kwargs)
import torch.nn as nn
from mmcv.cnn import build_conv_layer, build_norm_layer
from ..builder import BACKBONES
from .resnet import ResNet
@BACKBONES.register_module()
class ResNet_CIFAR(ResNet):
    """ResNet backbone for CIFAR.

    Compared to standard ResNet, it uses `kernel_size=3` and `stride=1` in
    conv1, and does not apply MaxPooling after stem. It has been proven to
    be more efficient than standard ResNet in other public codebase, e.g.,
    `https://github.com/kuangliu/pytorch-cifar/blob/master/models/resnet.py`.

    Args:
        depth (int): Network depth, from {18, 34, 50, 101, 152}.
        in_channels (int): Number of input image channels. Default: 3.
        stem_channels (int): Output channels of the stem layer. Default: 64.
        base_channels (int): Middle channels of the first stage. Default: 64.
        num_stages (int): Stages of the network. Default: 4.
        strides (Sequence[int]): Strides of the first block of each stage.
            Default: ``(1, 2, 2, 2)``.
        dilations (Sequence[int]): Dilation of each stage.
            Default: ``(1, 1, 1, 1)``.
        out_indices (Sequence[int]): Output from which stages. If only one
            stage is specified, a single tensor (feature map) is returned,
            otherwise multiple stages are specified, a tuple of tensors will
            be returned. Default: ``(3, )``.
        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
            layer is the 3x3 conv layer, otherwise the stride-two layer is
            the first 1x1 conv layer.
        deep_stem (bool): This network has specific designed stem, thus it is
            asserted to be False.
        avg_down (bool): Use AvgPool instead of stride conv when
            downsampling in the bottleneck. Default: False.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters. Default: -1.
        conv_cfg (dict | None): The config dict for conv layers. Default: None.
        norm_cfg (dict): The config dict for norm layers.
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Default: False.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False.
        zero_init_residual (bool): Whether to use zero init for last norm layer
            in resblocks to let them behave as identity. Default: True.
    """

    def __init__(self, depth, deep_stem=False, **kwargs):
        super(ResNet_CIFAR, self).__init__(
            depth, deep_stem=deep_stem, **kwargs)
        # The CIFAR stem replaces ResNet's stem; the deep stem is not
        # supported here.
        assert not self.deep_stem, 'ResNet_CIFAR do not support deep_stem'

    def _make_stem_layer(self, in_channels, base_channels):
        """Build the CIFAR stem: a single 3x3 stride-1 conv + norm + ReLU,
        overriding ResNet's 7x7 stride-2 stem. No max pool is created."""
        self.conv1 = build_conv_layer(
            self.conv_cfg,
            in_channels,
            base_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False)
        self.norm1_name, norm1 = build_norm_layer(
            self.norm_cfg, base_channels, postfix=1)
        self.add_module(self.norm1_name, norm1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Same as ResNet.forward except there is no max pooling after the
        stem."""
        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu(x)
        outs = []
        for i, layer_name in enumerate(self.res_layers):
            res_layer = getattr(self, layer_name)
            x = res_layer(x)
            if i in self.out_indices:
                outs.append(x)
        if len(outs) == 1:
            return outs[0]
        else:
            return tuple(outs)
from mmcv.cnn import build_conv_layer, build_norm_layer
from ..builder import BACKBONES
from .resnet import Bottleneck as _Bottleneck
from .resnet import ResLayer, ResNet
class Bottleneck(_Bottleneck):
    """Bottleneck block for ResNeXt.

    Args:
        in_channels (int): Input channels of this block.
        out_channels (int): Output channels of this block.
        base_channels (int): Reference width used to scale ``mid_channels``
            by the cardinality settings. Default: 64.
        groups (int): Groups of conv2.
        width_per_group (int): Width per group of conv2. 64x4d indicates
            ``groups=64, width_per_group=4`` and 32x8d indicates
            ``groups=32, width_per_group=8``.
        stride (int): stride of the block. Default: 1
        dilation (int): dilation of convolution. Default: 1
        downsample (nn.Module, optional): downsample operation on identity
            branch. Default: None
        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
            layer is the 3x3 conv layer, otherwise the stride-two layer is
            the first 1x1 conv layer.
        conv_cfg (dict, optional): dictionary to construct and config conv
            layer. Default: None
        norm_cfg (dict): dictionary to construct and config norm layer.
            Default: dict(type='BN')
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 base_channels=64,
                 groups=32,
                 width_per_group=4,
                 **kwargs):
        super(Bottleneck, self).__init__(in_channels, out_channels, **kwargs)
        self.groups = groups
        self.width_per_group = width_per_group
        # For ResNet bottleneck, middle channels are determined by expansion
        # and out_channels, but for ResNeXt bottleneck, it is determined by
        # groups and width_per_group and the stage it is located in.
        if groups != 1:
            assert self.mid_channels % base_channels == 0
            self.mid_channels = (
                groups * width_per_group * self.mid_channels // base_channels)
        # Rebuild conv1-3 and their norm layers with the ResNeXt widths;
        # assigning under the same attribute names replaces the modules the
        # parent constructor registered.
        self.norm1_name, norm1 = build_norm_layer(
            self.norm_cfg, self.mid_channels, postfix=1)
        self.norm2_name, norm2 = build_norm_layer(
            self.norm_cfg, self.mid_channels, postfix=2)
        self.norm3_name, norm3 = build_norm_layer(
            self.norm_cfg, self.out_channels, postfix=3)
        self.conv1 = build_conv_layer(
            self.conv_cfg,
            self.in_channels,
            self.mid_channels,
            kernel_size=1,
            stride=self.conv1_stride,
            bias=False)
        self.add_module(self.norm1_name, norm1)
        self.conv2 = build_conv_layer(
            self.conv_cfg,
            self.mid_channels,
            self.mid_channels,
            kernel_size=3,
            stride=self.conv2_stride,
            padding=self.dilation,
            dilation=self.dilation,
            groups=groups,
            bias=False)
        self.add_module(self.norm2_name, norm2)
        self.conv3 = build_conv_layer(
            self.conv_cfg,
            self.mid_channels,
            self.out_channels,
            kernel_size=1,
            bias=False)
        self.add_module(self.norm3_name, norm3)
@BACKBONES.register_module()
class ResNeXt(ResNet):
    """ResNeXt backbone.

    Please refer to the `paper <https://arxiv.org/abs/1611.05431>`_ for
    details.

    Args:
        depth (int): Network depth, from {50, 101, 152}.
        groups (int): Groups of conv2 in Bottleneck. Default: 32.
        width_per_group (int): Width per group of conv2 in Bottleneck.
            Default: 4.
        in_channels (int): Number of input image channels. Default: 3.
        stem_channels (int): Output channels of the stem layer. Default: 64.
        num_stages (int): Stages of the network. Default: 4.
        strides (Sequence[int]): Strides of the first block of each stage.
            Default: ``(1, 2, 2, 2)``.
        dilations (Sequence[int]): Dilation of each stage.
            Default: ``(1, 1, 1, 1)``.
        out_indices (Sequence[int]): Output from which stages. If only one
            stage is specified, a single tensor (feature map) is returned,
            otherwise multiple stages are specified, a tuple of tensors will
            be returned. Default: ``(3, )``.
        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
            layer is the 3x3 conv layer, otherwise the stride-two layer is
            the first 1x1 conv layer.
        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv.
            Default: False.
        avg_down (bool): Use AvgPool instead of stride conv when
            downsampling in the bottleneck. Default: False.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters. Default: -1.
        conv_cfg (dict | None): The config dict for conv layers. Default: None.
        norm_cfg (dict): The config dict for norm layers.
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Default: False.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False.
        zero_init_residual (bool): Whether to use zero init for last norm layer
            in resblocks to let them behave as identity. Default: True.
    """

    # Same stage layouts as ResNet-50/101/152, but with the grouped
    # ResNeXt Bottleneck.
    arch_settings = {
        50: (Bottleneck, (3, 4, 6, 3)),
        101: (Bottleneck, (3, 4, 23, 3)),
        152: (Bottleneck, (3, 8, 36, 3))
    }

    def __init__(self, depth, groups=32, width_per_group=4, **kwargs):
        # The cardinality settings must exist before the parent constructor
        # runs, because it builds the stages through make_res_layer(),
        # which reads them.
        self.groups = groups
        self.width_per_group = width_per_group
        super(ResNeXt, self).__init__(depth, **kwargs)

    def make_res_layer(self, **kwargs):
        """Build a ResLayer, injecting this network's cardinality settings
        into every block."""
        cardinality_cfg = dict(
            groups=self.groups,
            width_per_group=self.width_per_group,
            base_channels=self.base_channels)
        return ResLayer(**cardinality_cfg, **kwargs)
import torch.utils.checkpoint as cp
from ..builder import BACKBONES
from ..utils.se_layer import SELayer
from .resnet import Bottleneck, ResLayer, ResNet
class SEBottleneck(Bottleneck):
    """SEBottleneck block for SEResNet.

    Same as ResNet's Bottleneck, but applies an SE (squeeze-and-excitation)
    layer to the residual branch before the identity addition.

    Args:
        in_channels (int): The input channels of the SEBottleneck block.
        out_channels (int): The output channel of the SEBottleneck block.
        se_ratio (int): Squeeze ratio in SELayer. Default: 16
    """

    def __init__(self, in_channels, out_channels, se_ratio=16, **kwargs):
        super(SEBottleneck, self).__init__(in_channels, out_channels, **kwargs)
        self.se_layer = SELayer(out_channels, ratio=se_ratio)

    def forward(self, x):
        """Forward pass with optional gradient checkpointing."""

        def _inner_forward(x):
            identity = x
            out = self.conv1(x)
            out = self.norm1(out)
            out = self.relu(out)
            out = self.conv2(out)
            out = self.norm2(out)
            out = self.relu(out)
            out = self.conv3(out)
            out = self.norm3(out)
            # Channel-wise recalibration before the residual addition.
            out = self.se_layer(out)
            if self.downsample is not None:
                identity = self.downsample(x)
            out += identity
            return out

        if self.with_cp and x.requires_grad:
            out = cp.checkpoint(_inner_forward, x)
        else:
            out = _inner_forward(x)
        out = self.relu(out)
        return out
@BACKBONES.register_module()
class SEResNet(ResNet):
    """SEResNet backbone.

    Please refer to the `paper <https://arxiv.org/abs/1709.01507>`_ for
    details.

    Args:
        depth (int): Network depth, from {50, 101, 152}.
        se_ratio (int): Squeeze ratio in SELayer. Default: 16.
        in_channels (int): Number of input image channels. Default: 3.
        stem_channels (int): Output channels of the stem layer. Default: 64.
        num_stages (int): Stages of the network. Default: 4.
        strides (Sequence[int]): Strides of the first block of each stage.
            Default: ``(1, 2, 2, 2)``.
        dilations (Sequence[int]): Dilation of each stage.
            Default: ``(1, 1, 1, 1)``.
        out_indices (Sequence[int]): Output from which stages. If only one
            stage is specified, a single tensor (feature map) is returned,
            otherwise multiple stages are specified, a tuple of tensors will
            be returned. Default: ``(3, )``.
        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
            layer is the 3x3 conv layer, otherwise the stride-two layer is
            the first 1x1 conv layer.
        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv.
            Default: False.
        avg_down (bool): Use AvgPool instead of stride conv when
            downsampling in the bottleneck. Default: False.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters. Default: -1.
        conv_cfg (dict | None): The config dict for conv layers. Default: None.
        norm_cfg (dict): The config dict for norm layers.
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Default: False.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False.
        zero_init_residual (bool): Whether to use zero init for last norm layer
            in resblocks to let them behave as identity. Default: True.

    Example:
        >>> from mmcls.models import SEResNet
        >>> import torch
        >>> self = SEResNet(depth=50)
        >>> self.eval()
        >>> inputs = torch.rand(1, 3, 224, 224)
        >>> level_outputs = self.forward(inputs)
        >>> for level_out in level_outputs:
        ...     print(tuple(level_out.shape))
        (1, 64, 56, 56)
        (1, 128, 28, 28)
        (1, 256, 14, 14)
        (1, 512, 7, 7)
    """

    # Same stage layouts as ResNet-50/101/152, but with SEBottleneck.
    arch_settings = {
        50: (SEBottleneck, (3, 4, 6, 3)),
        101: (SEBottleneck, (3, 4, 23, 3)),
        152: (SEBottleneck, (3, 8, 36, 3))
    }

    def __init__(self, depth, se_ratio=16, **kwargs):
        if depth not in self.arch_settings:
            raise KeyError(f'invalid depth {depth} for SEResNet')
        # Stored before the parent constructor runs; make_res_layer(),
        # called during super().__init__, reads it.
        self.se_ratio = se_ratio
        super(SEResNet, self).__init__(depth, **kwargs)

    def make_res_layer(self, **kwargs):
        """Build a ResLayer whose blocks receive this network's SE ratio."""
        se_cfg = dict(se_ratio=self.se_ratio)
        return ResLayer(**se_cfg, **kwargs)
from mmcv.cnn import build_conv_layer, build_norm_layer
from ..builder import BACKBONES
from .resnet import ResLayer
from .seresnet import SEBottleneck as _SEBottleneck
from .seresnet import SEResNet
class SEBottleneck(_SEBottleneck):
    """SEBottleneck block for SEResNeXt.

    Args:
        in_channels (int): Input channels of this block.
        out_channels (int): Output channels of this block.
        base_channels (int): Middle channels of the first stage. Default: 64.
        groups (int): Groups of conv2.
        width_per_group (int): Width per group of conv2. 64x4d indicates
            ``groups=64, width_per_group=4`` and 32x8d indicates
            ``groups=32, width_per_group=8``.
        stride (int): stride of the block. Default: 1
        dilation (int): dilation of convolution. Default: 1
        downsample (nn.Module, optional): downsample operation on identity
            branch. Default: None
        se_ratio (int): Squeeze ratio in SELayer. Default: 16
        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
            layer is the 3x3 conv layer, otherwise the stride-two layer is
            the first 1x1 conv layer.
        conv_cfg (dict, optional): dictionary to construct and config conv
            layer. Default: None
        norm_cfg (dict): dictionary to construct and config norm layer.
            Default: dict(type='BN')
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 base_channels=64,
                 groups=32,
                 width_per_group=4,
                 se_ratio=16,
                 **kwargs):
        super(SEBottleneck, self).__init__(in_channels, out_channels, se_ratio,
                                           **kwargs)
        self.groups = groups
        self.width_per_group = width_per_group
        # We follow the same rational of ResNext to compute mid_channels.
        # For SEResNet bottleneck, middle channels are determined by expansion
        # and out_channels, but for SEResNeXt bottleneck, it is determined by
        # groups and width_per_group and the stage it is located in.
        if groups != 1:
            assert self.mid_channels % base_channels == 0
            self.mid_channels = (
                groups * width_per_group * self.mid_channels // base_channels)
        # Rebuild conv1-3 and their norm layers with the ResNeXt widths;
        # assigning under the same attribute names replaces the modules the
        # parent constructor registered.
        self.norm1_name, norm1 = build_norm_layer(
            self.norm_cfg, self.mid_channels, postfix=1)
        self.norm2_name, norm2 = build_norm_layer(
            self.norm_cfg, self.mid_channels, postfix=2)
        self.norm3_name, norm3 = build_norm_layer(
            self.norm_cfg, self.out_channels, postfix=3)
        self.conv1 = build_conv_layer(
            self.conv_cfg,
            self.in_channels,
            self.mid_channels,
            kernel_size=1,
            stride=self.conv1_stride,
            bias=False)
        self.add_module(self.norm1_name, norm1)
        self.conv2 = build_conv_layer(
            self.conv_cfg,
            self.mid_channels,
            self.mid_channels,
            kernel_size=3,
            stride=self.conv2_stride,
            padding=self.dilation,
            dilation=self.dilation,
            groups=groups,
            bias=False)
        self.add_module(self.norm2_name, norm2)
        self.conv3 = build_conv_layer(
            self.conv_cfg,
            self.mid_channels,
            self.out_channels,
            kernel_size=1,
            bias=False)
        self.add_module(self.norm3_name, norm3)
@BACKBONES.register_module()
class SEResNeXt(SEResNet):
    """SEResNeXt backbone.

    Please refer to the `paper <https://arxiv.org/abs/1709.01507>`_ for
    details.

    Args:
        depth (int): Network depth, from {50, 101, 152}.
        groups (int): Groups of conv2 in Bottleneck. Default: 32.
        width_per_group (int): Width per group of conv2 in Bottleneck.
            Default: 4.
        se_ratio (int): Squeeze ratio in SELayer. Default: 16.
        in_channels (int): Number of input image channels. Default: 3.
        stem_channels (int): Output channels of the stem layer. Default: 64.
        num_stages (int): Stages of the network. Default: 4.
        strides (Sequence[int]): Strides of the first block of each stage.
            Default: ``(1, 2, 2, 2)``.
        dilations (Sequence[int]): Dilation of each stage.
            Default: ``(1, 1, 1, 1)``.
        out_indices (Sequence[int]): Output from which stages. If only one
            stage is specified, a single tensor (feature map) is returned,
            otherwise multiple stages are specified, a tuple of tensors will
            be returned. Default: ``(3, )``.
        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
            layer is the 3x3 conv layer, otherwise the stride-two layer is
            the first 1x1 conv layer.
        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv.
            Default: False.
        avg_down (bool): Use AvgPool instead of stride conv when
            downsampling in the bottleneck. Default: False.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters. Default: -1.
        conv_cfg (dict | None): The config dict for conv layers. Default: None.
        norm_cfg (dict): The config dict for norm layers.
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Default: False.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False.
        zero_init_residual (bool): Whether to use zero init for last norm layer
            in resblocks to let them behave as identity. Default: True.
    """

    # Same stage layouts as SEResNet, but with the grouped SEBottleneck.
    arch_settings = {
        50: (SEBottleneck, (3, 4, 6, 3)),
        101: (SEBottleneck, (3, 4, 23, 3)),
        152: (SEBottleneck, (3, 8, 36, 3))
    }

    def __init__(self, depth, groups=32, width_per_group=4, **kwargs):
        # The cardinality settings must exist before the parent constructor
        # runs, because it builds the stages through make_res_layer(),
        # which reads them.
        self.groups = groups
        self.width_per_group = width_per_group
        super(SEResNeXt, self).__init__(depth, **kwargs)

    def make_res_layer(self, **kwargs):
        """Build a ResLayer, injecting this network's cardinality settings
        into every block."""
        cardinality_cfg = dict(
            groups=self.groups,
            width_per_group=self.width_per_group,
            base_channels=self.base_channels)
        return ResLayer(**cardinality_cfg, **kwargs)
import torch
import torch.nn as nn
import torch.utils.checkpoint as cp
from mmcv.cnn import (ConvModule, build_activation_layer, constant_init,
normal_init)
from torch.nn.modules.batchnorm import _BatchNorm
from mmcls.models.utils import channel_shuffle, make_divisible
from ..builder import BACKBONES
from .base_backbone import BaseBackbone
class ShuffleUnit(nn.Module):
    """ShuffleUnit block.

    ShuffleNet unit with pointwise group convolution (GConv) and channel
    shuffle.

    Args:
        in_channels (int): The input channels of the ShuffleUnit.
        out_channels (int): The output channels of the ShuffleUnit.
        groups (int): The number of groups to be used in grouped 1x1
            convolutions in each ShuffleUnit. Default: 3
        first_block (bool): Whether it is the first ShuffleUnit of a
            sequential ShuffleUnits. Default: True, which means using a
            plain (ungrouped) 1x1 convolution for the compress stage.
        combine (str): The way to combine the input and output
            branches, either 'add' or 'concat'. Default: 'add'.
        conv_cfg (dict, optional): Config dict for convolution layer.
            Default: None, which means using conv2d.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='BN').
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='ReLU').
        with_cp (bool): Use checkpoint or not. Using checkpoint
            will save some memory while slowing down the training speed.
            Default: False.

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 groups=3,
                 first_block=True,
                 combine='add',
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 with_cp=False):
        super(ShuffleUnit, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.first_block = first_block
        self.combine = combine
        self.groups = groups
        # Bottleneck is 1/4 of the output width, as in the ShuffleNet paper.
        self.bottleneck_channels = self.out_channels // 4
        self.with_cp = with_cp
        if self.combine == 'add':
            # 'add' keeps spatial size: stride-1 depthwise conv + residual add.
            self.depthwise_stride = 1
            self._combine_func = self._add
            assert in_channels == out_channels, (
                'in_channels must be equal to out_channels when combine '
                'is add')
        elif self.combine == 'concat':
            # 'concat' downsamples by 2 and concatenates with an avg-pooled
            # shortcut; shrink out_channels so the concat result matches.
            self.depthwise_stride = 2
            self._combine_func = self._concat
            self.out_channels -= self.in_channels
            self.avgpool = nn.AvgPool2d(kernel_size=3, stride=2, padding=1)
        else:
            raise ValueError(f'Cannot combine tensors with {self.combine}. '
                             'Only "add" and "concat" are supported')
        # The very first unit of the network takes raw stem features and uses
        # an ungrouped 1x1 compress conv; later units use grouped 1x1 convs.
        self.first_1x1_groups = 1 if first_block else self.groups
        # 1x1 (grouped) compress conv: in_channels -> bottleneck_channels.
        self.g_conv_1x1_compress = ConvModule(
            in_channels=self.in_channels,
            out_channels=self.bottleneck_channels,
            kernel_size=1,
            groups=self.first_1x1_groups,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        # 3x3 depthwise conv (groups == channels); no activation, BN only.
        self.depthwise_conv3x3_bn = ConvModule(
            in_channels=self.bottleneck_channels,
            out_channels=self.bottleneck_channels,
            kernel_size=3,
            stride=self.depthwise_stride,
            padding=1,
            groups=self.bottleneck_channels,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None)
        # 1x1 grouped expand conv back to out_channels; activation is applied
        # later in forward(), whose placement depends on the combine mode.
        self.g_conv_1x1_expand = ConvModule(
            in_channels=self.bottleneck_channels,
            out_channels=self.out_channels,
            kernel_size=1,
            groups=self.groups,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None)
        self.act = build_activation_layer(act_cfg)

    @staticmethod
    def _add(x, out):
        # residual connection
        return x + out

    @staticmethod
    def _concat(x, out):
        # concatenate along channel axis
        return torch.cat((x, out), 1)

    def forward(self, x):

        def _inner_forward(x):
            residual = x
            out = self.g_conv_1x1_compress(x)
            out = self.depthwise_conv3x3_bn(out)
            # Shuffle channels across groups so information flows between
            # the grouped convolutions; a no-op when groups == 1.
            if self.groups > 1:
                out = channel_shuffle(out, self.groups)
            out = self.g_conv_1x1_expand(out)
            if self.combine == 'concat':
                # Downsample the shortcut to match, activate before concat.
                residual = self.avgpool(residual)
                out = self.act(out)
                out = self._combine_func(residual, out)
            else:
                # 'add' mode: activate after the residual addition.
                out = self._combine_func(residual, out)
                out = self.act(out)
            return out

        # Gradient checkpointing trades compute for memory during training.
        if self.with_cp and x.requires_grad:
            out = cp.checkpoint(_inner_forward, x)
        else:
            out = _inner_forward(x)
        return out
@BACKBONES.register_module()
class ShuffleNetV1(BaseBackbone):
    """ShuffleNetV1 backbone.

    Args:
        groups (int): The number of groups to be used in grouped 1x1
            convolutions in each ShuffleUnit. Default: 3.
        widen_factor (float): Width multiplier - adjusts the number
            of channels in each layer by this amount. Default: 1.0.
        out_indices (Sequence[int]): Output from which stages.
            Default: (2, )
        frozen_stages (int): Stages to be frozen (all param fixed).
            Default: -1, which means not freezing any parameters.
        conv_cfg (dict, optional): Config dict for convolution layer.
            Default: None, which means using conv2d.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='BN').
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='ReLU').
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Default: False.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False.
        init_cfg (dict | list[dict], optional): Initialization config dict
            forwarded to ``BaseBackbone``. Default: None.
    """

    def __init__(self,
                 groups=3,
                 widen_factor=1.0,
                 out_indices=(2, ),
                 frozen_stages=-1,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 norm_eval=False,
                 with_cp=False,
                 init_cfg=None):
        super(ShuffleNetV1, self).__init__(init_cfg)
        self.stage_blocks = [4, 8, 4]
        self.groups = groups

        for index in out_indices:
            if index not in range(0, 3):
                raise ValueError('the item in out_indices must in '
                                 f'range(0, 3). But received {index}')
        if frozen_stages not in range(-1, 3):
            raise ValueError('frozen_stages must be in range(-1, 3). '
                             f'But received {frozen_stages}')
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.norm_eval = norm_eval
        self.with_cp = with_cp

        # Stage widths are tied to the number of 1x1 groups (ShuffleNetV1
        # paper, Table 1); only these group counts are defined.
        if groups == 1:
            channels = (144, 288, 576)
        elif groups == 2:
            channels = (200, 400, 800)
        elif groups == 3:
            channels = (240, 480, 960)
        elif groups == 4:
            channels = (272, 544, 1088)
        elif groups == 8:
            channels = (384, 768, 1536)
        else:
            raise ValueError(f'{groups} groups is not supported for 1x1 '
                             'Grouped Convolutions')

        channels = [make_divisible(ch * widen_factor, 8) for ch in channels]

        self.in_channels = int(24 * widen_factor)

        # Stem: 3x3 stride-2 conv followed by 3x3 stride-2 max-pool.
        self.conv1 = ConvModule(
            in_channels=3,
            out_channels=self.in_channels,
            kernel_size=3,
            stride=2,
            padding=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layers = nn.ModuleList()
        for i, num_blocks in enumerate(self.stage_blocks):
            # Only the first unit of the first stage uses an ungrouped
            # 1x1 compress conv (its input is the 24-channel stem).
            first_block = True if i == 0 else False
            layer = self.make_layer(channels[i], num_blocks, first_block)
            self.layers.append(layer)

    def _freeze_stages(self):
        """Freeze the stem and the first ``frozen_stages`` stages."""
        if self.frozen_stages >= 0:
            for param in self.conv1.parameters():
                param.requires_grad = False
        for i in range(self.frozen_stages):
            layer = self.layers[i]
            layer.eval()
            for param in layer.parameters():
                param.requires_grad = False

    def init_weights(self):
        """Initialize conv and norm weights (unless loaded via init_cfg)."""
        super(ShuffleNetV1, self).init_weights()
        for name, m in self.named_modules():
            if isinstance(m, nn.Conv2d):
                if 'conv1' in name:
                    normal_init(m, mean=0, std=0.01)
                else:
                    # Scale std by fan-in (input channels) for deeper convs.
                    normal_init(m, mean=0, std=1.0 / m.weight.shape[1])
            elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
                # BUG FIX: mmcv's ``constant_init`` expects the *module*
                # (it sets ``module.weight`` / ``module.bias``). Passing
                # ``m.weight`` (a Tensor) made this a silent no-op.
                constant_init(m, val=1, bias=0.0001)
                if isinstance(m, _BatchNorm):
                    if m.running_mean is not None:
                        nn.init.constant_(m.running_mean, 0)

    def make_layer(self, out_channels, num_blocks, first_block=False):
        """Stack ShuffleUnit blocks to make a layer.

        Args:
            out_channels (int): out_channels of the block.
            num_blocks (int): Number of blocks.
            first_block (bool): Whether is the first ShuffleUnit of a
                sequential ShuffleUnits. Default: False, which means using
                the grouped 1x1 convolution.
        """
        layers = []
        for i in range(num_blocks):
            # Only the stage's first unit may be 'first_block'; it is also
            # the only downsampling ('concat') unit in the stage.
            first_block = first_block if i == 0 else False
            combine_mode = 'concat' if i == 0 else 'add'
            layers.append(
                ShuffleUnit(
                    self.in_channels,
                    out_channels,
                    groups=self.groups,
                    first_block=first_block,
                    combine=combine_mode,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg,
                    with_cp=self.with_cp))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        """Forward pass; returns a single tensor or a tuple per out_indices."""
        x = self.conv1(x)
        x = self.maxpool(x)
        outs = []
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i in self.out_indices:
                outs.append(x)
        if len(outs) == 1:
            return outs[0]
        else:
            return tuple(outs)

    def train(self, mode=True):
        """Switch train/eval mode, re-freezing stages and norms as needed."""
        super(ShuffleNetV1, self).train(mode)
        self._freeze_stages()
        if mode and self.norm_eval:
            for m in self.modules():
                if isinstance(m, _BatchNorm):
                    m.eval()
import torch
import torch.nn as nn
import torch.utils.checkpoint as cp
from mmcv.cnn import ConvModule, constant_init, normal_init
from torch.nn.modules.batchnorm import _BatchNorm
from mmcls.models.utils import channel_shuffle
from ..builder import BACKBONES
from .base_backbone import BaseBackbone
class InvertedResidual(nn.Module):
    """InvertedResidual block for ShuffleNetV2 backbone.

    Args:
        in_channels (int): The input channels of the block.
        out_channels (int): The output channels of the block.
        stride (int): Stride of the 3x3 convolution layer. Default: 1
        conv_cfg (dict, optional): Config dict for convolution layer.
            Default: None, which means using conv2d.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='BN').
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='ReLU').
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False.

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride=1,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 with_cp=False):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        self.with_cp = with_cp
        # Each of the two branches carries half of the output channels.
        branch_features = out_channels // 2
        # When stride == 1 the input is split in half and only one half is
        # transformed, so the input width must equal the full output width.
        if self.stride == 1:
            assert in_channels == branch_features * 2, (
                f'in_channels ({in_channels}) should equal to '
                f'branch_features * 2 ({branch_features * 2}) '
                'when stride is 1')
        if in_channels != branch_features * 2:
            assert self.stride != 1, (
                f'stride ({self.stride}) should not equal 1 when '
                f'in_channels != branch_features * 2')
        # branch1 (only for downsampling units): depthwise 3x3 + 1x1 conv
        # applied to the whole input, producing half the output channels.
        if self.stride > 1:
            self.branch1 = nn.Sequential(
                ConvModule(
                    in_channels,
                    in_channels,
                    kernel_size=3,
                    stride=self.stride,
                    padding=1,
                    groups=in_channels,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=None),
                ConvModule(
                    in_channels,
                    branch_features,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg),
            )
        # branch2: 1x1 -> depthwise 3x3 (no act) -> 1x1. At stride 1 it sees
        # only the second half of the channel-split input.
        self.branch2 = nn.Sequential(
            ConvModule(
                in_channels if (self.stride > 1) else branch_features,
                branch_features,
                kernel_size=1,
                stride=1,
                padding=0,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg),
            ConvModule(
                branch_features,
                branch_features,
                kernel_size=3,
                stride=self.stride,
                padding=1,
                groups=branch_features,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=None),
            ConvModule(
                branch_features,
                branch_features,
                kernel_size=1,
                stride=1,
                padding=0,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg))

    def forward(self, x):

        def _inner_forward(x):
            if self.stride > 1:
                # Downsampling unit: both branches process the full input.
                out = torch.cat((self.branch1(x), self.branch2(x)), dim=1)
            else:
                # Regular unit: split channels; one half passes through
                # untouched (the "identity" branch of ShuffleNetV2).
                x1, x2 = x.chunk(2, dim=1)
                out = torch.cat((x1, self.branch2(x2)), dim=1)
            # Mix information between the two halves.
            out = channel_shuffle(out, 2)
            return out

        # Gradient checkpointing trades compute for memory during training.
        if self.with_cp and x.requires_grad:
            out = cp.checkpoint(_inner_forward, x)
        else:
            out = _inner_forward(x)
        return out
@BACKBONES.register_module()
class ShuffleNetV2(BaseBackbone):
    """ShuffleNetV2 backbone.

    Args:
        widen_factor (float): Width multiplier - adjusts the number of
            channels in each layer by this amount. Default: 1.0.
        out_indices (Sequence[int]): Output from which stages.
            Default: (3, ), i.e. the final 1x1 conv layer.
        frozen_stages (int): Stages to be frozen (all param fixed).
            Default: -1, which means not freezing any parameters.
        conv_cfg (dict, optional): Config dict for convolution layer.
            Default: None, which means using conv2d.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='BN').
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='ReLU').
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Default: False.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False.
        init_cfg (dict | list[dict], optional): Initialization config dict
            forwarded to ``BaseBackbone``. Default: None.
    """

    def __init__(self,
                 widen_factor=1.0,
                 out_indices=(3, ),
                 frozen_stages=-1,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 norm_eval=False,
                 with_cp=False,
                 init_cfg=None):
        super(ShuffleNetV2, self).__init__(init_cfg)
        self.stage_blocks = [4, 8, 4]

        # index 3 refers to the extra 1x1 conv appended after the stages.
        for index in out_indices:
            if index not in range(0, 4):
                raise ValueError('the item in out_indices must in '
                                 f'range(0, 4). But received {index}')
        if frozen_stages not in range(-1, 4):
            raise ValueError('frozen_stages must be in range(-1, 4). '
                             f'But received {frozen_stages}')
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.norm_eval = norm_eval
        self.with_cp = with_cp

        # Per-width channel settings from the ShuffleNetV2 paper (Table 5).
        if widen_factor == 0.5:
            channels = [48, 96, 192, 1024]
        elif widen_factor == 1.0:
            channels = [116, 232, 464, 1024]
        elif widen_factor == 1.5:
            channels = [176, 352, 704, 1024]
        elif widen_factor == 2.0:
            channels = [244, 488, 976, 2048]
        else:
            raise ValueError('widen_factor must be in [0.5, 1.0, 1.5, 2.0]. '
                             f'But received {widen_factor}')

        self.in_channels = 24
        # Stem: 3x3 stride-2 conv followed by 3x3 stride-2 max-pool.
        self.conv1 = ConvModule(
            in_channels=3,
            out_channels=self.in_channels,
            kernel_size=3,
            stride=2,
            padding=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layers = nn.ModuleList()
        for i, num_blocks in enumerate(self.stage_blocks):
            layer = self._make_layer(channels[i], num_blocks)
            self.layers.append(layer)

        # Final 1x1 conv expanding to the last channel setting; registered
        # as a fourth "layer" so out_indices / frozen_stages can address it.
        output_channels = channels[-1]
        self.layers.append(
            ConvModule(
                in_channels=self.in_channels,
                out_channels=output_channels,
                kernel_size=1,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg))

    def _make_layer(self, out_channels, num_blocks):
        """Stack blocks to make a layer.

        Args:
            out_channels (int): out_channels of the block.
            num_blocks (int): number of blocks.
        """
        layers = []
        for i in range(num_blocks):
            # The first unit of each stage downsamples (stride 2).
            stride = 2 if i == 0 else 1
            layers.append(
                InvertedResidual(
                    in_channels=self.in_channels,
                    out_channels=out_channels,
                    stride=stride,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg,
                    with_cp=self.with_cp))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def _freeze_stages(self):
        """Freeze the stem and the first ``frozen_stages`` layers."""
        if self.frozen_stages >= 0:
            for param in self.conv1.parameters():
                param.requires_grad = False
        for i in range(self.frozen_stages):
            m = self.layers[i]
            m.eval()
            for param in m.parameters():
                param.requires_grad = False

    def init_weights(self):
        """Initialize conv and norm weights (unless loaded via init_cfg).

        BUG FIX: this method was misspelled ``init_weighs`` and therefore
        never invoked by the framework; renamed to ``init_weights`` so it
        properly overrides the base class hook.
        """
        super(ShuffleNetV2, self).init_weights()
        for name, m in self.named_modules():
            if isinstance(m, nn.Conv2d):
                if 'conv1' in name:
                    normal_init(m, mean=0, std=0.01)
                else:
                    # Scale std by fan-in (input channels) for deeper convs.
                    normal_init(m, mean=0, std=1.0 / m.weight.shape[1])
            elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
                # BUG FIX: mmcv's ``constant_init`` expects the *module*
                # (it sets ``module.weight`` / ``module.bias``). Passing
                # ``m.weight`` (a Tensor) made this a silent no-op.
                constant_init(m, val=1, bias=0.0001)
                if isinstance(m, _BatchNorm):
                    if m.running_mean is not None:
                        nn.init.constant_(m.running_mean, 0)

    def forward(self, x):
        """Forward pass; returns a single tensor or a tuple per out_indices."""
        x = self.conv1(x)
        x = self.maxpool(x)
        outs = []
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i in self.out_indices:
                outs.append(x)
        if len(outs) == 1:
            return outs[0]
        else:
            return tuple(outs)

    def train(self, mode=True):
        """Switch train/eval mode, re-freezing stages and norms as needed."""
        super(ShuffleNetV2, self).train(mode)
        self._freeze_stages()
        if mode and self.norm_eval:
            for m in self.modules():
                # Use _BatchNorm so all BN variants are covered, consistent
                # with ShuffleNetV1.train() and the norm_eval docstring
                # (the original checked only nn.BatchNorm2d).
                if isinstance(m, _BatchNorm):
                    m.eval()
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmcv.utils.parrots_wrapper import _BatchNorm
from ..builder import BACKBONES
from .base_backbone import BaseBackbone
def make_vgg_layer(in_channels,
                   out_channels,
                   num_blocks,
                   conv_cfg=None,
                   norm_cfg=None,
                   act_cfg=dict(type='ReLU'),
                   dilation=1,
                   with_norm=False,
                   ceil_mode=False):
    """Build one VGG stage: ``num_blocks`` 3x3 convs plus a 2x2 max-pool.

    Returns a plain list of modules (not an ``nn.Sequential``) so the
    caller can index individual sub-modules.

    NOTE(review): ``with_norm`` is accepted but unused here; normalization
    appears to be controlled entirely by ``norm_cfg`` — confirm against
    callers before removing the parameter.
    """
    modules = []
    channels = in_channels
    for _ in range(num_blocks):
        modules.append(
            ConvModule(
                in_channels=channels,
                out_channels=out_channels,
                kernel_size=3,
                dilation=dilation,
                padding=dilation,
                bias=True,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg))
        channels = out_channels
    modules.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode))
    return modules
@BACKBONES.register_module()
class VGG(BaseBackbone):
    """VGG backbone.

    Args:
        depth (int): Depth of vgg, from {11, 13, 16, 19}.
        with_norm (bool): Use BatchNorm or not.
        num_classes (int): number of classes for classification.
        num_stages (int): VGG stages, normally 5.
        dilations (Sequence[int]): Dilation of each stage.
        out_indices (Sequence[int], optional): Output from which stages.
            If only one stage is specified, a single tensor (feature map) is
            returned, otherwise multiple stages are specified, a tuple of
            tensors will be returned. When it is None, the default behavior
            depends on whether num_classes is specified. If num_classes <= 0,
            the default value is (4, ), outputting the last feature map
            before classifier. If num_classes > 0, the default value is
            (5, ), outputting the classification score. Default: None.
        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
            not freezing any parameters.
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Default: False.
        ceil_mode (bool): Whether to use ceil_mode of MaxPool. Default: False.
        with_last_pool (bool): Whether to keep the last pooling before
            classifier. Default: True.
    """
    # Parameters to build layers. Each element specifies the number of conv in
    # each stage. For example, VGG11 contains 11 layers with learnable
    # parameters. 11 is computed as 11 = (1 + 1 + 2 + 2 + 2) + 3,
    # where 3 indicates the last three fully-connected layers.
    arch_settings = {
        11: (1, 1, 2, 2, 2),
        13: (2, 2, 2, 2, 2),
        16: (2, 2, 3, 3, 3),
        19: (2, 2, 4, 4, 4)
    }

    def __init__(self,
                 depth,
                 num_classes=-1,
                 num_stages=5,
                 dilations=(1, 1, 1, 1, 1),
                 out_indices=None,
                 frozen_stages=-1,
                 conv_cfg=None,
                 norm_cfg=None,
                 act_cfg=dict(type='ReLU'),
                 norm_eval=False,
                 ceil_mode=False,
                 with_last_pool=True,
                 init_cfg=[
                     dict(type='Kaiming', layer=['Conv2d']),
                     dict(type='Constant', val=1., layer=['_BatchNorm']),
                     dict(type='Normal', std=0.01, layer=['Linear'])
                 ]):
        super(VGG, self).__init__(init_cfg)
        if depth not in self.arch_settings:
            raise KeyError(f'invalid depth {depth} for vgg')
        assert num_stages >= 1 and num_stages <= 5
        stage_blocks = self.arch_settings[depth]
        self.stage_blocks = stage_blocks[:num_stages]
        assert len(dilations) == num_stages
        self.num_classes = num_classes
        self.frozen_stages = frozen_stages
        self.norm_eval = norm_eval
        # Normalization is enabled simply by providing a norm_cfg.
        with_norm = norm_cfg is not None
        if out_indices is None:
            # Index 5 addresses the classifier output appended in forward().
            out_indices = (5, ) if num_classes > 0 else (4, )
        assert max(out_indices) <= num_stages
        self.out_indices = out_indices
        self.in_channels = 3
        start_idx = 0
        vgg_layers = []
        # range_sub_modules[i] = [start, end) slice of vgg_layers for stage i;
        # used by forward() and _freeze_stages() to address whole stages.
        self.range_sub_modules = []
        for i, num_blocks in enumerate(self.stage_blocks):
            # +1 accounts for the max-pool appended after each stage's convs.
            num_modules = num_blocks + 1
            end_idx = start_idx + num_modules
            dilation = dilations[i]
            # Channel widths double per stage: 64, 128, 256, 512, 512.
            out_channels = 64 * 2**i if i < 4 else 512
            vgg_layer = make_vgg_layer(
                self.in_channels,
                out_channels,
                num_blocks,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg,
                dilation=dilation,
                with_norm=with_norm,
                ceil_mode=ceil_mode)
            vgg_layers.extend(vgg_layer)
            self.in_channels = out_channels
            self.range_sub_modules.append([start_idx, end_idx])
            start_idx = end_idx
        if not with_last_pool:
            # Drop the final max-pool and shrink the last stage's range.
            vgg_layers.pop(-1)
            self.range_sub_modules[-1][1] -= 1
        self.module_name = 'features'
        self.add_module(self.module_name, nn.Sequential(*vgg_layers))
        if self.num_classes > 0:
            # Classic VGG head; assumes 7x7 spatial input to the first FC.
            self.classifier = nn.Sequential(
                nn.Linear(512 * 7 * 7, 4096),
                nn.ReLU(True),
                nn.Dropout(),
                nn.Linear(4096, 4096),
                nn.ReLU(True),
                nn.Dropout(),
                nn.Linear(4096, num_classes),
            )

    def forward(self, x):
        """Run all stages, collecting outputs at the requested indices."""
        outs = []
        vgg_layers = getattr(self, self.module_name)
        for i in range(len(self.stage_blocks)):
            for j in range(*self.range_sub_modules[i]):
                vgg_layer = vgg_layers[j]
                x = vgg_layer(x)
            if i in self.out_indices:
                outs.append(x)
        if self.num_classes > 0:
            x = x.view(x.size(0), -1)
            x = self.classifier(x)
            outs.append(x)
        if len(outs) == 1:
            return outs[0]
        else:
            return tuple(outs)

    def _freeze_stages(self):
        """Freeze parameters of the first ``frozen_stages`` stages."""
        vgg_layers = getattr(self, self.module_name)
        for i in range(self.frozen_stages):
            for j in range(*self.range_sub_modules[i]):
                m = vgg_layers[j]
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def train(self, mode=True):
        """Switch train/eval mode, re-freezing stages and norms as needed."""
        super(VGG, self).train(mode)
        self._freeze_stages()
        if mode and self.norm_eval:
            for m in self.modules():
                # trick: eval have effect on BatchNorm only
                if isinstance(m, _BatchNorm):
                    m.eval()
import torch
import torch.nn as nn
from mmcv.cnn import (build_activation_layer, build_conv_layer,
build_norm_layer, kaiming_init)
from ..builder import BACKBONES
from ..utils import to_2tuple
from .base_backbone import BaseBackbone
class FFN(nn.Module):
    """Feed-forward network (FFN) with an optional residual connection.

    Args:
        embed_dims (int): The feature dimension. Same as
            `MultiheadAttention`.
        feedforward_channels (int): The hidden dimension of FFNs.
        num_fcs (int, optional): The number of fully-connected layers in
            FFNs. Defaults to 2.
        act_cfg (dict, optional): The activation config for FFNs.
        dropout (float, optional): Probability of an element to be
            zeroed. Default 0.0.
        add_residual (bool, optional): Add residual connection.
            Defaults to True.
    """

    def __init__(self,
                 embed_dims,
                 feedforward_channels,
                 num_fcs=2,
                 act_cfg=dict(type='GELU'),
                 dropout=0.0,
                 add_residual=True):
        super(FFN, self).__init__()
        assert num_fcs >= 2, 'num_fcs should be no less ' \
            f'than 2. got {num_fcs}.'
        self.embed_dims = embed_dims
        self.feedforward_channels = feedforward_channels
        self.num_fcs = num_fcs
        self.act_cfg = act_cfg
        # One shared activation module reused by every hidden layer.
        self.activate = build_activation_layer(act_cfg)

        # num_fcs - 1 hidden (Linear -> act -> Dropout) blocks followed by a
        # final projection back to embed_dims.
        fcs = []
        dim = embed_dims
        for _ in range(num_fcs - 1):
            fcs.append(
                nn.Sequential(
                    nn.Linear(dim, feedforward_channels), self.activate,
                    nn.Dropout(dropout)))
            dim = feedforward_channels
        fcs.append(nn.Linear(feedforward_channels, embed_dims))
        self.layers = nn.Sequential(*fcs)
        self.dropout = nn.Dropout(dropout)
        self.add_residual = add_residual
        self.init_weights()

    def init_weights(self):
        """Initialize the Linear layers (sync with ClassyVision)."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # xavier_init(m, distribution='uniform')
                # Bias init is different from our API
                # therefore initialize them separately
                # The initialization is sync with ClassyVision
                nn.init.xavier_normal_(m.weight)
                nn.init.normal_(m.bias, std=1e-6)

    def forward(self, x, residual=None):
        """Forward function for `FFN`."""
        out = self.layers(x)
        if not self.add_residual:
            return out
        return (x if residual is None else residual) + self.dropout(out)

    def __repr__(self):
        """str: a string that describes the module"""
        repr_str = self.__class__.__name__
        repr_str += f'(embed_dims={self.embed_dims}, '
        repr_str += f'feedforward_channels={self.feedforward_channels}, '
        repr_str += f'num_fcs={self.num_fcs}, '
        repr_str += f'act_cfg={self.act_cfg}, '
        repr_str += f'dropout={self.dropout}, '
        repr_str += f'add_residual={self.add_residual})'
        return repr_str
class MultiheadAttention(nn.Module):
    """A wrapper for ``torch.nn.MultiheadAttention`` with a residual add.

    Args:
        embed_dims (int): The embedding dimension.
        num_heads (int): Parallel attention heads. Same as
            `nn.MultiheadAttention`.
        attn_drop (float): A Dropout layer on attn_output_weights. Default 0.0.
        proj_drop (float): The drop out rate after attention. Default 0.0.
    """

    def __init__(self, embed_dims, num_heads, attn_drop=0.0, proj_drop=0.0):
        super(MultiheadAttention, self).__init__()
        assert embed_dims % num_heads == 0, 'embed_dims must be ' \
            f'divisible by num_heads. got {embed_dims} and {num_heads}.'
        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop)
        self.dropout = nn.Dropout(proj_drop)

    def forward(self,
                x,
                key=None,
                value=None,
                residual=None,
                query_pos=None,
                key_pos=None,
                attn_mask=None,
                key_padding_mask=None):
        """Forward function for `MultiheadAttention`.

        Args:
            x (Tensor): The input query with shape [num_query, bs,
                embed_dims]. Same in `nn.MultiheadAttention.forward`.
            key (Tensor): The key tensor with shape [num_key, bs,
                embed_dims]. Default None. If None, the query is used.
            value (Tensor): The value tensor with same shape as `key`.
                Default None. If None, the key is used.
            residual (Tensor): The tensor used for addition, with the
                same shape as `x`. Default None. If None, `x` is used.
            query_pos (Tensor): Positional encoding added to the query.
                Default None.
            key_pos (Tensor): Positional encoding added to the key.
                Default None. If None and `query_pos` matches the key's
                shape, `query_pos` is reused for the key.
            attn_mask (Tensor): ByteTensor mask with shape [num_query,
                num_key]. Same in `nn.MultiheadAttention.forward`.
                Default None.
            key_padding_mask (Tensor): ByteTensor with shape [bs, num_key].
                Same in `nn.MultiheadAttention.forward`. Default None.

        Returns:
            Tensor: forwarded results with shape [num_query, bs, embed_dims].
        """
        # Resolve the defaults: self-attention unless key/value are given.
        query = x
        key = query if key is None else key
        value = key if value is None else value
        residual = x if residual is None else residual
        # Reuse the query's positional encoding for the key when shapes agree.
        if key_pos is None and query_pos is not None and key is not None \
                and query_pos.shape == key.shape:
            key_pos = query_pos
        if query_pos is not None:
            query = query + query_pos
        if key_pos is not None:
            key = key + key_pos
        attn_out = self.attn(
            query,
            key,
            value=value,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask)[0]
        return residual + self.dropout(attn_out)
class TransformerEncoderLayer(nn.Module):
    """Implements one encoder layer in Vision Transformer.

    Pre-norm layout: norm1 -> attention (+residual), norm2 -> FFN (+residual).

    Args:
        embed_dims (int): The feature dimension. Same as `FFN`.
        num_heads (int): Parallel attention heads.
        feedforward_channels (int): The hidden dimension for FFNs.
        attn_drop (float): The drop out rate for attention layer.
            Default 0.0.
        proj_drop (float): Probability of an element to be zeroed
            after the feed forward layer. Default 0.0.
        act_cfg (dict): The activation config for FFNs. Default GELU.
        norm_cfg (dict): Config dict for normalization layer. Default
            layer normalization.
        num_fcs (int): The number of fully-connected layers for FFNs.
            Default 2.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 feedforward_channels,
                 attn_drop=0.,
                 proj_drop=0.,
                 act_cfg=dict(type='GELU'),
                 norm_cfg=dict(type='LN'),
                 num_fcs=2):
        super(TransformerEncoderLayer, self).__init__()
        # Norms are registered under build_norm_layer's generated names
        # (e.g. 'ln1'/'ln2') so pretrained state dicts map correctly; the
        # norm1/norm2 properties below resolve them back by name.
        self.norm1_name, norm1 = build_norm_layer(
            norm_cfg, embed_dims, postfix=1)
        self.add_module(self.norm1_name, norm1)
        self.attn = MultiheadAttention(
            embed_dims,
            num_heads=num_heads,
            attn_drop=attn_drop,
            proj_drop=proj_drop)
        self.norm2_name, norm2 = build_norm_layer(
            norm_cfg, embed_dims, postfix=2)
        self.add_module(self.norm2_name, norm2)
        self.mlp = FFN(embed_dims, feedforward_channels, num_fcs, act_cfg,
                       proj_drop)

    @property
    def norm1(self):
        # Resolve the dynamically named first norm layer.
        return getattr(self, self.norm1_name)

    @property
    def norm2(self):
        # Resolve the dynamically named second norm layer.
        return getattr(self, self.norm2_name)

    def forward(self, x):
        norm_x = self.norm1(x)
        # Reason for permute: as the shape of input from pretrained weight
        # from pytorch-image-models is [batch_size, num_query, embed_dim],
        # but the one from nn.MultiheadAttention is
        # [num_query, batch_size, embed_dim]
        x = x.permute(1, 0, 2)
        norm_x = norm_x.permute(1, 0, 2)
        x = self.attn(norm_x, residual=x)
        # Convert the shape back to [batch_size, num_query, embed_dim] in
        # order to make use of the pretrained weight
        x = x.permute(1, 0, 2)
        x = self.mlp(self.norm2(x), residual=x)
        return x
class PatchEmbed(nn.Module):
    """Image to Patch Embedding.

    Splits the image into non-overlapping patches via a strided convolution
    and flattens them into a token sequence.

    Args:
        img_size (int | tuple): The size of input image.
        patch_size (int): The size of one patch.
        in_channels (int): The num of input channels.
        embed_dim (int): The dimensions of embedding.
        conv_cfg (dict | None): The config dict for conv layers.
            Default: None.
    """

    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 in_channels=3,
                 embed_dim=768,
                 conv_cfg=None):
        super(PatchEmbed, self).__init__()
        # Normalize img_size to an (H, W) pair.
        if isinstance(img_size, int):
            img_size = to_2tuple(img_size)
        elif isinstance(img_size, tuple) and len(img_size) == 1:
            img_size = to_2tuple(img_size[0])
        assert len(img_size) == 2, \
            f'The size of image should have length 1 or 2, ' \
            f'but got {len(img_size)}'
        self.img_size = img_size
        self.patch_size = to_2tuple(patch_size)

        # Patch grid dimensions; the divisibility assert below rejects
        # image sizes that are not exact multiples of the patch size.
        grid_h = self.img_size[0] // self.patch_size[0]
        grid_w = self.img_size[1] // self.patch_size[1]
        num_patches = grid_w * grid_h
        assert num_patches * self.patch_size[0] * self.patch_size[1] == \
            self.img_size[0] * self.img_size[1], \
            'The image size H*W must be divisible by patch size'
        self.num_patches = num_patches

        # Use conv layer to embed: kernel == stride == patch size, so each
        # output position is one patch's embedding.
        self.projection = build_conv_layer(
            conv_cfg,
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size)

        self.init_weights()

    def init_weights(self):
        # Lecun norm from ClassyVision
        kaiming_init(self.projection, mode='fan_in', nonlinearity='linear')

    def forward(self, x):
        B, C, H, W = x.shape
        # FIXME look at relaxing size constraints
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't " \
            f'match model ({self.img_size[0]}*{self.img_size[1]}).'
        # (B, C, H, W) -> (B, D, H/P, W/P) -> (B, N, D) with N = H*W/P/P.
        tokens = self.projection(x).flatten(2).transpose(1, 2)
        return tokens
class HybridEmbed(nn.Module):
    """CNN Feature Map Embedding.

    Extract feature map from CNN, flatten, project to embedding dim.

    Args:
        backbone (nn.Module): CNN backbone producing the feature map.
        img_size (int | tuple): The size of input image. Default: 224.
        feature_size (tuple, optional): Spatial size of the backbone's
            output feature map. If None, it is probed with a dummy
            forward pass. Default: None.
        in_channels (int): The num of input channels. Default: 3.
        embed_dim (int): The dimensions of embedding. Default: 768.
        conv_cfg (dict | None): The config dict for the projection conv
            layer. Default: None.
    """

    def __init__(self,
                 backbone,
                 img_size=224,
                 feature_size=None,
                 in_channels=3,
                 embed_dim=768,
                 conv_cfg=None):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        # Normalize img_size to an (H, W) pair.
        if isinstance(img_size, int):
            img_size = to_2tuple(img_size)
        elif isinstance(img_size, tuple):
            if len(img_size) == 1:
                img_size = to_2tuple(img_size[0])
        assert len(img_size) == 2, \
            f'The size of image should have length 1 or 2, ' \
            f'but got {len(img_size)}'
        self.img_size = img_size
        self.backbone = backbone
        if feature_size is None:
            with torch.no_grad():
                # FIXME this is hacky, but most reliable way of
                # determining the exact dim of the output feature
                # map for all networks, the feature metadata has
                # reliable channel and stride info, but using
                # stride to calc feature dim requires info about padding of
                # each stage that isn't captured.
                # Probe in eval mode so e.g. BatchNorm stats are untouched,
                # then restore the backbone's original training mode.
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(
                    torch.zeros(1, in_channels, img_size[0], img_size[1]))
                if isinstance(o, (list, tuple)):
                    # last feature if backbone outputs list/tuple of features
                    o = o[-1]
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            # Prefer timm-style feature metadata when the backbone has it.
            if hasattr(self.backbone, 'feature_info'):
                feature_dim = self.backbone.feature_info.channels()[-1]
            else:
                feature_dim = self.backbone.num_features
        self.num_patches = feature_size[0] * feature_size[1]
        # Use conv layer to embed (1x1 projection to embed_dim).
        self.projection = build_conv_layer(
            conv_cfg, feature_dim, embed_dim, kernel_size=1, stride=1)

        self.init_weights()

    def init_weights(self):
        # Lecun norm from ClassyVision
        kaiming_init(self.projection, mode='fan_in', nonlinearity='linear')

    def forward(self, x):
        x = self.backbone(x)
        if isinstance(x, (list, tuple)):
            # last feature if backbone outputs list/tuple of features
            x = x[-1]
        # (B, C, H, W) -> (B, N, D): flatten the spatial grid into tokens.
        x = self.projection(x).flatten(2).transpose(1, 2)
        return x
@BACKBONES.register_module()
class VisionTransformer(BaseBackbone):
    """Vision Transformer.

    A PyTorch impl of : `An Image is Worth 16x16 Words:
    Transformers for Image Recognition at Scale` -
    https://arxiv.org/abs/2010.11929

    Args:
        num_layers (int): Depth of transformer. Default 12.
        embed_dim (int): Embedding dimension. Default 768.
        num_heads (int): Number of attention heads. Default 12.
        img_size (int | tuple): Input image size. Default 224.
        patch_size (int | tuple): The patch size. Default 16.
        in_channels (int): Number of input channels. Default 3.
        feedforward_channels (int): The hidden dimension for FFNs.
            Default 3072.
        drop_rate (float): Probability of an element to be zeroed.
            Default 0.0.
        attn_drop_rate (float): The drop out rate for attention layer.
            Default 0.0.
        hybrid_backbone (nn.Module): CNN backbone to use in-place of
            PatchEmbed module. Default None.
        norm_cfg (dict): Config dict for normalization layer. Default
            layer normalization.
        act_cfg (dict): The activation config for FFNs. Default GELU.
        num_fcs (int): The number of fully-connected layers for FFNs.
            Default 2.
    """

    def __init__(self,
                 num_layers=12,
                 embed_dim=768,
                 num_heads=12,
                 img_size=224,
                 patch_size=16,
                 in_channels=3,
                 feedforward_channels=3072,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 hybrid_backbone=None,
                 norm_cfg=dict(type='LN'),
                 act_cfg=dict(type='GELU'),
                 num_fcs=2):
        super(VisionTransformer, self).__init__()
        self.embed_dim = embed_dim
        # Patch embedding: a CNN hybrid stem when given, otherwise plain
        # patch projection.
        if hybrid_backbone is not None:
            self.patch_embed = HybridEmbed(
                hybrid_backbone,
                img_size=img_size,
                in_channels=in_channels,
                embed_dim=embed_dim)
        else:
            self.patch_embed = PatchEmbed(
                img_size=img_size,
                patch_size=patch_size,
                in_channels=in_channels,
                embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches
        # Learnable class token and position embeddings (+1 slot for the
        # class token).
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(
            torch.zeros(1, num_patches + 1, embed_dim))
        self.drop_after_pos = nn.Dropout(p=drop_rate)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(
                TransformerEncoderLayer(
                    embed_dim,
                    num_heads,
                    feedforward_channels,
                    attn_drop=attn_drop_rate,
                    proj_drop=drop_rate,
                    act_cfg=act_cfg,
                    norm_cfg=norm_cfg,
                    num_fcs=num_fcs))
        self.norm1_name, norm1 = build_norm_layer(
            norm_cfg, embed_dim, postfix=1)
        self.add_module(self.norm1_name, norm1)
        self.init_weights()

    def init_weights(self):
        super(VisionTransformer, self).init_weights()
        # Position embeddings are initialized with a small normal
        # distribution; the class token keeps its zero init.
        nn.init.normal_(self.pos_embed, std=0.02)

    @property
    def norm1(self):
        # The final norm layer is registered under a name generated by
        # `build_norm_layer`; resolve it dynamically.
        return getattr(self, self.norm1_name)

    def forward(self, x):
        """Forward: (N, C, H, W) -> (N, embed_dim) class-token feature."""
        B = x.shape[0]
        x = self.patch_embed(x)
        cls_tokens = self.cls_token.expand(
            B, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        x = self.drop_after_pos(x)
        for layer in self.layers:
            x = layer(x)
        # Return only the normalized class-token feature.
        x = self.norm1(x)[:, 0]
        return x
from mmcv.cnn import MODELS as MMCV_MODELS
from mmcv.utils import Registry
# Root registry for all mmcls model components; a child of mmcv's MODELS
# registry so cross-library config builds can resolve mmcls modules.
MODELS = Registry('models', parent=MMCV_MODELS)

# All component kinds share the same underlying registry; the aliases only
# express intent at registration/build sites.
BACKBONES = MODELS
NECKS = MODELS
HEADS = MODELS
LOSSES = MODELS
CLASSIFIERS = MODELS
def build_backbone(cfg):
    """Build a backbone module from its config dict.

    Args:
        cfg (dict): Config of the backbone to construct.

    Returns:
        The backbone module built by the ``BACKBONES`` registry.
    """
    backbone = BACKBONES.build(cfg)
    return backbone
def build_neck(cfg):
    """Build a neck module from its config dict.

    Args:
        cfg (dict): Config of the neck to construct.

    Returns:
        The neck module built by the ``NECKS`` registry.
    """
    neck = NECKS.build(cfg)
    return neck
def build_head(cfg):
    """Build a head module from its config dict.

    Args:
        cfg (dict): Config of the head to construct.

    Returns:
        The head module built by the ``HEADS`` registry.
    """
    head = HEADS.build(cfg)
    return head
def build_loss(cfg):
    """Build a loss module from its config dict.

    Args:
        cfg (dict): Config of the loss to construct.

    Returns:
        The loss module built by the ``LOSSES`` registry.
    """
    loss = LOSSES.build(cfg)
    return loss
def build_classifier(cfg):
    """Build a classifier from its config dict.

    Args:
        cfg (dict): Config of the classifier to construct.

    Returns:
        The classifier built by the ``CLASSIFIERS`` registry.
    """
    classifier = CLASSIFIERS.build(cfg)
    return classifier
from .base import BaseClassifier
from .image import ImageClassifier
__all__ = ['BaseClassifier', 'ImageClassifier']
import warnings
from abc import ABCMeta, abstractmethod
from collections import OrderedDict
import cv2
import mmcv
import torch
import torch.distributed as dist
from mmcv import color_val
from mmcv.runner import BaseModule
# TODO import `auto_fp16` from mmcv and delete them from mmcls
try:
from mmcv.runner import auto_fp16
except ImportError:
warnings.warn('auto_fp16 from mmcls will be deprecated.'
'Please install mmcv>=1.1.4.')
from mmcls.core import auto_fp16
class BaseClassifier(BaseModule, metaclass=ABCMeta):
    """Base class for classifiers.

    Defines the train/test forward dispatch, loss parsing/logging, and
    result visualization shared by concrete classifiers.
    """

    def __init__(self, init_cfg=None):
        super(BaseClassifier, self).__init__(init_cfg)
        # Toggled externally when fp16 (mixed-precision) training is enabled.
        self.fp16_enabled = False

    @property
    def with_neck(self):
        # Whether a neck module has been attached by a subclass.
        return hasattr(self, 'neck') and self.neck is not None

    @property
    def with_head(self):
        # Whether a head module has been attached by a subclass.
        return hasattr(self, 'head') and self.head is not None

    @abstractmethod
    def extract_feat(self, imgs):
        """Extract features from a batch of images."""
        pass

    def extract_feats(self, imgs):
        """Yield features for each image batch in the list ``imgs``."""
        assert isinstance(imgs, list)
        for img in imgs:
            yield self.extract_feat(img)

    @abstractmethod
    def forward_train(self, imgs, **kwargs):
        """
        Args:
            img (list[Tensor]): List of tensors of shape (1, C, H, W).
                Typically these should be mean centered and std scaled.
            kwargs (keyword arguments): Specific to concrete implementation.
        """
        pass

    @abstractmethod
    def simple_test(self, img, **kwargs):
        """Test without test-time augmentation."""
        pass

    def forward_test(self, imgs, **kwargs):
        """
        Args:
            imgs (List[Tensor]): the outer list indicates test-time
                augmentations and inner Tensor should have a shape NxCxHxW,
                which contains all images in the batch.
        """
        # Accept a bare tensor for convenience and wrap it as a 1-aug list.
        if isinstance(imgs, torch.Tensor):
            imgs = [imgs]
        for var, name in [(imgs, 'imgs')]:
            if not isinstance(var, list):
                raise TypeError(f'{name} must be a list, but got {type(var)}')
        if len(imgs) == 1:
            return self.simple_test(imgs[0], **kwargs)
        else:
            # Multi-augmentation testing is not supported here.
            raise NotImplementedError('aug_test has not been implemented')

    @auto_fp16(apply_to=('img', ))
    def forward(self, img, return_loss=True, **kwargs):
        """Calls either forward_train or forward_test depending on whether
        return_loss=True.

        Note this setting will change the expected inputs. When
        `return_loss=True`, img and img_meta are single-nested (i.e. Tensor and
        List[dict]), and when `return_loss=False`, img and img_meta should be
        double nested (i.e. List[Tensor], List[List[dict]]), with the outer
        list indicating test time augmentations.
        """
        if return_loss:
            return self.forward_train(img, **kwargs)
        else:
            return self.forward_test(img, **kwargs)

    def _parse_losses(self, losses):
        """Reduce raw loss outputs into a total loss and loggable scalars.

        Args:
            losses (dict): Mapping from loss name to a Tensor, a list of
                Tensors, or a dict of already-computed values (e.g. accuracy).

        Returns:
            tuple: ``(loss, log_vars)`` where ``loss`` is the total loss
            tensor for back propagation and ``log_vars`` maps names to
            Python scalars for logging.
        """
        log_vars = OrderedDict()
        for loss_name, loss_value in losses.items():
            if isinstance(loss_value, torch.Tensor):
                log_vars[loss_name] = loss_value.mean()
            elif isinstance(loss_value, list):
                log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
            elif isinstance(loss_value, dict):
                # Flatten nested dicts (e.g. per-topk accuracy) into log_vars.
                for name, value in loss_value.items():
                    log_vars[name] = value
            else:
                raise TypeError(
                    f'{loss_name} is not a tensor or list of tensors')
        # Only entries whose key contains 'loss' contribute to the total.
        loss = sum(_value for _key, _value in log_vars.items()
                   if 'loss' in _key)
        log_vars['loss'] = loss
        for loss_name, loss_value in log_vars.items():
            # reduce loss when distributed training
            if dist.is_available() and dist.is_initialized():
                loss_value = loss_value.data.clone()
                dist.all_reduce(loss_value.div_(dist.get_world_size()))
            log_vars[loss_name] = loss_value.item()
        return loss, log_vars

    def train_step(self, data, optimizer):
        """The iteration step during training.

        This method defines an iteration step during training, except for the
        back propagation and optimizer updating, which are done in an optimizer
        hook. Note that in some complicated cases or models, the whole process
        including back propagation and optimizer updating are also defined in
        this method, such as GAN.

        Args:
            data (dict): The output of dataloader.
            optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
                runner is passed to ``train_step()``. This argument is unused
                and reserved.

        Returns:
            dict: It should contain at least 3 keys: ``loss``, ``log_vars``,
                ``num_samples``.
                ``loss`` is a tensor for back propagation, which can be a
                weighted sum of multiple losses.
                ``log_vars`` contains all the variables to be sent to the
                logger.
                ``num_samples`` indicates the batch size (when the model is
                DDP, it means the batch size on each GPU), which is used for
                averaging the logs.
        """
        losses = self(**data)
        loss, log_vars = self._parse_losses(losses)
        outputs = dict(
            loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))
        return outputs

    def val_step(self, data, optimizer):
        """The iteration step during validation.

        This method shares the same signature as :func:`train_step`, but used
        during val epochs. Note that the evaluation after training epochs is
        not implemented with this method, but an evaluation hook.
        """
        losses = self(**data)
        loss, log_vars = self._parse_losses(losses)
        outputs = dict(
            loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))
        return outputs

    def show_result(self,
                    img,
                    result,
                    text_color='green',
                    font_scale=0.5,
                    row_width=20,
                    show=False,
                    win_name='',
                    wait_time=0,
                    out_file=None):
        """Draw `result` over `img`.

        Args:
            img (str or Tensor): The image to be displayed.
            result (Tensor): The classification results to draw over `img`.
            text_color (str or tuple or :obj:`Color`): Color of texts.
            font_scale (float): Font scales of texts.
            row_width (int): width between each row of results on the image.
            show (bool): Whether to show the image.
                Default: False.
            win_name (str): The window name.
            wait_time (int): Value of waitKey param.
                Default: 0.
            out_file (str or None): The filename to write the image.
                Default: None.

        Returns:
            img (Tensor): Only if not `show` or `out_file`
        """
        img = mmcv.imread(img)
        img = img.copy()
        # write results on left-top of the image
        x, y = 0, row_width
        text_color = color_val(text_color)
        for k, v in result.items():
            if isinstance(v, float):
                v = f'{v:.2f}'
            label_text = f'{k}: {v}'
            cv2.putText(img, label_text, (x, y), cv2.FONT_HERSHEY_COMPLEX,
                        font_scale, text_color)
            y += row_width
        # if out_file specified, do not show image in window
        if out_file is not None:
            show = False
        if show:
            mmcv.imshow(img, win_name, wait_time)
        if out_file is not None:
            mmcv.imwrite(img, out_file)
        if not (show or out_file):
            warnings.warn('show==False and out_file is not specified, only '
                          'result image will be returned')
            return img
import copy
import warnings
from ..builder import CLASSIFIERS, build_backbone, build_head, build_neck
from ..utils.augment import Augments
from .base import BaseClassifier
@CLASSIFIERS.register_module()
class ImageClassifier(BaseClassifier):
    """Image classifier composed of a backbone, an optional neck and a head.

    Args:
        backbone (dict): Config of the feature-extraction backbone.
        neck (dict, optional): Config of the neck. Default: None.
        head (dict, optional): Config of the classification head.
            Default: None.
        pretrained (str, optional): Deprecated; use ``init_cfg`` with
            ``type='Pretrained'`` instead. Default: None.
        train_cfg (dict, optional): Training config; may contain an
            ``augments`` entry (or legacy ``mixup``/``cutmix`` entries) for
            batch augmentation. Default: None.
        init_cfg (dict, optional): Weight-initialization config passed to
            :class:`BaseClassifier`. Default: None.
    """

    def __init__(self,
                 backbone,
                 neck=None,
                 head=None,
                 pretrained=None,
                 train_cfg=None,
                 init_cfg=None):
        super(ImageClassifier, self).__init__(init_cfg)
        if pretrained is not None:
            # Map the deprecated `pretrained` argument onto init_cfg.
            warnings.warn('DeprecationWarning: pretrained is a deprecated \
                key, please consider using init_cfg')
            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
        self.backbone = build_backbone(backbone)
        if neck is not None:
            self.neck = build_neck(neck)
        if head is not None:
            self.head = build_head(head)

        self.augments = None
        if train_cfg is not None:
            augments_cfg = train_cfg.get('augments', None)
            if augments_cfg is not None:
                self.augments = Augments(augments_cfg)
            else:
                # Considering BC-breaking
                # Legacy configs used separate `mixup`/`cutmix` keys; convert
                # them to the unified Augments config.
                mixup_cfg = train_cfg.get('mixup', None)
                cutmix_cfg = train_cfg.get('cutmix', None)
                assert mixup_cfg is None or cutmix_cfg is None, \
                    'If mixup and cutmix are set simultaneously,' \
                    'use augments instead.'
                if mixup_cfg is not None:
                    warnings.warn('The mixup attribute will be deprecated. '
                                  'Please use augments instead.')
                    cfg = copy.deepcopy(mixup_cfg)
                    cfg['type'] = 'BatchMixup'
                    # In the previous version, mixup_prob is always 1.0.
                    cfg['prob'] = 1.0
                    self.augments = Augments(cfg)
                if cutmix_cfg is not None:
                    warnings.warn('The cutmix attribute will be deprecated. '
                                  'Please use augments instead.')
                    cfg = copy.deepcopy(cutmix_cfg)
                    cutmix_prob = cfg.pop('cutmix_prob')
                    cfg['type'] = 'BatchCutMix'
                    cfg['prob'] = cutmix_prob
                    self.augments = Augments(cfg)

    def extract_feat(self, img):
        """Directly extract features from the backbone + neck."""
        x = self.backbone(img)
        if self.with_neck:
            x = self.neck(x)
        return x

    def forward_train(self, img, gt_label, **kwargs):
        """Forward computation during training.

        Args:
            img (Tensor): of shape (N, C, H, W) encoding input images.
                Typically these should be mean centered and std scaled.
            gt_label (Tensor): It should be of shape (N, 1) encoding the
                ground-truth label of input images for single label task. It
                should be of shape (N, C) encoding the ground-truth label
                of input images for multi-labels task.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        # Batch augmentation (Mixup/CutMix) may transform both images and
        # labels before feature extraction.
        if self.augments is not None:
            img, gt_label = self.augments(img, gt_label)
        x = self.extract_feat(img)

        losses = dict()
        loss = self.head.forward_train(x, gt_label)
        losses.update(loss)
        return losses

    def simple_test(self, img, img_metas):
        """Test without augmentation."""
        x = self.extract_feat(img)
        return self.head.simple_test(x)
from .cls_head import ClsHead
from .linear_head import LinearClsHead
from .multi_label_head import MultiLabelClsHead
from .multi_label_linear_head import MultiLabelLinearClsHead
from .vision_transformer_head import VisionTransformerClsHead
__all__ = [
'ClsHead', 'LinearClsHead', 'MultiLabelClsHead', 'MultiLabelLinearClsHead',
'VisionTransformerClsHead'
]
from abc import ABCMeta, abstractmethod
from mmcv.runner import BaseModule
class BaseHead(BaseModule, metaclass=ABCMeta):
    """Abstract base class for classification heads.

    Subclasses must implement :meth:`forward_train`.
    """

    def __init__(self, init_cfg=None):
        super(BaseHead, self).__init__(init_cfg)

    @abstractmethod
    def forward_train(self, x, gt_label, **kwargs):
        """Compute losses from features and ground-truth labels."""
import torch
import torch.nn.functional as F
from mmcls.models.losses import Accuracy
from ..builder import HEADS, build_loss
from .base_head import BaseHead
@HEADS.register_module()
class ClsHead(BaseHead):
    """Classification head.

    Args:
        loss (dict): Config of classification loss.
        topk (int | tuple): Top-k accuracy.
        cal_acc (bool): Whether to calculate accuracy during training.
            If you use Mixup/CutMix or something like that during training,
            it is not reasonable to calculate accuracy. Defaults to False.
    """

    def __init__(self,
                 loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
                 topk=(1, ),
                 cal_acc=False,
                 init_cfg=None):
        super(ClsHead, self).__init__(init_cfg=init_cfg)

        assert isinstance(loss, dict)
        assert isinstance(topk, (int, tuple))
        # Normalize a bare int into a 1-tuple.
        if isinstance(topk, int):
            topk = (topk, )
        for k in topk:
            assert k > 0, 'Top-k should be larger than 0'
        self.topk = topk

        self.compute_loss = build_loss(loss)
        self.compute_accuracy = Accuracy(topk=self.topk)
        self.cal_acc = cal_acc

    def loss(self, cls_score, gt_label):
        """Compute the classification loss (and optionally accuracy)."""
        losses = dict()
        # Average the loss over the batch size.
        loss = self.compute_loss(
            cls_score, gt_label, avg_factor=len(cls_score))
        if self.cal_acc:
            acc = self.compute_accuracy(cls_score, gt_label)
            assert len(acc) == len(self.topk)
            losses['accuracy'] = dict(
                (f'top-{k}', a) for k, a in zip(self.topk, acc))
        losses['loss'] = loss
        return losses

    def forward_train(self, cls_score, gt_label):
        return self.loss(cls_score, gt_label)

    def simple_test(self, cls_score):
        """Test without augmentation."""
        # Average scores over test-time augmentations, if any.
        if isinstance(cls_score, list):
            cls_score = sum(cls_score) / float(len(cls_score))
        if cls_score is None:
            pred = None
        else:
            pred = F.softmax(cls_score, dim=1)
        on_trace = hasattr(torch.jit, 'is_tracing') and torch.jit.is_tracing()
        if torch.onnx.is_in_onnx_export() or on_trace:
            # Keep the output a tensor during ONNX export / tracing.
            return pred
        return list(pred.detach().cpu().numpy())
import torch
import torch.nn as nn
import torch.nn.functional as F
from ..builder import HEADS
from .cls_head import ClsHead
@HEADS.register_module()
class LinearClsHead(ClsHead):
    """Linear classifier head.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (int): Number of channels in the input feature map.
    """

    def __init__(self,
                 num_classes,
                 in_channels,
                 init_cfg=None,
                 *args,
                 **kwargs):
        # Default initialization: small normal weights on the fc layer.
        default_init = dict(
            type='Normal',
            mean=0.,
            std=0.01,
            bias=0.,
            override=dict(name='fc'))
        init_cfg = init_cfg or default_init
        super(LinearClsHead, self).__init__(init_cfg=init_cfg, *args, **kwargs)

        self.in_channels = in_channels
        self.num_classes = num_classes
        if self.num_classes <= 0:
            raise ValueError(
                f'num_classes={num_classes} must be a positive integer')

        self._init_layers()

    def _init_layers(self):
        # Single linear projection from features to class logits.
        self.fc = nn.Linear(self.in_channels, self.num_classes)

    def simple_test(self, img):
        """Test without augmentation."""
        cls_score = self.fc(img)
        # Average scores over test-time augmentations, if any.
        if isinstance(cls_score, list):
            cls_score = sum(cls_score) / float(len(cls_score))
        if cls_score is None:
            pred = None
        else:
            pred = F.softmax(cls_score, dim=1)
        on_trace = hasattr(torch.jit, 'is_tracing') and torch.jit.is_tracing()
        if torch.onnx.is_in_onnx_export() or on_trace:
            # Keep the output a tensor during ONNX export / tracing.
            return pred
        return list(pred.detach().cpu().numpy())

    def forward_train(self, x, gt_label):
        cls_score = self.fc(x)
        return self.loss(cls_score, gt_label)
import torch
import torch.nn.functional as F
from ..builder import HEADS, build_loss
from .base_head import BaseHead
@HEADS.register_module()
class MultiLabelClsHead(BaseHead):
    """Classification head for multilabel task.

    Args:
        loss (dict): Config of classification loss.
    """

    def __init__(self,
                 loss=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=True,
                     reduction='mean',
                     loss_weight=1.0),
                 init_cfg=None):
        super(MultiLabelClsHead, self).__init__(init_cfg=init_cfg)

        assert isinstance(loss, dict)
        self.compute_loss = build_loss(loss)

    def loss(self, cls_score, gt_label):
        """Compute the multi-label loss averaged over the batch."""
        gt_label = gt_label.type_as(cls_score)
        num_samples = len(cls_score)
        losses = dict()

        # map difficult examples to positive ones
        _gt_label = torch.abs(gt_label)
        # compute loss
        loss = self.compute_loss(cls_score, _gt_label, avg_factor=num_samples)
        losses['loss'] = loss
        return losses

    def forward_train(self, cls_score, gt_label):
        gt_label = gt_label.type_as(cls_score)
        losses = self.loss(cls_score, gt_label)
        return losses

    def simple_test(self, cls_score):
        """Test without augmentation; returns per-class probabilities."""
        if isinstance(cls_score, list):
            cls_score = sum(cls_score) / float(len(cls_score))
        # `F.sigmoid` is deprecated since PyTorch 0.4.1 (emits a warning);
        # `torch.sigmoid` is the supported, numerically identical call.
        pred = torch.sigmoid(cls_score) if cls_score is not None else None
        on_trace = hasattr(torch.jit, 'is_tracing') and torch.jit.is_tracing()
        if torch.onnx.is_in_onnx_export() or on_trace:
            # Keep the output a tensor during ONNX export / tracing.
            return pred
        pred = list(pred.detach().cpu().numpy())
        return pred
import torch
import torch.nn as nn
import torch.nn.functional as F
from ..builder import HEADS
from .multi_label_head import MultiLabelClsHead
@HEADS.register_module()
class MultiLabelLinearClsHead(MultiLabelClsHead):
    """Linear classification head for multilabel task.

    Args:
        num_classes (int): Number of categories.
        in_channels (int): Number of channels in the input feature map.
        loss (dict): Config of classification loss.
    """

    def __init__(self,
                 num_classes,
                 in_channels,
                 loss=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=True,
                     reduction='mean',
                     loss_weight=1.0),
                 init_cfg=dict(
                     type='Normal',
                     mean=0.,
                     std=0.01,
                     bias=0.,
                     override=dict(name='fc'))):
        super(MultiLabelLinearClsHead, self).__init__(
            loss=loss, init_cfg=init_cfg)

        if num_classes <= 0:
            raise ValueError(
                f'num_classes={num_classes} must be a positive integer')

        self.in_channels = in_channels
        self.num_classes = num_classes

        self._init_layers()

    def _init_layers(self):
        # Single linear projection producing one logit per class.
        self.fc = nn.Linear(self.in_channels, self.num_classes)

    def forward_train(self, x, gt_label):
        gt_label = gt_label.type_as(x)
        cls_score = self.fc(x)
        losses = self.loss(cls_score, gt_label)
        return losses

    def simple_test(self, img):
        """Test without augmentation; returns per-class probabilities."""
        cls_score = self.fc(img)
        if isinstance(cls_score, list):
            cls_score = sum(cls_score) / float(len(cls_score))
        # `F.sigmoid` is deprecated since PyTorch 0.4.1 (emits a warning);
        # `torch.sigmoid` is the supported, numerically identical call.
        pred = torch.sigmoid(cls_score) if cls_score is not None else None
        on_trace = hasattr(torch.jit, 'is_tracing') and torch.jit.is_tracing()
        if torch.onnx.is_in_onnx_export() or on_trace:
            # Keep the output a tensor during ONNX export / tracing.
            return pred
        pred = list(pred.detach().cpu().numpy())
        return pred
from collections import OrderedDict
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import build_activation_layer, constant_init, kaiming_init
from ..builder import HEADS
from .cls_head import ClsHead
@HEADS.register_module()
class VisionTransformerClsHead(ClsHead):
    """Vision Transformer classifier head.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (int): Number of channels in the input feature map.
        hidden_dim (int): Number of the dimensions for hidden layer. Only
            available during pre-training. Default None.
        act_cfg (dict): The activation config. Only available during
            pre-training. Default Tanh.
    """

    def __init__(self,
                 num_classes,
                 in_channels,
                 hidden_dim=None,
                 act_cfg=dict(type='Tanh'),
                 *args,
                 **kwargs):
        super(VisionTransformerClsHead, self).__init__(*args, **kwargs)
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim
        self.act_cfg = act_cfg

        if self.num_classes <= 0:
            raise ValueError(
                f'num_classes={num_classes} must be a positive integer')

        self._init_layers()

    def _init_layers(self):
        # Pre-training uses an extra hidden ("pre_logits") layer with an
        # activation; fine-tuning (hidden_dim=None) uses a single linear head.
        if self.hidden_dim is None:
            layers = [('head', nn.Linear(self.in_channels, self.num_classes))]
        else:
            layers = [
                ('pre_logits', nn.Linear(self.in_channels, self.hidden_dim)),
                ('act', build_activation_layer(self.act_cfg)),
                ('head', nn.Linear(self.hidden_dim, self.num_classes)),
            ]
        self.layers = nn.Sequential(OrderedDict(layers))

    def init_weights(self):
        super(VisionTransformerClsHead, self).init_weights()
        # Modified from ClassyVision
        if hasattr(self.layers, 'pre_logits'):
            # Lecun norm
            kaiming_init(
                self.layers.pre_logits, mode='fan_in', nonlinearity='linear')
        # The final head starts from zero weights and bias.
        constant_init(self.layers.head, 0)

    def simple_test(self, img):
        """Test without augmentation."""
        cls_score = self.layers(img)
        if isinstance(cls_score, list):
            cls_score = sum(cls_score) / float(len(cls_score))
        pred = F.softmax(cls_score, dim=1) if cls_score is not None else None
        on_trace = hasattr(torch.jit, 'is_tracing') and torch.jit.is_tracing()
        if torch.onnx.is_in_onnx_export() or on_trace:
            # Keep the output a tensor during ONNX export / tracing.
            return pred
        pred = list(pred.detach().cpu().numpy())
        return pred

    def forward_train(self, x, gt_label):
        cls_score = self.layers(x)
        losses = self.loss(cls_score, gt_label)
        return losses
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment