Commit b3d6785d, authored by myhloli
Browse files

refactor(ocr): remove unused code and simplify model architecture

- Remove unused imports and code
- Simplify model architecture by removing unnecessary components
- Update initialization and forward pass logic
- Rename variables for consistency
parent 3cb156f5
......@@ -2,7 +2,8 @@ import os, sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorchocr.modeling.common import Activation
from ..common import Activation
class ConvBNLayer(nn.Module):
......
import torch
from torch import nn
class MTB(nn.Module):
    """Optional two-stage strided convolution stem that flattens to a 3-D tensor.

    When ``cnn_num == 2`` two ``Conv2d -> ReLU -> BatchNorm2d`` stages are
    registered (3x3 kernel, stride 2, padding 1) with 32 and then 64 output
    channels. For any other ``cnn_num`` the inner ``nn.Sequential`` stays empty
    and ``forward`` returns whatever that empty container yields for the input.

    Args:
        cnn_num: Number of convolution stages; only the value 2 builds layers.
        in_channels: Channel count of the incoming images.
    """

    def __init__(self, cnn_num, in_channels):
        super().__init__()
        self.block = nn.Sequential()
        self.out_channels = in_channels
        self.cnn_num = cnn_num
        if cnn_num != 2:
            return

        prev_width = in_channels
        for idx in range(cnn_num):
            width = 32 * (2 ** idx)
            stage = (
                ('conv_{}'.format(idx),
                 nn.Conv2d(
                     in_channels=prev_width,
                     out_channels=width,
                     kernel_size=3,
                     stride=2,
                     padding=1)),
                ('relu_{}'.format(idx), nn.ReLU()),
                ('bn_{}'.format(idx), nn.BatchNorm2d(width)),
            )
            for layer_name, layer in stage:
                self.block.add_module(layer_name, layer)
            # The next stage consumes what this stage produced.
            prev_width = width

    def forward(self, images):
        """Run the stem; in the two-stage case merge the last two permuted axes."""
        feats = self.block(images)
        if self.cnn_num != 2:
            return feats
        # Swap axes 1 and 3, then fold the trailing two axes into one.
        feats = feats.permute(0, 3, 2, 1)
        dim0, dim1, dim2, dim3 = feats.shape
        return torch.reshape(feats, (dim0, dim1, dim2 * dim3))
"""
This code is adapted from:
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/layers/conv_layer.py
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/backbones/resnet31_ocr.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
# import paddle
# from paddle import ParamAttr
# import paddle.nn as nn
# import paddle.nn.functional as F
__all__ = ["ResNet31"]
def conv3x3(in_channel, out_channel, stride=1):
    """Return a bias-free 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_channel, out_channel, **conv_options)
class BasicBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers with an additive residual connection.

    Args:
        in_channels: Channels of the block input.
        channels: Channels produced by both convolutions.
        stride: Stride of the first convolution and of the projection conv.
        downsample: Only its truthiness is used. When truthy, a fresh
            1x1 conv + batch-norm projection is built for the residual path;
            otherwise the residual path is an empty ``nn.Sequential``.
    """

    expansion = 1

    def __init__(self, in_channels, channels, stride=1, downsample=False):
        super().__init__()
        self.conv1 = conv3x3(in_channels, channels, stride)
        self.bn1 = nn.BatchNorm2d(channels)
        self.relu = nn.ReLU()
        self.conv2 = conv3x3(channels, channels)
        self.bn2 = nn.BatchNorm2d(channels)

        projection = nn.Sequential()
        if downsample:
            widened = channels * self.expansion
            projection = nn.Sequential(
                nn.Conv2d(in_channels, widened, 1, stride, bias=False),
                nn.BatchNorm2d(widened),
            )
        self.downsample = projection
        self.stride = stride

    def forward(self, x):
        """Apply conv-bn-relu-conv-bn, add the (possibly projected) input, then ReLU."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # An empty Sequential is falsy, so the projection only runs when built.
        residual = self.downsample(x) if self.downsample else x
        out += residual
        return self.relu(out)
class ResNet31(nn.Module):
    '''
    Convolutional backbone: a two-conv stem followed by four stages, each made
    of an optional max-pool, a stack of ``BasicBlock`` and a conv-bn-relu.

    Args:
        in_channels (int): Number of channels of input image tensor.
        layers (Sequence[int]): Number of BasicBlock in each of the four stages.
        channels (Sequence[int]): Output widths; indices 0-1 are used by the
            stem, 2-5 by the four stages, and the last entry is exposed as
            ``out_channels``.
        out_indices (None | Sequence[int]): Indices of output stages. When
            given, ``forward`` returns a tuple of those stage outputs instead
            of the final feature map.
        last_stage_pool (bool): If True, add `MaxPool2d` layer to last stage.
    '''

    def __init__(self,
                 in_channels=3,
                 layers=(1, 2, 5, 3),
                 channels=(64, 128, 256, 256, 512, 512, 512),
                 out_indices=None,
                 last_stage_pool=False):
        super().__init__()
        assert isinstance(in_channels, int)
        assert isinstance(last_stage_pool, bool)

        self.out_indices = out_indices
        self.last_stage_pool = last_stage_pool

        # conv 1 (Conv Conv)
        self.conv1_1 = nn.Conv2d(
            in_channels, channels[0], kernel_size=3, stride=1, padding=1)
        self.bn1_1 = nn.BatchNorm2d(channels[0])
        self.relu1_1 = nn.ReLU(inplace=True)

        self.conv1_2 = nn.Conv2d(
            channels[0], channels[1], kernel_size=3, stride=1, padding=1)
        self.bn1_2 = nn.BatchNorm2d(channels[1])
        self.relu1_2 = nn.ReLU(inplace=True)

        # conv 2 (Max-pooling, Residual block, Conv)
        self.pool2 = nn.MaxPool2d(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self.block2 = self._make_layer(channels[1], channels[2], layers[0])
        self.conv2 = nn.Conv2d(
            channels[2], channels[2], kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(channels[2])
        self.relu2 = nn.ReLU(inplace=True)

        # conv 3 (Max-pooling, Residual block, Conv)
        self.pool3 = nn.MaxPool2d(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self.block3 = self._make_layer(channels[2], channels[3], layers[1])
        self.conv3 = nn.Conv2d(
            channels[3], channels[3], kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(channels[3])
        self.relu3 = nn.ReLU(inplace=True)

        # conv 4 (Max-pooling, Residual block, Conv)
        self.pool4 = nn.MaxPool2d(
            kernel_size=(2, 1), stride=(2, 1), padding=0, ceil_mode=True)
        self.block4 = self._make_layer(channels[3], channels[4], layers[2])
        self.conv4 = nn.Conv2d(
            channels[4], channels[4], kernel_size=3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm2d(channels[4])
        self.relu4 = nn.ReLU(inplace=True)

        # conv 5 ((Max-pooling), Residual block, Conv)
        self.pool5 = None
        if self.last_stage_pool:
            self.pool5 = nn.MaxPool2d(
                kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self.block5 = self._make_layer(channels[4], channels[5], layers[3])
        self.conv5 = nn.Conv2d(
            channels[5], channels[5], kernel_size=3, stride=1, padding=1)
        self.bn5 = nn.BatchNorm2d(channels[5])
        self.relu5 = nn.ReLU(inplace=True)

        self.out_channels = channels[-1]

    def _make_layer(self, input_channels, output_channels, blocks):
        """Stack ``blocks`` BasicBlocks; only a channel-changing block gets a projection."""
        stacked = []
        for _ in range(blocks):
            # BasicBlock only checks the truthiness of ``downsample`` and then
            # builds its own 1x1 projection, so a boolean flag is sufficient;
            # constructing a throwaway Conv+BN here would be wasted work.
            needs_projection = input_channels != output_channels
            stacked.append(
                BasicBlock(
                    input_channels,
                    output_channels,
                    downsample=needs_projection))
            input_channels = output_channels
        return nn.Sequential(*stacked)

    def forward(self, x):
        """Run the stem and the four stages; see ``out_indices`` for the return form."""
        x = self.relu1_1(self.bn1_1(self.conv1_1(x)))
        x = self.relu1_2(self.bn1_2(self.conv1_2(x)))

        outs = []
        # Stages are stored as pool{k}/block{k}/conv{k}/bn{k}/relu{k}, k = 2..5.
        for layer_index in range(2, 6):
            pool_layer = getattr(self, 'pool{}'.format(layer_index))
            if pool_layer is not None:  # pool5 is None unless last_stage_pool
                x = pool_layer(x)
            x = getattr(self, 'block{}'.format(layer_index))(x)
            x = getattr(self, 'conv{}'.format(layer_index))(x)
            x = getattr(self, 'bn{}'.format(layer_index))(x)
            x = getattr(self, 'relu{}'.format(layer_index))(x)
            outs.append(x)

        if self.out_indices is not None:
            return tuple(outs[i] for i in self.out_indices)
        return x
\ No newline at end of file
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os, sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorchocr.modeling.common import Activation
__all__ = ["ResNetFPN"]
class ResNetFPN(nn.Module):
    """ResNet trunk whose per-stage outputs are fused by a small top-down head.

    The trunk is a 7x7 stride-2 ``ConvBNLayer`` followed by four stages of
    ``BottleneckBlock`` (``layers >= 50``) or ``BasicBlock`` (otherwise).
    The output of the last block of every stage is kept; the head then
    concatenates the running feature with the two preceding stage outputs
    and finishes with a 1x1 convolution to 512 channels.

    Args:
        in_channels: Channels of the input image tensor.
        layers: Depth key; one of 18, 34, 50, 101, 152 (other values raise
            ``KeyError`` from the lookup table).
        **kwargs: Accepted and ignored.
    """

    def __init__(self, in_channels=1, layers=50, **kwargs):
        super(ResNetFPN, self).__init__()
        supported_layers = {
            18: {
                'depth': [2, 2, 2, 2],
                'block_class': BasicBlock
            },
            34: {
                'depth': [3, 4, 6, 3],
                'block_class': BasicBlock
            },
            50: {
                'depth': [3, 4, 6, 3],
                'block_class': BottleneckBlock
            },
            101: {
                'depth': [3, 4, 23, 3],
                'block_class': BottleneckBlock
            },
            152: {
                'depth': [3, 8, 36, 3],
                'block_class': BottleneckBlock
            }
        }
        stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)]
        num_filters = [64, 128, 256, 512]
        self.depth = supported_layers[layers]['depth']

        self.conv = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=7,
            stride=2,
            act="relu",
            name="conv1")

        self.block_list = nn.ModuleList()
        in_ch = 64
        if layers >= 50:
            for block in range(len(self.depth)):
                for i in range(self.depth[block]):
                    if layers in [101, 152] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)

                    bottleneck = BottleneckBlock(
                        in_channels=in_ch,
                        out_channels=num_filters[block],
                        # Only the first block of a stage applies the stage stride.
                        stride=stride_list[block] if i == 0 else 1,
                        name=conv_name)
                    in_ch = num_filters[block] * 4
                    self.block_list.add_module(
                        "bottleneckBlock_{}_{}".format(block, i), bottleneck)
        else:
            for block in range(len(self.depth)):
                for i in range(self.depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    basic = BasicBlock(
                        in_channels=in_ch,
                        out_channels=num_filters[block],
                        stride=stride_list[block] if i == 0 else 1,
                        is_first=block == i == 0,
                        name=conv_name)
                    in_ch = basic.out_channels
                    self.block_list.add_module(conv_name, basic)

        out_ch_list = [in_ch // 4, in_ch // 2, in_ch]
        self.base_block = nn.ModuleList()
        # NOTE(review): these two lists are never populated anywhere in this
        # class, yet ``forward`` indexes them when spatial sizes differ.
        self.conv_trans = []
        self.bn_block = []
        for i in [-2, -3]:
            in_channels = out_ch_list[i + 1] + out_ch_list[i]

            bb_0 = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_ch_list[i],
                kernel_size=1,
                bias=True)
            self.base_block.add_module("F_{}_base_block_0".format(i), bb_0)
            bb_1 = nn.Conv2d(
                in_channels=out_ch_list[i],
                out_channels=out_ch_list[i],
                kernel_size=3,
                padding=1,
                bias=True)
            self.base_block.add_module("F_{}_base_block_1".format(i), bb_1)
            bb_2 = nn.Sequential(
                nn.BatchNorm2d(out_ch_list[i]),
                Activation("relu")
            )
            self.base_block.add_module("F_{}_base_block_2".format(i), bb_2)

        # Final 1x1 projection, registered once after the loop (``i`` keeps its
        # last value, -3). ``forward`` concatenates before head modules 0 and 3
        # only, which matches three modules per fusion step plus this one.
        bb_3 = nn.Conv2d(
            in_channels=out_ch_list[i],
            out_channels=512,
            kernel_size=1,
            bias=True)
        self.base_block.add_module("F_{}_base_block_3".format(i), bb_3)
        self.out_channels = 512

    def forward(self, x):
        """Run the trunk, collect one feature per stage and fuse them in the head.

        Defined as ``forward`` (not ``__call__``) so ``nn.Module`` dispatch and
        registered hooks keep working; ``module(x)`` behaves as before.
        """
        x = self.conv(x)

        # 1-based positions, inside ``block_list``, of each stage's last block.
        stage_ends = set()
        running_total = 0
        for stage_depth in self.depth:
            running_total += stage_depth
            stage_ends.add(running_total)

        stage_feats = []
        for position, block in enumerate(self.block_list, start=1):
            x = block(x)
            if position in stage_ends:
                stage_feats.append(x)

        base = stage_feats[-1]
        j = 0
        for idx, block in enumerate(self.base_block):
            if idx % 3 == 0 and idx < 6:
                j = j + 1
                skip = stage_feats[-j - 1]
                if list(skip.shape[2:]) != list(base.shape[2:]):
                    # NOTE(review): conv_trans / bn_block are empty lists, so
                    # reaching this branch raises IndexError. Kept as-is; a
                    # resize path would have to be designed deliberately.
                    base = self.conv_trans[j - 1](base)
                    base = self.bn_block[j - 1](base)
                base = torch.cat([base, skip], dim=1)
            base = block(base)
        return base
class ConvBNLayer(nn.Module):
    """Bias-free ``Conv2d`` followed by ``BatchNorm2d`` and an optional activation.

    When ``stride`` is exactly the tuple ``(1, 1)`` the convolution is built
    with kernel size 2 and dilation 2 instead of ``kernel_size``; padding is
    always derived from the ``kernel_size`` argument.

    Args:
        in_channels: Input channels.
        out_channels: Output channels.
        kernel_size: Integer kernel size (also used to compute the padding).
        stride: Convolution stride.
        groups: Convolution groups.
        act: Activation type passed to ``Activation``; ``None`` disables it.
        name: Accepted for signature compatibility; not used by this layer.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 groups=1,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        # The previous code derived an unused ``bn_name`` via ``name[3:]``,
        # which raised TypeError for the default ``name=None``; it is dropped.
        use_dilation = stride == (1, 1)
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=2 if use_dilation else kernel_size,
            dilation=2 if use_dilation else 1,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            bias=False, )
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = act
        if self.act is not None:
            self._act = Activation(act_type=self.act, inplace=True)

    def forward(self, x):
        """Apply conv, batch-norm and (if configured) the activation.

        Implemented as ``forward`` rather than ``__call__`` so that
        ``nn.Module`` hooks fire; calling the module is unchanged.
        """
        x = self.conv(x)
        x = self.bn(x)
        if self.act is not None:
            x = self._act(x)
        return x
class ShortCut(nn.Module):
    """Residual shortcut: a 1x1 ``ConvBNLayer`` projection or a pass-through.

    A projection is created when the channel counts differ, when ``stride``
    is not equal to ``1``, or when ``is_first`` equals ``True``; otherwise the
    input is returned unchanged.

    Args:
        in_channels: Channels of the incoming tensor.
        out_channels: Channels expected by the residual sum.
        stride: Stride of the projection; the tuple ``(1, 1)`` is mapped to ``1``.
        name: Forwarded to ``ConvBNLayer``.
        is_first: Forces a projection when it equals ``True``.
    """

    def __init__(self, in_channels, out_channels, stride, name, is_first=False):
        super().__init__()
        self.use_conv = (in_channels != out_channels or stride != 1
                         or is_first == True)
        if self.use_conv:
            # (1, 1) is passed on as the plain int 1; any other stride is kept.
            conv_stride = 1 if stride == (1, 1) else stride
            self.conv = ConvBNLayer(
                in_channels, out_channels, 1, conv_stride, name=name)

    def forward(self, x):
        """Project ``x`` when a projection exists, else return it as-is."""
        return self.conv(x) if self.use_conv else x
class BottleneckBlock(nn.Module):
    """1x1 -> 3x3 -> 1x1 ``ConvBNLayer`` stack added to a ``ShortCut`` and ReLU-ed.

    The last convolution and the shortcut both produce ``out_channels * 4``
    channels, which is also stored as ``self.out_channels``.

    Args:
        in_channels: Channels of the block input.
        out_channels: Width of the first two convolutions.
        stride: Stride of the 3x3 convolution and of the shortcut.
        name: Prefix used to derive the sub-layer names.
    """

    def __init__(self, in_channels, out_channels, stride, name):
        super().__init__()
        expanded = out_channels * 4
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=expanded,
            kernel_size=1,
            act=None,
            name=name + "_branch2c")
        self.short = ShortCut(
            in_channels=in_channels,
            out_channels=expanded,
            stride=stride,
            is_first=False,
            name=name + "_branch1")
        self.out_channels = expanded

    def forward(self, x):
        """Return ``relu(conv2(conv1(conv0(x))) + short(x))``."""
        main = self.conv2(self.conv1(self.conv0(x)))
        return F.relu(main + self.short(x))
class BasicBlock(nn.Module):
    """Two 3x3 ``ConvBNLayer`` modules added to a ``ShortCut`` and ReLU-ed.

    Args:
        in_channels: Channels of the block input.
        out_channels: Channels produced by the block (kept as ``self.out_channels``).
        stride: Stride of the first convolution and of the shortcut.
        name: Prefix used to derive the sub-layer names.
        is_first: Forwarded to ``ShortCut``.
    """

    def __init__(self, in_channels, out_channels, stride, name, is_first):
        super().__init__()
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            act='relu',
            stride=stride,
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None,
            name=name + "_branch2b")
        self.short = ShortCut(
            in_channels=in_channels,
            out_channels=out_channels,
            stride=stride,
            is_first=is_first,
            name=name + "_branch1")
        self.out_channels = out_channels

    def forward(self, x):
        """Return ``relu(conv1(conv0(x)) + short(x))``."""
        main = self.conv1(self.conv0(x))
        merged = main + self.short(x)
        return F.relu(merged)
import os, sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorchocr.modeling.common import Activation
class ConvBNLayer(nn.Module):
    """Optional average-pool, then bias-free ``Conv2d``, ``BatchNorm2d`` and activation.

    In "vd" mode the input is first average-pooled with kernel and stride
    equal to ``stride`` and the convolution itself runs with stride 1;
    otherwise the convolution carries ``stride`` and no pooling is applied.

    Args:
        in_channels: Input channels.
        out_channels: Output channels.
        kernel_size: Integer kernel size; padding is ``(kernel_size - 1) // 2``.
        stride: Stride for the pool (vd mode) or the convolution (otherwise).
        groups: Convolution groups.
        is_vd_mode: Enables the pool-then-stride-1-conv variant.
        act: Activation type passed to ``Activation``; ``None`` disables it.
        name: Accepted for signature compatibility; not used by this layer.
    """

    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            groups=1,
            is_vd_mode=False,
            act=None,
            name=None, ):
        super().__init__()
        self.act = act
        self.is_vd_mode = is_vd_mode

        # The pool is always registered, but only used when is_vd_mode is set.
        self._pool2d_avg = nn.AvgPool2d(
            kernel_size=stride, stride=stride, padding=0, ceil_mode=True)

        conv_stride = 1 if is_vd_mode else stride
        same_padding = (kernel_size - 1) // 2
        self._conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=conv_stride,
            padding=same_padding,
            groups=groups,
            bias=False)
        self._batch_norm = nn.BatchNorm2d(out_channels)
        if act is not None:
            self._act = Activation(act_type=act, inplace=True)

    def forward(self, inputs):
        """Apply (pool) -> conv -> batch-norm -> (activation)."""
        feats = self._pool2d_avg(inputs) if self.is_vd_mode else inputs
        feats = self._batch_norm(self._conv(feats))
        if self.act is None:
            return feats
        return self._act(feats)
class BottleneckBlock(nn.Module):
    """1x1 -> 3x3 -> 1x1 ``ConvBNLayer`` stack with an identity or projected residual.

    Args:
        in_channels: Channels of the block input.
        out_channels: Width of the first two convolutions; the block emits
            ``out_channels * 4`` channels.
        stride: Stride of the 3x3 convolution and of the projection. May be an
            int or a sequence whose first element is the leading stride.
        shortcut: When True the input is added unchanged; when False a 1x1
            ``ConvBNLayer`` projection (``self.short``) is created and used.
        if_first: When True the projection never uses vd mode.
        name: Optional prefix for sub-layer names; ``None`` is treated as an
            empty prefix instead of raising on string concatenation.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BottleneckBlock, self).__init__()
        # The default name=None used to crash on ``name + "_branch2a"``.
        prefix = "" if name is None else name

        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu',
            name=prefix + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=prefix + "_branch2b")
        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None,
            name=prefix + "_branch2c")

        if not shortcut:
            # Accept a plain int stride as well as the (h, w) form.
            leading_stride = stride if isinstance(stride, int) else stride[0]
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels * 4,
                kernel_size=1,
                stride=stride,
                # vd mode only for a strided projection that is not the first block.
                is_vd_mode=not if_first and leading_stride != 1,
                name=prefix + "_branch1")

        self.shortcut = shortcut

    def forward(self, inputs):
        """Return ``relu(residual + conv2(conv1(conv0(inputs))))``."""
        y = self.conv0(inputs)
        conv1 = self.conv1(y)
        conv2 = self.conv2(conv1)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = short + conv2
        y = F.relu(y)
        return y
class BasicBlock(nn.Module):
    """Two 3x3 ``ConvBNLayer`` modules with an identity or projected residual.

    Args:
        in_channels: Channels of the block input.
        out_channels: Channels produced by the block.
        stride: Stride of the first convolution and of the projection. May be
            an int or a sequence whose first element is the leading stride.
        shortcut: When True the input is added unchanged; when False a 1x1
            ``ConvBNLayer`` projection (``self.short``) is created and used.
        if_first: When True the projection never uses vd mode.
        name: Optional prefix for sub-layer names; ``None`` is treated as an
            empty prefix instead of raising on string concatenation.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BasicBlock, self).__init__()
        # The default name=None used to crash on ``name + "_branch2a"``.
        prefix = "" if name is None else name

        self.stride = stride
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=prefix + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None,
            name=prefix + "_branch2b")

        if not shortcut:
            # Accept a plain int stride as well as the (h, w) form.
            leading_stride = stride if isinstance(stride, int) else stride[0]
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=stride,
                # vd mode only for a strided projection that is not the first block.
                is_vd_mode=not if_first and leading_stride != 1,
                name=prefix + "_branch1")

        self.shortcut = shortcut

    def forward(self, inputs):
        """Return ``relu(residual + conv1(conv0(inputs)))``."""
        y = self.conv0(inputs)
        conv1 = self.conv1(y)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = short + conv1
        y = F.relu(y)
        return y
class ResNet(nn.Module):
    """ResNet-vd style backbone: three 3x3 stem convs, max-pool, four stages, max-pool.

    Stages use ``BottleneckBlock`` for ``layers >= 50`` and ``BasicBlock``
    otherwise. The first block of every stage except the first uses stride
    ``(2, 1)``; all other blocks use ``(1, 1)``.

    Args:
        in_channels: Channels of the input image tensor.
        layers: One of 18, 34, 50, 101, 152, 200 (asserted).
        **kwargs: Accepted and ignored.

    Attributes:
        out_channels: Channel count emitted by the last stage, i.e.
            ``512`` for BasicBlock variants and ``512 * 4`` for bottleneck
            variants (each bottleneck ends in a conv with ``out_channels * 4``).
    """

    def __init__(self, in_channels=3, layers=50, **kwargs):
        super(ResNet, self).__init__()

        self.layers = layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)

        depth_by_layers = {
            18: [2, 2, 2, 2],
            34: [3, 4, 6, 3],
            50: [3, 4, 6, 3],
            101: [3, 4, 23, 3],
            152: [3, 8, 36, 3],
            200: [3, 12, 48, 3],
        }
        depth = depth_by_layers[layers]

        use_bottleneck = layers >= 50
        num_channels = [64, 256, 512,
                        1024] if use_bottleneck else [64, 64, 128, 256]
        num_filters = [64, 128, 256, 512]
        block_cls = BottleneckBlock if use_bottleneck else BasicBlock
        # Bottleneck blocks emit four times their nominal width.
        expansion = 4 if use_bottleneck else 1

        self.conv1_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_1")
        self.conv1_2 = ConvBNLayer(
            in_channels=32,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_2")
        self.conv1_3 = ConvBNLayer(
            in_channels=32,
            out_channels=64,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_3")
        self.pool2d_max = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.block_list = nn.Sequential()
        for block in range(len(depth)):
            # The first block of a stage projects its residual; the rest add
            # the input unchanged.
            shortcut = False
            for i in range(depth[block]):
                if layers in [101, 152, 200] and block == 2:
                    if i == 0:
                        conv_name = "res" + str(block + 2) + "a"
                    else:
                        conv_name = "res" + str(block + 2) + "b" + str(i)
                else:
                    conv_name = "res" + str(block + 2) + chr(97 + i)

                if i == 0 and block != 0:
                    stride = (2, 1)
                else:
                    stride = (1, 1)

                stage_block = block_cls(
                    in_channels=num_channels[block]
                    if i == 0 else num_filters[block] * expansion,
                    out_channels=num_filters[block],
                    stride=stride,
                    shortcut=shortcut,
                    if_first=block == i == 0,
                    name=conv_name)
                shortcut = True
                self.block_list.add_module('bb_%d_%d' % (block, i), stage_block)

        # Previously the bottleneck variants reported num_filters[-1] (512)
        # although their last block outputs num_filters[-1] * 4 channels.
        self.out_channels = num_filters[-1] * expansion
        self.out_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

    def forward(self, inputs):
        """Apply stem, max-pool, every residual block in order, then the final max-pool."""
        y = self.conv1_1(inputs)
        y = self.conv1_2(y)
        y = self.conv1_3(y)
        y = self.pool2d_max(y)
        for block in self.block_list:
            y = block(y)
        y = self.out_pool(y)
        return y
\ No newline at end of file
import torch
import torch.nn as nn
from pytorchocr.modeling.common import Activation
import numpy as np
import torch
from torch import nn
from ..common import Activation
def drop_path(x, drop_prob=0., training=False):
def drop_path(x, drop_prob=0.0, training=False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
"""
if drop_prob == 0. or not training:
if drop_prob == 0.0 or not training:
return x
keep_prob = torch.as_tensor(1 - drop_prob)
shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
shape = (x.shape[0],) + (1,) * (x.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype)
random_tensor = torch.floor(random_tensor) # binarize
output = x.divide(keep_prob) * random_tensor
......@@ -19,15 +21,17 @@ def drop_path(x, drop_prob=0., training=False):
class ConvBNLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=0,
bias_attr=False,
groups=1,
act='gelu'):
def __init__(
self,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=0,
bias_attr=False,
groups=1,
act="gelu",
):
super().__init__()
self.conv = nn.Conv2d(
in_channels=in_channels,
......@@ -36,7 +40,8 @@ class ConvBNLayer(nn.Module):
stride=stride,
padding=padding,
groups=groups,
bias=bias_attr)
bias=bias_attr,
)
self.norm = nn.BatchNorm2d(out_channels)
self.act = Activation(act_type=act, inplace=True)
......@@ -48,8 +53,7 @@ class ConvBNLayer(nn.Module):
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
......@@ -68,12 +72,14 @@ class Identity(nn.Module):
class Mlp(nn.Module):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer='gelu',
drop=0.):
def __init__(
self,
in_features,
hidden_features=None,
out_features=None,
act_layer="gelu",
drop=0.0,
):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
......@@ -93,11 +99,12 @@ class Mlp(nn.Module):
class ConvMixer(nn.Module):
def __init__(
self,
dim,
num_heads=8,
HW=[8, 25],
local_k=[3, 3], ):
self,
dim,
num_heads=8,
HW=[8, 25],
local_k=[3, 3],
):
super().__init__()
self.HW = HW
self.dim = dim
......@@ -105,9 +112,10 @@ class ConvMixer(nn.Module):
dim,
dim,
local_k,
1, [local_k[0] // 2, local_k[1] // 2],
1,
[local_k[0] // 2, local_k[1] // 2],
groups=num_heads,
)
)
def forward(self, x):
h = self.HW[0]
......@@ -119,16 +127,18 @@ class ConvMixer(nn.Module):
class Attention(nn.Module):
def __init__(self,
dim,
num_heads=8,
mixer='Global',
HW=[8, 25],
local_k=[7, 11],
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
def __init__(
self,
dim,
num_heads=8,
mixer="Global",
HW=[8, 25],
local_k=[7, 11],
qkv_bias=False,
qk_scale=None,
attn_drop=0.0,
proj_drop=0.0,
):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
......@@ -143,16 +153,19 @@ class Attention(nn.Module):
W = HW[1]
self.N = H * W
self.C = dim
if mixer == 'Local' and HW is not None:
if mixer == "Local" and HW is not None:
hk = local_k[0]
wk = local_k[1]
mask = torch.ones(H * W, H + hk - 1, W + wk - 1, dtype=torch.float32)
for h in range(0, H):
for w in range(0, W):
mask[h * W + w, h:h + hk, w:w + wk] = 0.
mask_paddle = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk //
2].flatten(1)
mask_inf = torch.full([H * W, H * W], fill_value=float("-Inf"), dtype=torch.float32)
mask[h * W + w, h : h + hk, w : w + wk] = 0.0
mask_paddle = mask[:, hk // 2 : H + hk // 2, wk // 2 : W + wk // 2].flatten(
1
)
mask_inf = torch.full(
[H * W, H * W], fill_value=float("-Inf"), dtype=torch.float32
)
mask = torch.where(mask_paddle < 1, mask_paddle, mask_inf)
self.mask = mask.unsqueeze(0).unsqueeze(1)
# self.mask = mask[None, None, :]
......@@ -165,11 +178,13 @@ class Attention(nn.Module):
else:
_, N, C = x.shape
qkv = self.qkv(x)
qkv = qkv.reshape((-1, N, 3, self.num_heads, C // self.num_heads)).permute(2, 0, 3, 1, 4)
qkv = qkv.reshape((-1, N, 3, self.num_heads, C // self.num_heads)).permute(
2, 0, 3, 1, 4
)
q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
attn = (q.matmul(k.permute(0, 1, 3, 2)))
if self.mixer == 'Local':
attn = q.matmul(k.permute(0, 1, 3, 2))
if self.mixer == "Local":
attn += self.mask
attn = nn.functional.softmax(attn, dim=-1)
attn = self.attn_drop(attn)
......@@ -181,28 +196,30 @@ class Attention(nn.Module):
class Block(nn.Module):
def __init__(self,
dim,
num_heads,
mixer='Global',
local_mixer=[7, 11],
HW=None,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer='gelu',
norm_layer='nn.LayerNorm',
epsilon=1e-6,
prenorm=True):
def __init__(
self,
dim,
num_heads,
mixer="Global",
local_mixer=[7, 11],
HW=None,
mlp_ratio=4.0,
qkv_bias=False,
qk_scale=None,
drop=0.0,
attn_drop=0.0,
drop_path=0.0,
act_layer="gelu",
norm_layer="nn.LayerNorm",
epsilon=1e-6,
prenorm=True,
):
super().__init__()
if isinstance(norm_layer, str):
self.norm1 = eval(norm_layer)(dim, eps=epsilon)
else:
self.norm1 = norm_layer(dim)
if mixer == 'Global' or mixer == 'Local':
if mixer == "Global" or mixer == "Local":
self.mixer = Attention(
dim,
num_heads=num_heads,
......@@ -212,24 +229,26 @@ class Block(nn.Module):
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop)
elif mixer == 'Conv':
self.mixer = ConvMixer(
dim, num_heads=num_heads, HW=HW, local_k=local_mixer)
proj_drop=drop,
)
elif mixer == "Conv":
self.mixer = ConvMixer(dim, num_heads=num_heads, HW=HW, local_k=local_mixer)
else:
raise TypeError("The mixer must be one of [Global, Local, Conv]")
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
if isinstance(norm_layer, str):
self.norm2 = eval(norm_layer)(dim, eps=epsilon)
else:
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp_ratio = mlp_ratio
self.mlp = Mlp(in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop,
)
self.prenorm = prenorm
def forward(self, x):
......@@ -243,25 +262,24 @@ class Block(nn.Module):
class PatchEmbed(nn.Module):
""" Image to Patch Embedding
"""
"""Image to Patch Embedding"""
def __init__(self,
img_size=[32, 100],
in_channels=3,
embed_dim=768,
sub_num=2,
patch_size=[4, 4],
mode='pope',
):
def __init__(
self,
img_size=[32, 100],
in_channels=3,
embed_dim=768,
sub_num=2,
patch_size=[4, 4],
mode="pope",
):
super().__init__()
num_patches = (img_size[1] // (2 ** sub_num)) * \
(img_size[0] // (2 ** sub_num))
num_patches = (img_size[1] // (2**sub_num)) * (img_size[0] // (2**sub_num))
self.img_size = img_size
self.num_patches = num_patches
self.embed_dim = embed_dim
self.norm = None
if mode == 'pope':
if mode == "pope":
if sub_num == 2:
self.proj = nn.Sequential(
ConvBNLayer(
......@@ -270,16 +288,19 @@ class PatchEmbed(nn.Module):
kernel_size=3,
stride=2,
padding=1,
act='gelu',
bias_attr=True),
act="gelu",
bias_attr=True,
),
ConvBNLayer(
in_channels=embed_dim // 2,
out_channels=embed_dim,
kernel_size=3,
stride=2,
padding=1,
act='gelu',
bias_attr=True))
act="gelu",
bias_attr=True,
),
)
if sub_num == 3:
self.proj = nn.Sequential(
ConvBNLayer(
......@@ -288,55 +309,66 @@ class PatchEmbed(nn.Module):
kernel_size=3,
stride=2,
padding=1,
act='gelu',
bias_attr=True),
act="gelu",
bias_attr=True,
),
ConvBNLayer(
in_channels=embed_dim // 4,
out_channels=embed_dim // 2,
kernel_size=3,
stride=2,
padding=1,
act='gelu',
bias_attr=True),
act="gelu",
bias_attr=True,
),
ConvBNLayer(
in_channels=embed_dim // 2,
out_channels=embed_dim,
kernel_size=3,
stride=2,
padding=1,
act='gelu',
bias_attr=True))
elif mode == 'linear':
act="gelu",
bias_attr=True,
),
)
elif mode == "linear":
self.proj = nn.Conv2d(
1, embed_dim, kernel_size=patch_size, stride=patch_size)
self.num_patches = img_size[0] // patch_size[0] * img_size[
1] // patch_size[1]
1, embed_dim, kernel_size=patch_size, stride=patch_size
)
self.num_patches = (
img_size[0] // patch_size[0] * img_size[1] // patch_size[1]
)
def forward(self, x):
B, C, H, W = x.shape
assert H == self.img_size[0] and W == self.img_size[1], \
"Input image size ({}*{}) doesn't match model ({}*{}).".format(
H,W,self.img_size[0],self.img_size[1]
)
assert (
H == self.img_size[0] and W == self.img_size[1]
), "Input image size ({}*{}) doesn't match model ({}*{}).".format(
H, W, self.img_size[0], self.img_size[1]
)
x = self.proj(x).flatten(2).permute(0, 2, 1)
return x
class SubSample(nn.Module):
def __init__(self,
in_channels,
out_channels,
types='Pool',
stride=[2, 1],
sub_norm='nn.LayerNorm',
act=None):
def __init__(
self,
in_channels,
out_channels,
types="Pool",
stride=[2, 1],
sub_norm="nn.LayerNorm",
act=None,
):
super().__init__()
self.types = types
if types == 'Pool':
if types == "Pool":
self.avgpool = nn.AvgPool2d(
kernel_size=[3, 5], stride=stride, padding=[1, 2])
kernel_size=[3, 5], stride=stride, padding=[1, 2]
)
self.maxpool = nn.MaxPool2d(
kernel_size=[3, 5], stride=stride, padding=[1, 2])
kernel_size=[3, 5], stride=stride, padding=[1, 2]
)
self.proj = nn.Linear(in_channels, out_channels)
else:
self.conv = nn.Conv2d(
......@@ -345,7 +377,7 @@ class SubSample(nn.Module):
kernel_size=3,
stride=stride,
padding=1,
)
)
self.norm = eval(sub_norm)(out_channels)
if act is not None:
self.act = act()
......@@ -353,8 +385,7 @@ class SubSample(nn.Module):
self.act = None
def forward(self, x):
if self.types == 'Pool':
if self.types == "Pool":
x1 = self.avgpool(x)
x2 = self.maxpool(x)
x = (x1 + x2) * 0.5
......@@ -371,46 +402,51 @@ class SubSample(nn.Module):
class SVTRNet(nn.Module):
def __init__(
self,
img_size=[32, 100],
in_channels=3,
embed_dim=[64, 128, 256],
depth=[3, 6, 3],
num_heads=[2, 4, 8],
mixer=['Local'] * 6 + ['Global'] *
6, # Local atten, Global atten, Conv
local_mixer=[[7, 11], [7, 11], [7, 11]],
patch_merging='Conv', # Conv, Pool, None
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
last_drop=0.0,
attn_drop_rate=0.,
drop_path_rate=0.1,
norm_layer='nn.LayerNorm',
sub_norm='nn.LayerNorm',
epsilon=1e-6,
out_channels=192,
out_char_num=25,
block_unit='Block',
act='gelu',
last_stage=True,
sub_num=2,
prenorm=True,
use_lenhead=False,
**kwargs):
self,
img_size=[32, 100],
in_channels=3,
embed_dim=[64, 128, 256],
depth=[3, 6, 3],
num_heads=[2, 4, 8],
mixer=["Local"] * 6 + ["Global"] * 6, # Local atten, Global atten, Conv
local_mixer=[[7, 11], [7, 11], [7, 11]],
patch_merging="Conv", # Conv, Pool, None
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.0,
last_drop=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.1,
norm_layer="nn.LayerNorm",
sub_norm="nn.LayerNorm",
epsilon=1e-6,
out_channels=192,
out_char_num=25,
block_unit="Block",
act="gelu",
last_stage=True,
sub_num=2,
prenorm=True,
use_lenhead=False,
**kwargs
):
super().__init__()
self.img_size = img_size
self.embed_dim = embed_dim
self.out_channels = out_channels
self.prenorm = prenorm
patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging
patch_merging = (
None
if patch_merging != "Conv" and patch_merging != "Pool"
else patch_merging
)
self.patch_embed = PatchEmbed(
img_size=img_size,
in_channels=in_channels,
embed_dim=embed_dim[0],
sub_num=sub_num)
sub_num=sub_num,
)
num_patches = self.patch_embed.num_patches
self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)]
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim[0]))
......@@ -418,81 +454,95 @@ class SVTRNet(nn.Module):
Block_unit = eval(block_unit)
dpr = np.linspace(0, drop_path_rate, sum(depth))
self.blocks1 = nn.ModuleList([
Block_unit(
dim=embed_dim[0],
num_heads=num_heads[0],
mixer=mixer[0:depth[0]][i],
HW=self.HW,
local_mixer=local_mixer[0],
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
act_layer=act,
attn_drop=attn_drop_rate,
drop_path=dpr[0:depth[0]][i],
norm_layer=norm_layer,
epsilon=epsilon,
prenorm=prenorm) for i in range(depth[0])
])
self.blocks1 = nn.ModuleList(
[
Block_unit(
dim=embed_dim[0],
num_heads=num_heads[0],
mixer=mixer[0 : depth[0]][i],
HW=self.HW,
local_mixer=local_mixer[0],
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
act_layer=act,
attn_drop=attn_drop_rate,
drop_path=dpr[0 : depth[0]][i],
norm_layer=norm_layer,
epsilon=epsilon,
prenorm=prenorm,
)
for i in range(depth[0])
]
)
if patch_merging is not None:
self.sub_sample1 = SubSample(
embed_dim[0],
embed_dim[1],
sub_norm=sub_norm,
stride=[2, 1],
types=patch_merging)
types=patch_merging,
)
HW = [self.HW[0] // 2, self.HW[1]]
else:
HW = self.HW
self.patch_merging = patch_merging
self.blocks2 = nn.ModuleList([
Block_unit(
dim=embed_dim[1],
num_heads=num_heads[1],
mixer=mixer[depth[0]:depth[0] + depth[1]][i],
HW=HW,
local_mixer=local_mixer[1],
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
act_layer=act,
attn_drop=attn_drop_rate,
drop_path=dpr[depth[0]:depth[0] + depth[1]][i],
norm_layer=norm_layer,
epsilon=epsilon,
prenorm=prenorm) for i in range(depth[1])
])
self.blocks2 = nn.ModuleList(
[
Block_unit(
dim=embed_dim[1],
num_heads=num_heads[1],
mixer=mixer[depth[0] : depth[0] + depth[1]][i],
HW=HW,
local_mixer=local_mixer[1],
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
act_layer=act,
attn_drop=attn_drop_rate,
drop_path=dpr[depth[0] : depth[0] + depth[1]][i],
norm_layer=norm_layer,
epsilon=epsilon,
prenorm=prenorm,
)
for i in range(depth[1])
]
)
if patch_merging is not None:
self.sub_sample2 = SubSample(
embed_dim[1],
embed_dim[2],
sub_norm=sub_norm,
stride=[2, 1],
types=patch_merging)
types=patch_merging,
)
HW = [self.HW[0] // 4, self.HW[1]]
else:
HW = self.HW
self.blocks3 = nn.ModuleList([
Block_unit(
dim=embed_dim[2],
num_heads=num_heads[2],
mixer=mixer[depth[0] + depth[1]:][i],
HW=HW,
local_mixer=local_mixer[2],
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
act_layer=act,
attn_drop=attn_drop_rate,
drop_path=dpr[depth[0] + depth[1]:][i],
norm_layer=norm_layer,
epsilon=epsilon,
prenorm=prenorm) for i in range(depth[2])
])
self.blocks3 = nn.ModuleList(
[
Block_unit(
dim=embed_dim[2],
num_heads=num_heads[2],
mixer=mixer[depth[0] + depth[1] :][i],
HW=HW,
local_mixer=local_mixer[2],
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
act_layer=act,
attn_drop=attn_drop_rate,
drop_path=dpr[depth[0] + depth[1] :][i],
norm_layer=norm_layer,
epsilon=epsilon,
prenorm=prenorm,
)
for i in range(depth[2])
]
)
self.last_stage = last_stage
if last_stage:
self.avg_pool = nn.AdaptiveAvgPool2d([1, out_char_num])
......@@ -502,8 +552,9 @@ class SVTRNet(nn.Module):
kernel_size=1,
stride=1,
padding=0,
bias=False)
self.hardswish = Activation('hard_swish', inplace=True) #nn.Hardswish()
bias=False,
)
self.hardswish = Activation("hard_swish", inplace=True) # nn.Hardswish()
# self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer")
self.dropout = nn.Dropout(p=last_drop)
if not prenorm:
......@@ -511,9 +562,10 @@ class SVTRNet(nn.Module):
self.use_lenhead = use_lenhead
if use_lenhead:
self.len_conv = nn.Linear(embed_dim[2], self.out_channels)
self.hardswish_len = Activation('hard_swish', inplace=True)# nn.Hardswish()
self.dropout_len = nn.Dropout(
p=last_drop)
self.hardswish_len = Activation(
"hard_swish", inplace=True
) # nn.Hardswish()
self.dropout_len = nn.Dropout(p=last_drop)
torch.nn.init.xavier_normal_(self.pos_embed)
self.apply(self._init_weights)
......@@ -521,7 +573,7 @@ class SVTRNet(nn.Module):
def _init_weights(self, m):
# weight initialization
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
nn.init.kaiming_normal_(m.weight, mode="fan_out")
if m.bias is not None:
nn.init.zeros_(m.bias)
elif isinstance(m, nn.BatchNorm2d):
......@@ -532,7 +584,7 @@ class SVTRNet(nn.Module):
if m.bias is not None:
nn.init.zeros_(m.bias)
elif isinstance(m, nn.ConvTranspose2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
nn.init.kaiming_normal_(m.weight, mode="fan_out")
if m.bias is not None:
nn.init.zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
......@@ -548,13 +600,17 @@ class SVTRNet(nn.Module):
if self.patch_merging is not None:
x = self.sub_sample1(
x.permute(0, 2, 1).reshape(
[-1, self.embed_dim[0], self.HW[0], self.HW[1]]))
[-1, self.embed_dim[0], self.HW[0], self.HW[1]]
)
)
for blk in self.blocks2:
x = blk(x)
if self.patch_merging is not None:
x = self.sub_sample2(
x.permute(0, 2, 1).reshape(
[-1, self.embed_dim[1], self.HW[0] // 2, self.HW[1]]))
[-1, self.embed_dim[1], self.HW[0] // 2, self.HW[1]]
)
)
for blk in self.blocks3:
x = blk(x)
if not self.prenorm:
......@@ -572,11 +628,11 @@ class SVTRNet(nn.Module):
else:
h = self.HW[0]
x = self.avg_pool(
x.permute(0, 2, 1).reshape(
[-1, self.embed_dim[2], h, self.HW[1]]))
x.permute(0, 2, 1).reshape([-1, self.embed_dim[2], h, self.HW[1]])
)
x = self.last_conv(x)
x = self.hardswish(x)
x = self.dropout(x)
if self.use_lenhead:
return x, len_x
return x
\ No newline at end of file
return x
"""
This code refers to:
https://github.com/roatienza/deep-text-recognition-benchmark/blob/master/modules/vitstr.py
"""
import numpy as np
import torch
import torch.nn as nn
from pytorchocr.modeling.backbones.rec_svtrnet import Block, PatchEmbed
# import paddle
# import paddle.nn as nn
# from ppocr.modeling.backbones.rec_svtrnet import Block, PatchEmbed, zeros_, trunc_normal_, ones_
scale_dim_heads = {'tiny': [192, 3], 'small': [384, 6], 'base': [768, 12]}
class ViTSTR(nn.Module):
    """Vision-transformer backbone: patch embedding, class token, transformer blocks.

    ``embed_dim`` and ``num_heads`` fall back to the ``scale_dim_heads[scale]``
    preset when left as ``None``; ``out_channels`` falls back to ``embed_dim``.
    ``forward`` keeps the first ``seqlen`` tokens of the normalised sequence.
    """

    def __init__(self,
                 img_size=[224, 224],
                 in_channels=1,
                 scale='tiny',
                 seqlen=27,
                 patch_size=[16, 16],
                 embed_dim=None,
                 depth=12,
                 num_heads=None,
                 mlp_ratio=4,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_path_rate=0.,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 norm_layer='nn.LayerNorm',
                 act_layer='gelu',
                 epsilon=1e-6,
                 out_channels=None,
                 **kwargs):
        super().__init__()
        self.seqlen = seqlen

        # Presets are only looked up for the values the caller did not supply.
        if embed_dim is None:
            embed_dim = scale_dim_heads[scale][0]
        if num_heads is None:
            num_heads = scale_dim_heads[scale][1]
        if out_channels is None:
            out_channels = embed_dim

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            in_channels=in_channels,
            embed_dim=embed_dim,
            patch_size=patch_size,
            mode='linear')
        token_count = self.patch_embed.num_patches + 1  # patches + class token

        self.pos_embed = nn.Parameter(torch.zeros(1, token_count, embed_dim))
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        # Drop-path rate grows linearly from 0 to drop_path_rate over the blocks.
        dpr = np.linspace(0, drop_path_rate, depth)
        layers = []
        for idx in range(depth):
            layers.append(
                Block(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[idx],
                    norm_layer=norm_layer,
                    act_layer=act_layer,
                    epsilon=epsilon,
                    prenorm=False))
        self.blocks = nn.ModuleList(layers)

        # NOTE(review): eval() on a configuration-supplied string; only pass
        # trusted values such as 'nn.LayerNorm' here.
        self.norm = eval(norm_layer)(embed_dim, eps=epsilon)
        self.out_channels = out_channels

        torch.nn.init.xavier_normal_(self.pos_embed)
        torch.nn.init.xavier_normal_(self.cls_token)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Per-module initialiser applied to every submodule via ``self.apply``."""
        if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
            nn.init.kaiming_normal_(m.weight, mode='fan_out')
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, (nn.BatchNorm2d, nn.LayerNorm)):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, 0, 0.01)
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def forward_features(self, x):
        """Embed patches, prepend the class token, and run all blocks plus the final norm."""
        batch = x.shape[0]
        tokens = self.patch_embed(x)
        # One copy of the learned class token per sample in the batch.
        cls_tokens = self.cls_token.repeat(batch, 1, 1)
        tokens = torch.cat((cls_tokens, tokens), dim=1)
        tokens = self.pos_drop(tokens + self.pos_embed)
        for blk in self.blocks:
            tokens = blk(tokens)
        return self.norm(tokens)

    def forward(self, x):
        """Return the first ``seqlen`` tokens, permuted to (0, 2, 1) with a new axis at dim 2."""
        feats = self.forward_features(x)
        feats = feats[:, :self.seqlen]
        return feats.permute(0, 2, 1).unsqueeze(2)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
__all__ = ['MobileNetV3']
def make_divisible(v, divisor=8, min_value=None):
    """Round ``v`` to a nearby multiple of ``divisor``.

    The result is never below ``min_value`` (which defaults to ``divisor``),
    and is bumped up by one ``divisor`` if rounding would otherwise leave it
    below 90% of ``v``.
    """
    lower_bound = divisor if min_value is None else min_value
    snapped = (int(v + divisor / 2) // divisor) * divisor
    rounded = max(lower_bound, snapped)
    return rounded + divisor if rounded < 0.9 * v else rounded
def hard_sigmoid(x, slope=0.1666667, offset=0.5):
    """Piecewise-linear sigmoid: ``slope * x + offset`` clamped to [0, 1]."""
    affine = slope * x + offset
    return torch.clamp(affine, min=0., max=1.)
def hard_swish(x, inplace=True):
    """Hard-swish activation: ``x * relu6(x + 3) / 6``.

    ``inplace`` is forwarded to ``relu6``, which operates on the temporary
    ``x + 3`` rather than on ``x`` itself.
    """
    gate = F.relu6(x + 3., inplace=inplace)
    return x * gate / 6.
class MobileNetV3(nn.Module):
    def __init__(self,
                 in_channels=3,
                 model_name='large',
                 scale=0.5,
                 disable_se=False,
                 **kwargs):
        """
        MobileNetV3 backbone that exposes one feature map per stage.

        Args:
            in_channels: channel count of the input fed to the stem convolution.
            model_name: "large" or "small"; selects the layer table below.
            scale: width multiplier; must be one of the supported scales.
            disable_se: when True, no residual unit gets a squeeze-excite module.
        """
        super(MobileNetV3, self).__init__()
        self.disable_se = disable_se

        if model_name == "large":
            cfg = [
                # k, exp, c,  se,     nl,  s,
                [3, 16, 16, False, 'relu', 1],
                [3, 64, 24, False, 'relu', 2],
                [3, 72, 24, False, 'relu', 1],
                [5, 72, 40, True, 'relu', 2],
                [5, 120, 40, True, 'relu', 1],
                [5, 120, 40, True, 'relu', 1],
                [3, 240, 80, False, 'hardswish', 2],
                [3, 200, 80, False, 'hardswish', 1],
                [3, 184, 80, False, 'hardswish', 1],
                [3, 184, 80, False, 'hardswish', 1],
                [3, 480, 112, True, 'hardswish', 1],
                [3, 672, 112, True, 'hardswish', 1],
                [5, 672, 160, True, 'hardswish', 2],
                [5, 960, 160, True, 'hardswish', 1],
                [5, 960, 160, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 960
        elif model_name == "small":
            cfg = [
                # k, exp, c,  se,     nl,  s,
                [3, 16, 16, True, 'relu', 2],
                [3, 72, 24, False, 'relu', 2],
                [3, 88, 24, False, 'relu', 1],
                [5, 96, 40, True, 'hardswish', 2],
                [5, 240, 40, True, 'hardswish', 1],
                [5, 240, 40, True, 'hardswish', 1],
                [5, 120, 48, True, 'hardswish', 1],
                [5, 144, 48, True, 'hardswish', 1],
                [5, 288, 96, True, 'hardswish', 2],
                [5, 576, 96, True, 'hardswish', 1],
                [5, 576, 96, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 576
        else:
            raise NotImplementedError("mode[" + model_name +
                                      "_model] is not implemented!")

        supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
        assert scale in supported_scale, \
            "supported scale are {} but input scale is {}".format(supported_scale, scale)

        # Stem convolution (conv1).
        stem_channels = make_divisible(16 * scale)
        self.conv = ConvBNLayer(
            in_channels=in_channels,
            out_channels=stem_channels,
            kernel_size=3,
            stride=2,
            padding=1,
            groups=1,
            if_act=True,
            act='hardswish',
            name='conv1')

        self.stages = nn.ModuleList()
        self.out_channels = []

        # A stride-2 row starts a new stage, but only past this row index.
        first_split = 2 if model_name == 'large' else 0
        pending = []
        inplanes = stem_channels
        for idx, (k, exp, c, se, nl, s) in enumerate(cfg):
            if s == 2 and idx > first_split:
                self.out_channels.append(inplanes)
                self.stages.append(nn.Sequential(*pending))
                pending = []
            unit_out = make_divisible(scale * c)
            pending.append(
                ResidualUnit(
                    in_channels=inplanes,
                    mid_channels=make_divisible(scale * exp),
                    out_channels=unit_out,
                    kernel_size=k,
                    stride=s,
                    use_se=se and not self.disable_se,
                    act=nl,
                    name="conv" + str(idx + 2)))
            inplanes = unit_out

        # The final 1x1 convolution closes the last stage.
        last_channels = make_divisible(scale * cls_ch_squeeze)
        pending.append(
            ConvBNLayer(
                in_channels=inplanes,
                out_channels=last_channels,
                kernel_size=1,
                stride=1,
                padding=0,
                groups=1,
                if_act=True,
                act='hardswish',
                name='conv_last'))
        self.stages.append(nn.Sequential(*pending))
        self.out_channels.append(last_channels)

    def forward(self, x):
        """Run the stem, then each stage in turn; return the list of stage outputs."""
        feat = self.conv(x)
        stage_outputs = []
        for stage in self.stages:
            feat = stage(feat)
            stage_outputs.append(feat)
        return stage_outputs
class ConvBNLayer(nn.Module):
    """Bias-free ``Conv2d`` followed by ``BatchNorm2d`` and an optional activation.

    Args:
        in_channels, out_channels, kernel_size, stride, padding, groups:
            forwarded to ``nn.Conv2d``.
        if_act: when False, ``forward`` returns the batch-norm output unchanged.
        act: ``"relu"`` or ``"hardswish"``; only consulted when ``if_act`` is True.
        name: accepted for signature compatibility; not used by this layer.

    Raises:
        ValueError: from ``forward`` when ``if_act`` is True and ``act`` is
            neither ``"relu"`` nor ``"hardswish"``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=False)
        self.bn = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        if not self.if_act:
            return x
        if self.act == "relu":
            return F.relu(x)
        if self.act == "hardswish":
            return hard_swish(x)
        # Raise instead of print + exit(): library code must not terminate
        # the host process, and the caller gets a traceable error.
        raise ValueError(
            "The activation function({}) is selected incorrectly.".format(
                self.act))
class ResidualUnit(nn.Module):
    """Expand (1x1) -> depthwise (k x k) -> optional SE -> linear (1x1) unit.

    The input is added back to the output only when ``stride == 1`` and the
    input and output channel counts match.
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 use_se,
                 act=None,
                 name=''):
        super(ResidualUnit, self).__init__()
        self.if_shortcut = (stride == 1) and (in_channels == out_channels)
        self.if_se = use_se

        # 1x1 expansion to the hidden width.
        self.expand_conv = ConvBNLayer(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=True,
            act=act,
            name=name + "_expand")

        # Depthwise convolution: one group per channel.
        same_padding = int((kernel_size - 1) // 2)
        self.bottleneck_conv = ConvBNLayer(
            in_channels=mid_channels,
            out_channels=mid_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=same_padding,
            groups=mid_channels,
            if_act=True,
            act=act,
            name=name + "_depthwise")

        if self.if_se:
            self.mid_se = SEModule(mid_channels, name=name + "_se")

        # 1x1 projection without activation.
        self.linear_conv = ConvBNLayer(
            in_channels=mid_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=False,
            act=None,
            name=name + "_linear")

    def forward(self, inputs):
        hidden = self.bottleneck_conv(self.expand_conv(inputs))
        if self.if_se:
            hidden = self.mid_se(hidden)
        projected = self.linear_conv(hidden)
        if not self.if_shortcut:
            return projected
        return torch.add(inputs, projected)
class SEModule(nn.Module):
    """Squeeze-and-excite gate: global average pool, two 1x1 convs, hard sigmoid.

    The hidden width is ``in_channels // reduction``. ``name`` is accepted
    but not used.
    """

    def __init__(self, in_channels, reduction=4, name=""):
        super(SEModule, self).__init__()
        squeezed = in_channels // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=squeezed,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)
        self.conv2 = nn.Conv2d(
            in_channels=squeezed,
            out_channels=in_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

    def forward(self, inputs):
        gate = self.conv1(self.avg_pool(inputs))
        gate = self.conv2(F.relu(gate))
        gate = hard_sigmoid(gate, slope=0.2, offset=0.5)
        # Per-channel gate is multiplied onto the input.
        return inputs * gate
\ No newline at end of file
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorchocr.modeling.common import Activation
__all__ = ["ResNet"]
class ConvBNLayer(nn.Module):
    """Optional 2x2 average pool, then bias-free Conv2d, BatchNorm2d, optional activation.

    Args:
        in_channels, out_channels, kernel_size, stride, groups: forwarded to
            ``nn.Conv2d``; padding is ``(kernel_size - 1) // 2``.
        is_vd_mode: when True, ``forward`` first applies a 2x2, stride-2
            average pool (``ceil_mode=True``) to the input.
        act: activation type passed to ``Activation``; ``None`` disables it.
        name: accepted for signature compatibility; not used by this layer.
    """

    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            groups=1,
            is_vd_mode=False,
            act=None,
            name=None, ):
        super(ConvBNLayer, self).__init__()
        self.is_vd_mode = is_vd_mode
        # Always constructed (it has no parameters); only used in vd mode.
        self._pool2d_avg = nn.AvgPool2d(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self._conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            bias=False)
        # The former ``bn_name`` derivation was removed: its result was never
        # used and ``name[3:]`` raised TypeError for the default ``name=None``.
        self._batch_norm = nn.BatchNorm2d(out_channels)
        self.act = act
        if self.act is not None:
            self._act = Activation(act, inplace=True)

    def forward(self, inputs):
        if self.is_vd_mode:
            inputs = self._pool2d_avg(inputs)
        y = self._conv(inputs)
        y = self._batch_norm(y)
        if self.act is not None:
            y = self._act(y)
        return y
class BottleneckBlock(nn.Module):
    """Bottleneck residual block: 1x1 -> 3x3 (strided) -> 1x1 with 4x output width.

    Args:
        in_channels: channels of the block input.
        out_channels: bottleneck width; the block outputs ``out_channels * 4``.
        stride: stride of the 3x3 convolution.
        shortcut: True to add the input unchanged; False to project it with a
            1x1 ``ConvBNLayer`` first.
        if_first: when True, the projection layer is built without vd mode.
        name: optional prefix for sub-layer names; ``None`` is treated as "".
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BottleneckBlock, self).__init__()
        # The declared default ``name=None`` used to crash on ``None + str``.
        prefix = "" if name is None else name

        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu',
            name=prefix + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=prefix + "_branch2b")
        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None,
            name=prefix + "_branch2c")

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels * 4,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True,
                name=prefix + "_branch1")
        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)
        conv2 = self.conv2(conv1)
        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = torch.add(short, conv2)
        y = F.relu(y)
        return y
class BasicBlock(nn.Module):
    """Basic residual block: two 3x3 ``ConvBNLayer``s plus a skip connection.

    Args:
        in_channels: channels of the block input.
        out_channels: channels of the block output.
        stride: stride of the first 3x3 convolution.
        shortcut: True to add the input unchanged; False to project it with a
            1x1 ``ConvBNLayer`` first.
        if_first: when True, the projection layer is built without vd mode.
        name: optional prefix for sub-layer names; ``None`` is treated as "".
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BasicBlock, self).__init__()
        # The declared default ``name=None`` used to crash on ``None + str``.
        prefix = "" if name is None else name
        self.stride = stride

        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=prefix + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None,
            name=prefix + "_branch2b")

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=1,
                is_vd_mode=False if if_first else True,
                name=prefix + "_branch1")
        self.shortcut = shortcut

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)
        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)
        y = torch.add(short, conv1)
        y = F.relu(y)
        return y
class ResNet(nn.Module):
    """ResNet backbone with a three-convolution stem; returns one feature map per stage.

    ``layers`` of 50 and above build stages from ``BottleneckBlock`` (4x
    channel expansion); 18 and 34 use ``BasicBlock``. ``self.out_channels``
    lists the output width of each stage.
    """

    def __init__(self, in_channels=3, layers=50, **kwargs):
        super(ResNet, self).__init__()
        self.layers = layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)

        # Blocks per stage for each supported depth.
        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers in (34, 50):
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]

        use_bottleneck = layers >= 50
        if use_bottleneck:
            num_channels = [64, 256, 512, 1024]
        else:
            num_channels = [64, 64, 128, 256]
        num_filters = [64, 128, 256, 512]

        # Stem: three 3x3 convolutions followed by a max pool.
        self.conv1_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=32,
            kernel_size=3,
            stride=2,
            act='relu',
            name="conv1_1")
        self.conv1_2 = ConvBNLayer(
            in_channels=32,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_2")
        self.conv1_3 = ConvBNLayer(
            in_channels=32,
            out_channels=64,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_3")
        self.pool2d_max = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.stages = nn.ModuleList()
        self.out_channels = []

        # Both block families share the same stage layout; only the block
        # class and the channel expansion factor differ.
        block_cls = BottleneckBlock if use_bottleneck else BasicBlock
        expansion = 4 if use_bottleneck else 1
        for stage_idx, num_blocks in enumerate(depth):
            stage = nn.Sequential()
            stage_width = num_filters[stage_idx] * expansion
            for i in range(num_blocks):
                if layers in [101, 152] and stage_idx == 2:
                    suffix = "a" if i == 0 else "b" + str(i)
                else:
                    suffix = chr(97 + i)
                conv_name = "res" + str(stage_idx + 2) + suffix
                unit = block_cls(
                    in_channels=num_channels[stage_idx] if i == 0 else stage_width,
                    out_channels=num_filters[stage_idx],
                    stride=2 if i == 0 and stage_idx != 0 else 1,
                    # Only the first block of a stage projects its skip path.
                    shortcut=i != 0,
                    if_first=stage_idx == i == 0,
                    name=conv_name)
                stage.add_module('bb_%d_%d' % (stage_idx, i), unit)
            self.out_channels.append(stage_width)
            self.stages.append(stage)

    def forward(self, inputs):
        """Run the stem and max pool, then every stage; return the list of stage outputs."""
        y = self.conv1_3(self.conv1_2(self.conv1_1(inputs)))
        y = self.pool2d_max(y)
        features = []
        for stage in self.stages:
            y = stage(y)
            features.append(y)
        return features
\ No newline at end of file
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn
class Hswish(nn.Module):
def __init__(self, inplace=True):
......@@ -10,7 +9,8 @@ class Hswish(nn.Module):
self.inplace = inplace
def forward(self, x):
return x * F.relu6(x + 3., inplace=self.inplace) / 6.
return x * F.relu6(x + 3.0, inplace=self.inplace) / 6.0
# out = max(0, min(1, slop*x+offset))
# paddle.fluid.layers.hard_sigmoid(x, slope=0.2, offset=0.5, name=None)
......@@ -22,7 +22,8 @@ class Hsigmoid(nn.Module):
def forward(self, x):
# torch: F.relu6(x + 3., inplace=self.inplace) / 6.
# paddle: F.relu6(1.2 * x + 3., inplace=self.inplace) / 6.
return F.relu6(1.2 * x + 3., inplace=self.inplace) / 6.
return F.relu6(1.2 * x + 3.0, inplace=self.inplace) / 6.0
class GELU(nn.Module):
def __init__(self, inplace=True):
......@@ -43,31 +44,33 @@ class Swish(nn.Module):
x.mul_(torch.sigmoid(x))
return x
else:
return x*torch.sigmoid(x)
return x * torch.sigmoid(x)
class Activation(nn.Module):
def __init__(self, act_type, inplace=True):
super(Activation, self).__init__()
act_type = act_type.lower()
if act_type == 'relu':
if act_type == "relu":
self.act = nn.ReLU(inplace=inplace)
elif act_type == 'relu6':
elif act_type == "relu6":
self.act = nn.ReLU6(inplace=inplace)
elif act_type == 'sigmoid':
elif act_type == "sigmoid":
raise NotImplementedError
elif act_type == 'hard_sigmoid':
self.act = Hsigmoid(inplace)#nn.Hardsigmoid(inplace=inplace)#Hsigmoid(inplace)#
elif act_type == 'hard_swish' or act_type == 'hswish':
elif act_type == "hard_sigmoid":
self.act = Hsigmoid(
inplace
) # nn.Hardsigmoid(inplace=inplace)#Hsigmoid(inplace)#
elif act_type == "hard_swish" or act_type == "hswish":
self.act = Hswish(inplace=inplace)
elif act_type == 'leakyrelu':
elif act_type == "leakyrelu":
self.act = nn.LeakyReLU(inplace=inplace)
elif act_type == 'gelu':
elif act_type == "gelu":
self.act = GELU(inplace=inplace)
elif act_type == 'swish':
elif act_type == "swish":
self.act = Swish(inplace=inplace)
else:
raise NotImplementedError
def forward(self, inputs):
return self.act(inputs)
\ No newline at end of file
return self.act(inputs)
......@@ -12,40 +12,32 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['build_head']
__all__ = ["build_head"]
def build_head(config, **kwargs):
# det head
from .det_db_head import DBHead, PFHeadLocal
from .det_east_head import EASTHead
from .det_sast_head import SASTHead
from .det_pse_head import PSEHead
from .det_fce_head import FCEHead
from .e2e_pg_head import PGHead
# rec head
from .rec_ctc_head import CTCHead
from .rec_att_head import AttentionHead
from .rec_srn_head import SRNHead
from .rec_nrtr_head import Transformer
from .rec_sar_head import SARHead
from .rec_can_head import CANHead
from .rec_multi_head import MultiHead
# cls head
from .cls_head import ClsHead
support_dict = [
'DBHead', 'PSEHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead', 'AttentionHead',
'SRNHead', 'PGHead', 'Transformer', 'TableAttentionHead','SARHead', 'FCEHead',
'CANHead', 'MultiHead', 'PFHeadLocal',
support_dict = [
"DBHead",
"CTCHead",
"ClsHead",
"MultiHead",
"PFHeadLocal",
]
from .table_att_head import TableAttentionHead
module_name = config.pop('name')
assert module_name in support_dict, Exception('head only support {}'.format(
support_dict))
module_name = config.pop("name")
char_num = config.pop("char_num", 6625)
assert module_name in support_dict, Exception(
"head only support {}".format(support_dict)
)
module_class = eval(module_name)(**config, **kwargs)
return module_class
\ No newline at end of file
return module_class
import os, sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn
class ClsHead(nn.Module):
"""
......@@ -12,17 +12,12 @@ class ClsHead(nn.Module):
def __init__(self, in_channels, class_dim, **kwargs):
super(ClsHead, self).__init__()
self.training = False
self.pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Linear(
in_channels,
class_dim,
bias=True)
self.fc = nn.Linear(in_channels, class_dim, bias=True)
def forward(self, x):
x = self.pool(x)
x = torch.reshape(x, shape=[x.shape[0], x.shape[1]])
x = self.fc(x)
if not self.training:
x = F.softmax(x, dim=1)
return x
\ No newline at end of file
x = F.softmax(x, dim=1)
return x
import os, sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorchocr.modeling.common import Activation
from pytorchocr.modeling.backbones.det_mobilenet_v3 import ConvBNLayer
from ..common import Activation
from ..backbones.det_mobilenet_v3 import ConvBNLayer
class Head(nn.Module):
def __init__(self, in_channels, **kwargs):
......@@ -76,13 +75,8 @@ class DBHead(nn.Module):
def forward(self, x):
shrink_maps = self.binarize(x)
if not self.training:
return {'maps': shrink_maps}
return {'maps': shrink_maps}
threshold_maps = self.thresh(x)
binary_maps = self.step_function(shrink_maps, threshold_maps)
y = torch.cat([shrink_maps, threshold_maps, binary_maps], dim=1)
return {'maps': y}
class LocalModule(nn.Module):
def __init__(self, in_c, mid_c, use_distance=True):
......@@ -101,7 +95,7 @@ class PFHeadLocal(DBHead):
super(PFHeadLocal, self).__init__(in_channels, k, **kwargs)
self.mode = mode
self.up_conv = nn.interpolate(scale_factor=2, mode="nearest")
self.up_conv = nn.Upsample(scale_factor=2, mode="nearest")
if self.mode == 'large':
self.cbn_layer = LocalModule(in_channels // 4, in_channels // 4)
elif self.mode == 'small':
......@@ -112,10 +106,4 @@ class PFHeadLocal(DBHead):
base_maps = shrink_maps
cbn_maps = self.cbn_layer(self.up_conv(f), shrink_maps, None)
cbn_maps = F.sigmoid(cbn_maps)
if not self.training:
return {'maps': 0.5 * (base_maps + cbn_maps), 'cbn_maps': cbn_maps}
threshold_maps = self.thresh(x)
binary_maps = self.step_function(shrink_maps, threshold_maps)
y = torch.cat([cbn_maps, threshold_maps, binary_maps], dim=1)
return {'maps': y, 'distance_maps': cbn_maps, 'cbn_maps': binary_maps}
\ No newline at end of file
return {'maps': 0.5 * (base_maps + cbn_maps), 'cbn_maps': cbn_maps}
\ No newline at end of file
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os, sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorchocr.modeling.common import Activation
# import paddle
# from paddle import nn
# import paddle.nn.functional as F
# from paddle import ParamAttr
class ConvBNLayer(nn.Module):
    """Bias-free Conv2d followed by BatchNorm2d and an optional ``Activation``.

    The activation is created and applied only when ``act`` is not ``None``.
    ``if_act`` is stored on the instance but not consulted by ``forward``;
    ``name`` is accepted but not used.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        conv_kwargs = dict(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=False)
        self.conv = nn.Conv2d(**conv_kwargs)
        self.bn = nn.BatchNorm2d(out_channels)
        if act is not None:
            self._act = Activation(act)

    def forward(self, x):
        normed = self.bn(self.conv(x))
        if self.act is None:
            return normed
        return self._act(normed)
class EASTHead(nn.Module):
    """EAST detection head producing a score map and a geometry map.

    Two 3x3 ``ConvBNLayer``s feed two 1x1 branches: a 1-channel score branch
    and an 8-channel geometry branch. ``model_name == "large"`` selects the
    wider hidden widths (128, 64); any other value selects (64, 32).
    """

    def __init__(self, in_channels, model_name, **kwargs):
        super(EASTHead, self).__init__()
        self.model_name = model_name
        if self.model_name == "large":
            hidden1, hidden2, score_ch, geo_ch = 128, 64, 1, 8
        else:
            hidden1, hidden2, score_ch, geo_ch = 64, 32, 1, 8

        self.det_conv1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=hidden1,
            kernel_size=3,
            stride=1,
            padding=1,
            if_act=True,
            act='relu',
            name="det_head1")
        self.det_conv2 = ConvBNLayer(
            in_channels=hidden1,
            out_channels=hidden2,
            kernel_size=3,
            stride=1,
            padding=1,
            if_act=True,
            act='relu',
            name="det_head2")
        self.score_conv = ConvBNLayer(
            in_channels=hidden2,
            out_channels=score_ch,
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=False,
            act=None,
            name="f_score")
        self.geo_conv = ConvBNLayer(
            in_channels=hidden2,
            out_channels=geo_ch,
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=False,
            act=None,
            name="f_geo")

    def forward(self, x):
        shared = self.det_conv2(self.det_conv1(x))
        f_score = torch.sigmoid(self.score_conv(shared))
        f_geo = self.geo_conv(shared)
        # Map the sigmoid output from (0, 1) to (-800, 800).
        f_geo = (torch.sigmoid(f_geo) - 0.5) * 2 * 800
        return {'f_score': f_score, 'f_geo': f_geo}
\ No newline at end of file
"""
This code refers to:
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/dense_heads/fce_head.py
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
# from paddle import nn
# from paddle import ParamAttr
# import paddle.nn.functional as F
# from paddle.nn.initializer import Normal
# import paddle
from functools import partial
def multi_apply(func, *args, **kwargs):
    """Map ``func`` over the zipped ``args`` and transpose the results.

    ``kwargs`` are bound to ``func`` for every call. Each call must return a
    sequence; the return value is a tuple with one list per position of that
    sequence.
    """
    bound = func if not kwargs else partial(func, **kwargs)
    per_call = map(bound, *args)
    columns = zip(*per_call)
    return tuple(list(column) for column in columns)
class FCEHead(nn.Module):
    """The class for implementing FCENet head.

    FCENet(CVPR2021): Fourier Contour Embedding for Arbitrary-shaped Text
    Detection.
    [https://arxiv.org/abs/2104.10442]

    Args:
        in_channels (int): The number of input channels.
        fourier_degree (int) : The maximum Fourier transform degree k.
    """

    def __init__(self, in_channels, fourier_degree=5):
        super().__init__()
        assert isinstance(in_channels, int)

        self.downsample_ratio = 1.0
        self.in_channels = in_channels
        self.fourier_degree = fourier_degree
        self.out_channels_cls = 4
        self.out_channels_reg = (2 * self.fourier_degree + 1) * 2

        # Both output convolutions share everything but the channel count.
        shared = dict(kernel_size=3, stride=1, padding=1, groups=1, bias=True)
        self.out_conv_cls = nn.Conv2d(
            in_channels=self.in_channels,
            out_channels=self.out_channels_cls,
            **shared)
        self.out_conv_reg = nn.Conv2d(
            in_channels=self.in_channels,
            out_channels=self.out_channels_reg,
            **shared)

    def forward(self, feats, targets=None):
        """Apply the head to every item of ``feats``.

        In eval mode the result maps ``'level_<i>'`` to the concatenation of
        two channel-wise softmaxes (channels 0-1 and 2-3 of the classification
        output) and the regression output. In training mode it holds the raw
        ``[cls, reg]`` pairs under ``'levels'``. ``targets`` is not used.
        """
        per_level = [self.forward_single(feat) for feat in feats]
        cls_res, reg_res = (list(column) for column in zip(*per_level))

        outs = {}
        if self.training:
            outs['levels'] = [[cls, reg] for cls, reg in zip(cls_res, reg_res)]
            return outs

        for i, (cls, reg) in enumerate(zip(cls_res, reg_res)):
            tr_pred = F.softmax(cls[:, 0:2, :, :], dim=1)
            tcl_pred = F.softmax(cls[:, 2:, :, :], dim=1)
            outs['level_{}'.format(i)] = torch.cat(
                [tr_pred, tcl_pred, reg], dim=1)
        return outs

    def forward_single(self, x):
        """Return the (classification, regression) outputs for one feature map."""
        return self.out_conv_cls(x), self.out_conv_reg(x)
"""
This code refers to:
https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py
"""
# from paddle import nn
from torch import nn
class PSEHead(nn.Module):
    """PSE head: 3x3 conv -> batch norm -> ReLU -> 1x1 conv.

    ``forward`` returns the result under the ``'maps'`` key; extra keyword
    arguments to ``__init__`` and ``forward`` are accepted and ignored.
    """

    def __init__(self, in_channels, hidden_dim=256, out_channels=7, **kwargs):
        super(PSEHead, self).__init__()
        self.conv1 = nn.Conv2d(
            in_channels, hidden_dim, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(hidden_dim)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(
            hidden_dim, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x, **kwargs):
        hidden = self.relu1(self.bn1(self.conv1(x)))
        return {'maps': self.conv2(hidden)}
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os, sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorchocr.modeling.common import Activation
# import paddle
# from paddle import nn
# import paddle.nn.functional as F
# from paddle import ParamAttr
class ConvBNLayer(nn.Module):
    """Bias-free Conv2d with ``(kernel_size - 1) // 2`` padding, BatchNorm2d, optional ``Activation``.

    The activation is created and applied only when ``act`` is not ``None``.
    ``if_act`` is stored on the instance but not consulted by ``forward``;
    ``name`` is accepted but not used.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        same_padding = (kernel_size - 1) // 2
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=same_padding,
            groups=groups,
            bias=False)
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = act
        if act is not None:
            self._act = Activation(act)

    def forward(self, x):
        normed = self.bn(self.conv(x))
        if self.act is None:
            return normed
        return self._act(normed)
class SAST_Header1(nn.Module):
    """First SAST head: a 1-channel score branch (sigmoid) and a 4-channel border branch.

    Each branch is a stack of four ``ConvBNLayer``s with kernel sizes
    1, 3, 1, 3; the last layer has no activation.
    """

    def __init__(self, in_channels, **kwargs):
        super(SAST_Header1, self).__init__()
        widths = [64, 64, 128]

        def build_branch(tag, final_channels):
            # Same four-layer layout for both branches; only the layer-name
            # prefix and the final channel count differ.
            return nn.Sequential(
                ConvBNLayer(in_channels, widths[0], 1, 1, act='relu', name=tag + '1'),
                ConvBNLayer(widths[0], widths[1], 3, 1, act='relu', name=tag + '2'),
                ConvBNLayer(widths[1], widths[2], 1, 1, act='relu', name=tag + '3'),
                ConvBNLayer(widths[2], final_channels, 3, 1, act=None, name=tag + '4'),
            )

        self.score_conv = build_branch('f_score', 1)
        self.border_conv = build_branch('f_border', 4)

    def forward(self, x):
        f_score = torch.sigmoid(self.score_conv(x))
        f_border = self.border_conv(x)
        return f_score, f_border
class SAST_Header2(nn.Module):
    """Second SAST head: an 8-channel ``tvo`` branch and a 2-channel ``tco`` branch.

    Each branch is a stack of four ``ConvBNLayer``s with kernel sizes
    1, 3, 1, 3; the last layer has no activation.
    """

    def __init__(self, in_channels, **kwargs):
        super(SAST_Header2, self).__init__()
        widths = [64, 64, 128]

        def build_branch(tag, final_channels):
            # Same four-layer layout for both branches; only the layer-name
            # prefix and the final channel count differ.
            return nn.Sequential(
                ConvBNLayer(in_channels, widths[0], 1, 1, act='relu', name=tag + '1'),
                ConvBNLayer(widths[0], widths[1], 3, 1, act='relu', name=tag + '2'),
                ConvBNLayer(widths[1], widths[2], 1, 1, act='relu', name=tag + '3'),
                ConvBNLayer(widths[2], final_channels, 3, 1, act=None, name=tag + '4'),
            )

        self.tvo_conv = build_branch('f_tvo', 8)
        self.tco_conv = build_branch('f_tco', 2)

    def forward(self, x):
        return self.tvo_conv(x), self.tco_conv(x)
class SASTHead(nn.Module):
    """Combine ``SAST_Header1`` and ``SAST_Header2`` outputs into one dict.

    ``forward`` returns the keys ``f_score``, ``f_border``, ``f_tvo`` and
    ``f_tco``.
    """

    def __init__(self, in_channels, **kwargs):
        super(SASTHead, self).__init__()
        self.head1 = SAST_Header1(in_channels)
        self.head2 = SAST_Header2(in_channels)

    def forward(self, x):
        f_score, f_border = self.head1(x)
        f_tvo, f_tco = self.head2(x)
        return {
            'f_score': f_score,
            'f_border': f_border,
            'f_tvo': f_tvo,
            'f_tco': f_tco,
        }
\ No newline at end of file
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorchocr.modeling.common import Activation
class ConvBNLayer(nn.Module):
    """Bias-free Conv2d followed by BatchNorm2d and an optional in-place ``Activation``.

    The activation is created and applied only when ``act`` is not ``None``.
    ``if_act`` is stored on the instance but not consulted by ``forward``;
    ``name`` is accepted but not used.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        conv_kwargs = dict(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=False)
        self.conv = nn.Conv2d(**conv_kwargs)
        self.bn = nn.BatchNorm2d(out_channels)
        if self.act is not None:
            self._act = Activation(act_type=self.act, inplace=True)

    def forward(self, x):
        normed = self.bn(self.conv(x))
        if self.act is None:
            return normed
        return self._act(normed)
class PGHead(nn.Module):
    """Head with four parallel conv branches over one shared feature map.

    Each branch is a stack of ``ConvBNLayer`` blocks (ReLU activation)
    finished by a bias-free 3x3 ``Conv2d``:

    * score: 3 blocks, then 1 output channel, passed through a sigmoid
    * border: 3 blocks, then 4 output channels
    * char: 5 blocks, then 37 output channels
    * direction: 3 blocks, then 2 output channels

    Args:
        in_channels: channel count of the input feature map.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, **kwargs):
        super().__init__()

        def conv_bn(prefix, index, cin, cout, ksize):
            # Every block uses stride 1; 1x1 kernels get padding 0 and
            # 3x3 kernels get padding 1, i.e. ``ksize // 2``.
            return ConvBNLayer(
                in_channels=cin,
                out_channels=cout,
                kernel_size=ksize,
                stride=1,
                padding=ksize // 2,
                act='relu',
                name='{}{}'.format(prefix, index))

        def predictor(cin, cout):
            # Final 3x3 projection of a branch, without bias.
            return nn.Conv2d(
                in_channels=cin,
                out_channels=cout,
                kernel_size=3,
                stride=1,
                padding=1,
                groups=1,
                bias=False)

        # Attribute names and creation order are kept as-is so parameter
        # keys and initialisation order do not change.

        # score branch
        self.conv_f_score1 = conv_bn('conv_f_score', 1, in_channels, 64, 1)
        self.conv_f_score2 = conv_bn('conv_f_score', 2, 64, 64, 3)
        self.conv_f_score3 = conv_bn('conv_f_score', 3, 64, 128, 1)
        self.conv1 = predictor(128, 1)

        # border branch
        self.conv_f_boder1 = conv_bn('conv_f_boder', 1, in_channels, 64, 1)
        self.conv_f_boder2 = conv_bn('conv_f_boder', 2, 64, 64, 3)
        self.conv_f_boder3 = conv_bn('conv_f_boder', 3, 64, 128, 1)
        self.conv2 = predictor(128, 4)

        # character branch
        self.conv_f_char1 = conv_bn('conv_f_char', 1, in_channels, 128, 1)
        self.conv_f_char2 = conv_bn('conv_f_char', 2, 128, 128, 3)
        self.conv_f_char3 = conv_bn('conv_f_char', 3, 128, 256, 1)
        self.conv_f_char4 = conv_bn('conv_f_char', 4, 256, 256, 3)
        self.conv_f_char5 = conv_bn('conv_f_char', 5, 256, 256, 1)
        self.conv3 = predictor(256, 37)

        # direction branch
        self.conv_f_direc1 = conv_bn('conv_f_direc', 1, in_channels, 64, 1)
        self.conv_f_direc2 = conv_bn('conv_f_direc', 2, 64, 64, 3)
        self.conv_f_direc3 = conv_bn('conv_f_direc', 3, 64, 128, 1)
        self.conv4 = predictor(128, 2)

    def forward(self, x):
        """Run the four branches on ``x``.

        Returns:
            dict with keys ``f_score`` (sigmoid applied), ``f_border``,
            ``f_char`` and ``f_direction``.
        """
        score = self.conv_f_score3(self.conv_f_score2(self.conv_f_score1(x)))
        score = torch.sigmoid(self.conv1(score))

        border = self.conv_f_boder3(self.conv_f_boder2(self.conv_f_boder1(x)))
        border = self.conv2(border)

        char = self.conv_f_char1(x)
        for block in (self.conv_f_char2, self.conv_f_char3,
                      self.conv_f_char4, self.conv_f_char5):
            char = block(char)
        char = self.conv3(char)

        direction = self.conv_f_direc3(
            self.conv_f_direc2(self.conv_f_direc1(x)))
        direction = self.conv4(direction)

        return {
            'f_score': score,
            'f_border': border,
            'f_char': char,
            'f_direction': direction,
        }
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Linear
from torch.nn.init import xavier_uniform_
class MultiheadAttention(nn.Module):
    r"""Multi-head attention whose q/k/v input projections are 1x1 convolutions.

    Allows the model to jointly attend to information from different
    representation subspaces.
    See reference: Attention Is All You Need

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
        \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)

    Args:
        embed_dim: total dimension of the model
        num_heads: parallel attention layers, or heads
        dropout: drop probability applied to the attention weights
        bias: whether the output projection has a bias term
        add_bias_kv: accepted for signature compatibility; not used
        add_zero_attn: accepted for signature compatibility; not used
    """

    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 bias=True,
                 add_bias_kv=False,
                 add_zero_attn=False):
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5
        self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
        # Kept before the conv layers are created so the order in which
        # parameters are initialised stays unchanged.
        self._reset_parameters()
        # conv1 / conv2 / conv3 project query / key / value respectively.
        self.conv1 = torch.nn.Conv2d(
            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
        self.conv2 = torch.nn.Conv2d(
            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
        self.conv3 = torch.nn.Conv2d(
            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))

    def _reset_parameters(self):
        """Xavier-initialise the output projection weight."""
        xavier_uniform_(self.out_proj.weight)

    def forward(self,
                query,
                key,
                value,
                key_padding_mask=None,
                incremental_state=None,
                attn_mask=None):
        """Compute scaled dot-product attention over all heads.

        Args:
            query: [target length, batch size, embed dim]
            key: [sequence length, batch size, embed dim]
            value: [sequence length, batch size, embed dim]
            key_padding_mask: optional [batch size, sequence length] mask;
                key positions whose entry is non-zero are excluded from
                attention. A query whose keys are all excluded yields NaN,
                because softmax is taken over a row of ``-inf``.
            incremental_state: accepted for signature compatibility; not used.
            attn_mask: optional additive mask; two leading singleton
                dimensions are prepended before it is added to the attention
                scores.

        Returns:
            attn_output: [target length, batch size, embed dim]
        """
        tgt_len, batch_size = query.shape[0], query.shape[1]
        src_len = key.shape[0]

        q = self._split_heads(
            self._in_proj_q(query) * self.scaling, tgt_len, batch_size)
        k = self._split_heads(self._in_proj_k(key), src_len, batch_size)
        v = self._split_heads(self._in_proj_v(value), src_len, batch_size)

        if key_padding_mask is not None:
            assert key_padding_mask.shape[0] == batch_size
            assert key_padding_mask.shape[1] == src_len

        # (batch, heads, tgt_len, src_len)
        attn_output_weights = torch.matmul(q, k.permute(0, 1, 3, 2))

        if attn_mask is not None:
            attn_mask = torch.unsqueeze(torch.unsqueeze(attn_mask, 0), 0)
            attn_output_weights += attn_mask.to(attn_output_weights.device)

        if key_padding_mask is not None:
            # Build the additive mask on the same device as the scores; a
            # tensor created on the default device fails as soon as the
            # model runs on an accelerator.
            pad = key_padding_mask.to(
                device=attn_output_weights.device, dtype=torch.float32)
            pad = pad[:, None, None, :]
            neg_inf = torch.full_like(pad, float("-Inf"))
            # 0 where the key is kept, -inf where it is padding.
            attn_output_weights += torch.where(pad == 0., pad, neg_inf)

        attn_output_weights = F.softmax(
            attn_output_weights.type(torch.float32),
            dim=-1,
            dtype=torch.float32 if attn_output_weights.dtype == torch.float16
            else attn_output_weights.dtype)
        attn_output_weights = F.dropout(
            attn_output_weights, p=self.dropout, training=self.training)

        attn_output = torch.matmul(attn_output_weights, v)
        # (batch, heads, tgt_len, head_dim) -> (tgt_len, batch, embed_dim)
        attn_output = torch.reshape(
            attn_output.permute(2, 0, 1, 3),
            [tgt_len, batch_size, self.embed_dim])
        return self.out_proj(attn_output)

    def _split_heads(self, x, length, batch_size):
        """Reshape (length, batch, embed) into (batch, heads, length, head_dim)."""
        x = torch.reshape(
            x, (length, batch_size, self.num_heads, self.head_dim))
        return x.permute(1, 2, 0, 3)

    @staticmethod
    def _project(conv, x):
        """Apply a 1x1 ``conv`` to a (length, batch, embed) tensor."""
        # (length, batch, embed) -> (batch, embed, 1, length) for Conv2d.
        x = torch.unsqueeze(x.permute(1, 2, 0), dim=2)
        res = torch.squeeze(conv(x), dim=2)
        # Back to (length, batch, embed).
        return res.permute(2, 0, 1)

    def _in_proj_q(self, query):
        """Project the query with ``conv1``."""
        return self._project(self.conv1, query)

    def _in_proj_k(self, key):
        """Project the key with ``conv2``."""
        return self._project(self.conv2, key)

    def _in_proj_v(self, value):
        """Project the value with ``conv3``."""
        return self._project(self.conv3, value)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorchocr.modeling.common import Activation
class AttentionHead(nn.Module):
    """Attention decoder head built on ``AttentionGRUCell``.

    Args:
        in_channels: feature size of the encoder output.
        out_channels: number of character classes.
        hidden_size: size of the recurrent hidden state.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
        super(AttentionHead, self).__init__()
        self.input_size = in_channels
        self.hidden_size = hidden_size
        self.num_classes = out_channels
        self.attention_cell = AttentionGRUCell(
            in_channels, hidden_size, out_channels, use_gru=False)
        self.generator = nn.Linear(hidden_size, out_channels)

    def _char_to_onehot(self, input_char, onehot_dim):
        """Return the one-hot encoding of integer class ids."""
        return F.one_hot(input_char.type(torch.int64), onehot_dim)

    def forward(self, inputs, targets=None, batch_max_length=25):
        """Decode ``batch_max_length`` steps.

        Args:
            inputs: encoder features; the first dimension is the batch.
            targets: optional class ids indexed as ``targets[:, i]``. When
                given, step ``i`` is fed ``targets[:, i]`` (teacher forcing);
                otherwise decoding is greedy and starts from class 0.
            batch_max_length: number of decoding steps.

        Returns:
            Generator outputs for every step, concatenated along dim 1.
            ``None`` when decoding greedily for zero steps.
        """
        batch_size = inputs.size()[0]
        num_steps = batch_max_length
        # Allocate the state next to the features (same device and dtype);
        # a default-device tensor breaks when the model is not on CPU.
        hidden = inputs.new_zeros((batch_size, self.hidden_size))

        if targets is not None:
            step_outputs = []
            for i in range(num_steps):
                char_onehots = self._char_to_onehot(
                    targets[:, i], onehot_dim=self.num_classes)
                (outputs, hidden), _ = self.attention_cell(hidden, inputs,
                                                           char_onehots)
                step_outputs.append(torch.unsqueeze(outputs, dim=1))
            return self.generator(torch.cat(step_outputs, dim=1))

        targets = torch.zeros(
            [batch_size], dtype=torch.int32, device=inputs.device)
        step_probs = []
        for _ in range(num_steps):
            char_onehots = self._char_to_onehot(
                targets, onehot_dim=self.num_classes)
            (outputs, hidden), _ = self.attention_cell(hidden, inputs,
                                                       char_onehots)
            probs_step = self.generator(outputs)
            step_probs.append(torch.unsqueeze(probs_step, dim=1))
            # Greedy decoding: the best class becomes the next input.
            targets = probs_step.argmax(dim=1)
        if not step_probs:
            return None
        # Concatenate once instead of re-copying the result every step.
        return torch.cat(step_probs, dim=1)
class AttentionGRUCell(nn.Module):
    """One additive-attention step followed by a ``GRUCell`` update.

    Args:
        input_size: feature size of the encoder output.
        hidden_size: size of the recurrent hidden state.
        num_embeddings: width of the one-hot character vector that is
            concatenated to the attention context.
        use_gru: accepted but not used; the cell is always a ``GRUCell``.
    """

    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
        super().__init__()
        self.i2h = nn.Linear(input_size, hidden_size, bias=False)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.score = nn.Linear(hidden_size, 1, bias=False)
        self.rnn = nn.GRUCell(
            input_size + num_embeddings, hidden_size, bias=True)
        self.hidden_size = hidden_size

    def forward(self, prev_hidden, batch_H, char_onehots):
        """Attend over ``batch_H`` and advance the hidden state.

        Args:
            prev_hidden: previous hidden state.
            batch_H: 3-D encoder features; attention is normalised over dim 1.
            char_onehots: one-hot vector of the previous character.

        Returns:
            ``((hidden, hidden), alpha)``: the new hidden state twice, plus
            the attention weights with dims 1 and 2 swapped.
        """
        feat_proj = self.i2h(batch_H)
        state_proj = self.h2h(prev_hidden).unsqueeze(1)
        energy = self.score(torch.tanh(feat_proj + state_proj))
        alpha = F.softmax(energy, dim=1).permute(0, 2, 1)
        context = torch.matmul(alpha, batch_H).squeeze(1)
        rnn_input = torch.cat([context, char_onehots.float()], 1)
        new_hidden = self.rnn(rnn_input, prev_hidden)
        return (new_hidden, new_hidden), alpha
class AttentionLSTM(nn.Module):
    """Attention decoder head built on ``AttentionLSTMCell``.

    Args:
        in_channels: feature size of the encoder output.
        out_channels: number of character classes.
        hidden_size: size of the recurrent hidden and cell states.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
        super(AttentionLSTM, self).__init__()
        self.input_size = in_channels
        self.hidden_size = hidden_size
        self.num_classes = out_channels
        self.attention_cell = AttentionLSTMCell(
            in_channels, hidden_size, out_channels, use_gru=False)
        self.generator = nn.Linear(hidden_size, out_channels)

    def _char_to_onehot(self, input_char, onehot_dim):
        """Return the one-hot encoding of integer class ids."""
        return F.one_hot(input_char.type(torch.int64), onehot_dim)

    def forward(self, inputs, targets=None, batch_max_length=25):
        """Decode ``batch_max_length`` steps.

        Args:
            inputs: encoder features; the first dimension is the batch.
            targets: optional class ids indexed as ``targets[:, i]``. When
                given, step ``i`` is fed ``targets[:, i]`` (teacher forcing);
                otherwise decoding is greedy and starts from class 0.
            batch_max_length: number of decoding steps.

        Returns:
            Generator outputs for every step, concatenated along dim 1.
            ``None`` when decoding greedily for zero steps.
        """
        batch_size = inputs.shape[0]
        num_steps = batch_max_length
        # (h, c) state, allocated next to the features (same device/dtype).
        hidden = (inputs.new_zeros((batch_size, self.hidden_size)),
                  inputs.new_zeros((batch_size, self.hidden_size)))

        # The attention cell hands back the ``nn.LSTMCell`` result, which is
        # the pair ``(h, c)``. It is carried over unchanged: re-indexing it
        # as ``hidden[1][0]`` / ``hidden[1][1]`` would select batch rows of
        # ``c`` rather than the two state tensors.
        if targets is not None:
            step_outputs = []
            for i in range(num_steps):
                # one-hot vectors for the i-th char
                char_onehots = self._char_to_onehot(
                    targets[:, i], onehot_dim=self.num_classes)
                hidden, _ = self.attention_cell(hidden, inputs, char_onehots)
                step_outputs.append(torch.unsqueeze(hidden[0], dim=1))
            return self.generator(torch.cat(step_outputs, dim=1))

        targets = torch.zeros(
            [batch_size], dtype=torch.int32, device=inputs.device)
        step_probs = []
        for _ in range(num_steps):
            char_onehots = self._char_to_onehot(
                targets, onehot_dim=self.num_classes)
            hidden, _ = self.attention_cell(hidden, inputs, char_onehots)
            probs_step = self.generator(hidden[0])
            step_probs.append(torch.unsqueeze(probs_step, dim=1))
            # Greedy decoding: the best class becomes the next input.
            targets = probs_step.argmax(dim=1)
        if not step_probs:
            return None
        # Concatenate once instead of re-copying the result every step.
        return torch.cat(step_probs, dim=1)
class AttentionLSTMCell(nn.Module):
    """One additive-attention step followed by a recurrent cell update.

    Args:
        input_size: feature size of the encoder output.
        hidden_size: size of the recurrent state.
        num_embeddings: width of the one-hot character vector that is
            concatenated to the attention context.
        use_gru: build an ``nn.GRUCell`` when true, otherwise an
            ``nn.LSTMCell``.
    """

    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
        super().__init__()
        self.i2h = nn.Linear(input_size, hidden_size, bias=False)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.score = nn.Linear(hidden_size, 1, bias=False)
        cell_cls = nn.GRUCell if use_gru else nn.LSTMCell
        self.rnn = cell_cls(
            input_size=input_size + num_embeddings, hidden_size=hidden_size)
        self.hidden_size = hidden_size

    def forward(self, prev_hidden, batch_H, char_onehots):
        """Attend over ``batch_H`` and advance the recurrent state.

        Args:
            prev_hidden: previous state; ``prev_hidden[0]`` drives the
                attention query and the whole object is passed to the cell.
            batch_H: 3-D encoder features; attention is normalised over dim 1.
            char_onehots: one-hot vector of the previous character.

        Returns:
            ``(cur_hidden, alpha)``: whatever the recurrent cell returns,
            plus the attention weights with dims 1 and 2 swapped.
        """
        feat_proj = self.i2h(batch_H)
        state_proj = self.h2h(prev_hidden[0]).unsqueeze(1)
        energy = self.score(torch.tanh(feat_proj + state_proj))
        alpha = F.softmax(energy, dim=1).permute(0, 2, 1)
        context = torch.matmul(alpha, batch_H).squeeze(1)
        rnn_input = torch.cat([context, char_onehots.float()], 1)
        return self.rnn(rnn_input, prev_hidden), alpha
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment