Commit a7a899f6 authored by myhloli's avatar myhloli
Browse files

feat(model): add OCR model base structure and utilities

- Add base model structure for OCR in pytorch
- Implement data augmentation and transformation modules
- Create utilities for dictionary handling and state dict conversion
- Include post-processing modules for OCR
- Add weight initialization and loading functions
parent 72e66c2d
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os, sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorchocr.modeling.common import Activation
# import paddle
# from paddle import nn
# import paddle.nn.functional as F
# from paddle import ParamAttr
class ConvBNLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
groups=1,
if_act=True,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self.if_act = if_act
self.act = act
# self.conv = nn.Conv2D(
# in_channels=in_channels,
# out_channels=out_channels,
# kernel_size=kernel_size,
# stride=stride,
# padding=(kernel_size - 1) // 2,
# groups=groups,
# weight_attr=ParamAttr(name=name + '_weights'),
# bias_attr=False)
self.conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=(kernel_size - 1) // 2,
groups=groups,
bias=False)
self.bn = nn.BatchNorm2d(
out_channels,)
self.act = act
if act is not None:
self._act = Activation(act)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
if self.act is not None:
x = self._act(x)
return x
class DeConvBNLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
groups=1,
if_act=True,
act=None,
name=None):
super(DeConvBNLayer, self).__init__()
self.if_act = if_act
self.act = act
self.deconv = nn.ConvTranspose2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=(kernel_size - 1) // 2,
groups=groups,
bias=False)
self.bn = nn.BatchNorm2d(
out_channels,
)
self.act = act
if act is not None:
self._act = Activation(act)
def forward(self, x):
x = self.deconv(x)
x = self.bn(x)
if self.act is not None:
x = self._act(x)
return x
class FPN_Up_Fusion(nn.Module):
def __init__(self, in_channels):
super(FPN_Up_Fusion, self).__init__()
in_channels = in_channels[::-1]
out_channels = [256, 256, 192, 192, 128]
self.h0_conv = ConvBNLayer(in_channels[0], out_channels[0], 1, 1, act=None, name='fpn_up_h0')
self.h1_conv = ConvBNLayer(in_channels[1], out_channels[1], 1, 1, act=None, name='fpn_up_h1')
self.h2_conv = ConvBNLayer(in_channels[2], out_channels[2], 1, 1, act=None, name='fpn_up_h2')
self.h3_conv = ConvBNLayer(in_channels[3], out_channels[3], 1, 1, act=None, name='fpn_up_h3')
self.h4_conv = ConvBNLayer(in_channels[4], out_channels[4], 1, 1, act=None, name='fpn_up_h4')
self.g0_conv = DeConvBNLayer(out_channels[0], out_channels[1], 4, 2, act=None, name='fpn_up_g0')
self.g1_conv = nn.Sequential(
ConvBNLayer(out_channels[1], out_channels[1], 3, 1, act='relu', name='fpn_up_g1_1'),
DeConvBNLayer(out_channels[1], out_channels[2], 4, 2, act=None, name='fpn_up_g1_2')
)
self.g2_conv = nn.Sequential(
ConvBNLayer(out_channels[2], out_channels[2], 3, 1, act='relu', name='fpn_up_g2_1'),
DeConvBNLayer(out_channels[2], out_channels[3], 4, 2, act=None, name='fpn_up_g2_2')
)
self.g3_conv = nn.Sequential(
ConvBNLayer(out_channels[3], out_channels[3], 3, 1, act='relu', name='fpn_up_g3_1'),
DeConvBNLayer(out_channels[3], out_channels[4], 4, 2, act=None, name='fpn_up_g3_2')
)
self.g4_conv = nn.Sequential(
ConvBNLayer(out_channels[4], out_channels[4], 3, 1, act='relu', name='fpn_up_fusion_1'),
ConvBNLayer(out_channels[4], out_channels[4], 1, 1, act=None, name='fpn_up_fusion_2')
)
def _add_relu(self, x1, x2):
# x = paddle.add(x=x1, y=x2)
x = torch.add(x1, x2)
x = F.relu(x)
return x
def forward(self, x):
f = x[2:][::-1]
h0 = self.h0_conv(f[0])
h1 = self.h1_conv(f[1])
h2 = self.h2_conv(f[2])
h3 = self.h3_conv(f[3])
h4 = self.h4_conv(f[4])
g0 = self.g0_conv(h0)
g1 = self._add_relu(g0, h1)
g1 = self.g1_conv(g1)
g2 = self.g2_conv(self._add_relu(g1, h2))
g3 = self.g3_conv(self._add_relu(g2, h3))
g4 = self.g4_conv(self._add_relu(g3, h4))
return g4
class FPN_Down_Fusion(nn.Module):
def __init__(self, in_channels):
super(FPN_Down_Fusion, self).__init__()
out_channels = [32, 64, 128]
self.h0_conv = ConvBNLayer(in_channels[0], out_channels[0], 3, 1, act=None, name='fpn_down_h0')
self.h1_conv = ConvBNLayer(in_channels[1], out_channels[1], 3, 1, act=None, name='fpn_down_h1')
self.h2_conv = ConvBNLayer(in_channels[2], out_channels[2], 3, 1, act=None, name='fpn_down_h2')
self.g0_conv = ConvBNLayer(out_channels[0], out_channels[1], 3, 2, act=None, name='fpn_down_g0')
self.g1_conv = nn.Sequential(
ConvBNLayer(out_channels[1], out_channels[1], 3, 1, act='relu', name='fpn_down_g1_1'),
ConvBNLayer(out_channels[1], out_channels[2], 3, 2, act=None, name='fpn_down_g1_2')
)
self.g2_conv = nn.Sequential(
ConvBNLayer(out_channels[2], out_channels[2], 3, 1, act='relu', name='fpn_down_fusion_1'),
ConvBNLayer(out_channels[2], out_channels[2], 1, 1, act=None, name='fpn_down_fusion_2')
)
def forward(self, x):
f = x[:3]
h0 = self.h0_conv(f[0])
h1 = self.h1_conv(f[1])
h2 = self.h2_conv(f[2])
g0 = self.g0_conv(h0)
# g1 = paddle.add(x=g0, y=h1)
g1 = torch.add(g0, h1)
g1 = F.relu(g1)
g1 = self.g1_conv(g1)
# g2 = paddle.add(x=g1, y=h2)
g2 = torch.add(g1, h2)
g2 = F.relu(g2)
g2 = self.g2_conv(g2)
return g2
class Cross_Attention(nn.Module):
def __init__(self, in_channels):
super(Cross_Attention, self).__init__()
self.theta_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_theta')
self.phi_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_phi')
self.g_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_g')
self.fh_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_weight')
self.fh_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_sc')
self.fv_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_weight')
self.fv_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_sc')
self.f_attn_conv = ConvBNLayer(in_channels * 2, in_channels, 1, 1, act='relu', name='f_attn')
def _cal_fweight(self, f, shape):
f_theta, f_phi, f_g = f
# flatten
# f_theta = paddle.transpose(f_theta, [0, 2, 3, 1])
f_theta = f_theta.permute(0, 2, 3, 1)
# f_theta = paddle.reshape(f_theta, [shape[0] * shape[1], shape[2], 128])
f_theta = torch.reshape(f_theta, [shape[0] * shape[1], shape[2], 128])
# f_phi = paddle.transpose(f_phi, [0, 2, 3, 1])
f_phi = f_phi.permute(0, 2, 3, 1)
# f_phi = paddle.reshape(f_phi, [shape[0] * shape[1], shape[2], 128])
f_phi = torch.reshape(f_phi, [shape[0] * shape[1], shape[2], 128])
# f_g = paddle.transpose(f_g, [0, 2, 3, 1])
f_g = f_g.permute(0, 2, 3, 1)
# f_g = paddle.reshape(f_g, [shape[0] * shape[1], shape[2], 128])
f_g = torch.reshape(f_g, [shape[0] * shape[1], shape[2], 128])
# correlation
# f_attn = paddle.matmul(f_theta, paddle.transpose(f_phi, [0, 2, 1]))
f_attn = torch.matmul(f_theta, f_phi.permute(0, 2, 1))
# scale
f_attn = f_attn / (128 ** 0.5)
f_attn = F.softmax(f_attn, dim=-1)
# weighted sum
# f_weight = paddle.matmul(f_attn, f_g)
f_weight = torch.matmul(f_attn, f_g)
# f_weight = paddle.reshape(
# f_weight, [shape[0], shape[1], shape[2], 128])
f_weight = torch.reshape(
f_weight, [shape[0], shape[1], shape[2], 128])
return f_weight
def forward(self, f_common):
# f_shape = paddle.shape(f_common)
f_shape = f_common.size()
# print('f_shape: ', f_shape)
f_theta = self.theta_conv(f_common)
f_phi = self.phi_conv(f_common)
f_g = self.g_conv(f_common)
######## horizon ########
fh_weight = self._cal_fweight([f_theta, f_phi, f_g],
[f_shape[0], f_shape[2], f_shape[3]])
# fh_weight = paddle.transpose(fh_weight, [0, 3, 1, 2])
fh_weight = fh_weight.permute(0, 3, 1, 2)
fh_weight = self.fh_weight_conv(fh_weight)
# short cut
fh_sc = self.fh_sc_conv(f_common)
f_h = F.relu(fh_weight + fh_sc)
######## vertical ########
# fv_theta = paddle.transpose(f_theta, [0, 1, 3, 2])
fv_theta = f_theta.permute(0, 1, 3, 2)
# fv_phi = paddle.transpose(f_phi, [0, 1, 3, 2])
fv_phi = f_phi.permute(0, 1, 3, 2)
# fv_g = paddle.transpose(f_g, [0, 1, 3, 2])
fv_g = f_g.permute(0, 1, 3, 2)
fv_weight = self._cal_fweight([fv_theta, fv_phi, fv_g],
[f_shape[0], f_shape[3], f_shape[2]])
# fv_weight = paddle.transpose(fv_weight, [0, 3, 2, 1])
fv_weight = fv_weight.permute(0, 3, 2, 1)
fv_weight = self.fv_weight_conv(fv_weight)
# short cut
fv_sc = self.fv_sc_conv(f_common)
f_v = F.relu(fv_weight + fv_sc)
######## merge ########
# f_attn = paddle.concat([f_h, f_v], axis=1)
f_attn = torch.cat([f_h, f_v], dim=1)
f_attn = self.f_attn_conv(f_attn)
return f_attn
class SASTFPN(nn.Module):
def __init__(self, in_channels, with_cab=False, **kwargs):
super(SASTFPN, self).__init__()
self.in_channels = in_channels
self.with_cab = with_cab
self.FPN_Down_Fusion = FPN_Down_Fusion(self.in_channels)
self.FPN_Up_Fusion = FPN_Up_Fusion(self.in_channels)
self.out_channels = 128
self.cross_attention = Cross_Attention(self.out_channels)
def forward(self, x):
# down fpn
f_down = self.FPN_Down_Fusion(x)
# up fpn
f_up = self.FPN_Up_Fusion(x)
# fusion
# f_common = paddle.add(x=f_down, y=f_up)
f_common = torch.add(f_down, f_up)
f_common = F.relu(f_common)
if self.with_cab:
# print('enhence f_common with CAB.')
f_common = self.cross_attention(f_common)
return f_common
\ No newline at end of file
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorchocr.modeling.common import Activation
class TableFPN(nn.Module):
def __init__(self, in_channels, out_channels, **kwargs):
super(TableFPN, self).__init__()
self.out_channels = 512
self.in2_conv = nn.Conv2d(
in_channels=in_channels[0],
out_channels=self.out_channels,
kernel_size=1,
bias=False)
self.in3_conv = nn.Conv2d(
in_channels=in_channels[1],
out_channels=self.out_channels,
kernel_size=1,
stride = 1,
bias=False)
self.in4_conv = nn.Conv2d(
in_channels=in_channels[2],
out_channels=self.out_channels,
kernel_size=1,
bias=False)
self.in5_conv = nn.Conv2d(
in_channels=in_channels[3],
out_channels=self.out_channels,
kernel_size=1,
bias=False)
self.p5_conv = nn.Conv2d(
in_channels=self.out_channels,
out_channels=self.out_channels // 4,
kernel_size=3,
padding=1,
bias=False)
self.p4_conv = nn.Conv2d(
in_channels=self.out_channels,
out_channels=self.out_channels // 4,
kernel_size=3,
padding=1,
bias=False)
self.p3_conv = nn.Conv2d(
in_channels=self.out_channels,
out_channels=self.out_channels // 4,
kernel_size=3,
padding=1,
bias=False)
self.p2_conv = nn.Conv2d(
in_channels=self.out_channels,
out_channels=self.out_channels // 4,
kernel_size=3,
padding=1,
bias=False)
self.fuse_conv = nn.Conv2d(
in_channels=self.out_channels * 4,
out_channels=512,
kernel_size=3,
padding=1,
bias=False)
def forward(self, x):
c2, c3, c4, c5 = x
in5 = self.in5_conv(c5)
in4 = self.in4_conv(c4)
in3 = self.in3_conv(c3)
in2 = self.in2_conv(c2)
out4 = in4 + F.interpolate(
in5, size=in4.shape[2:4], mode="nearest", )#align_mode=1) # 1/16
out3 = in3 + F.interpolate(
out4, size=in3.shape[2:4], mode="nearest", )#align_mode=1) # 1/8
out2 = in2 + F.interpolate(
out3, size=in2.shape[2:4], mode="nearest", )#align_mode=1) # 1/4
p4 = F.interpolate(out4, size=in5.shape[2:4], mode="nearest", )#align_mode=1)
p3 = F.interpolate(out3, size=in5.shape[2:4], mode="nearest", )#align_mode=1)
p2 = F.interpolate(out2, size=in5.shape[2:4], mode="nearest", )#align_mode=1)
fuse = torch.cat([in5, p4, p3, p2], dim=1)
fuse_conv = self.fuse_conv(fuse) * 0.005
return [c5 + fuse_conv]
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['build_transform']
def build_transform(config):
from .tps import TPS
from .stn import STN_ON
from .tsrn import TSRN
from .tbsrn import TBSRN
support_dict = ['TPS', 'STN_ON', 'TSRN', 'TBSRN']
module_name = config.pop('name')
assert module_name in support_dict, Exception(
'transform only support {}'.format(support_dict))
module_class = eval(module_name)(**config)
return module_class
\ No newline at end of file
"""
This code is refer from:
https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/stn_head.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
from .tps_spatial_transformer import TPSSpatialTransformer
def conv3x3_block(in_channels, out_channels, stride=1):
n = 3 * 3 * out_channels
w = math.sqrt(2. / n)
conv_layer = nn.Conv2d(
in_channels,
out_channels,
kernel_size=3,
stride=stride,
padding=1,
bias=True)
block = nn.Sequential(conv_layer, nn.BatchNorm2d(out_channels), nn.ReLU())
return block
class STN(nn.Module):
def __init__(self, in_channels, num_ctrlpoints, activation='none'):
super(STN, self).__init__()
self.in_channels = in_channels
self.num_ctrlpoints = num_ctrlpoints
self.activation = activation
self.stn_convnet = nn.Sequential(
conv3x3_block(in_channels, 32), #32x64
nn.MaxPool2d(
kernel_size=2, stride=2),
conv3x3_block(32, 64), #16x32
nn.MaxPool2d(
kernel_size=2, stride=2),
conv3x3_block(64, 128), # 8*16
nn.MaxPool2d(
kernel_size=2, stride=2),
conv3x3_block(128, 256), # 4*8
nn.MaxPool2d(
kernel_size=2, stride=2),
conv3x3_block(256, 256), # 2*4,
nn.MaxPool2d(
kernel_size=2, stride=2),
conv3x3_block(256, 256)) # 1*2
self.stn_fc1 = nn.Sequential(
nn.Linear(
2 * 256,
512,
bias=True),
nn.BatchNorm1d(512),
nn.ReLU(inplace=True))
fc2_bias = self.init_stn()
self.stn_fc2 = nn.Linear(
512,
num_ctrlpoints * 2,
bias=True)
def init_stn(self):
margin = 0.01
sampling_num_per_side = int(self.num_ctrlpoints / 2)
ctrl_pts_x = np.linspace(margin, 1. - margin, sampling_num_per_side)
ctrl_pts_y_top = np.ones(sampling_num_per_side) * margin
ctrl_pts_y_bottom = np.ones(sampling_num_per_side) * (1 - margin)
ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
ctrl_points = np.concatenate(
[ctrl_pts_top, ctrl_pts_bottom], axis=0).astype(np.float32)
if self.activation == 'none':
pass
elif self.activation == 'sigmoid':
ctrl_points = -np.log(1. / ctrl_points - 1.)
ctrl_points = torch.Tensor(ctrl_points)
# fc2_bias = ctrl_points.view(-1)
fc2_bias = torch.reshape(
ctrl_points, shape=[ctrl_points.shape[0] * ctrl_points.shape[1]])
return fc2_bias
def forward(self, x):
x = self.stn_convnet(x)
batch_size, _, h, w = x.shape
# x = x.view(batch_size, -1)
x = torch.reshape(x, shape=(batch_size, -1))
img_feat = self.stn_fc1(x)
x = self.stn_fc2(0.1 * img_feat)
if self.activation == 'sigmoid':
x = F.sigmoid(x)
# x = x.view(-1, self.num_ctrlpoints, 2)
x = torch.reshape(x, shape=[-1, self.num_ctrlpoints, 2])
return img_feat, x
class STN_ON(nn.Module):
def __init__(self, in_channels, tps_inputsize, tps_outputsize,
num_control_points, tps_margins, stn_activation):
super(STN_ON, self).__init__()
self.tps = TPSSpatialTransformer(
output_image_size=tuple(tps_outputsize),
num_control_points=num_control_points,
margins=tuple(tps_margins))
self.stn_head = STN(in_channels=in_channels,
num_ctrlpoints=num_control_points,
activation=stn_activation)
self.tps_inputsize = tps_inputsize
self.out_channels = in_channels
def forward(self, image):
stn_input = torch.nn.functional.interpolate(
image, self.tps_inputsize, mode="bilinear", align_corners=True)
stn_img_feat, ctrl_points = self.stn_head(stn_input)
x, _ = self.tps(image, ctrl_points)
return x
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/FudanVI/FudanOCR/blob/main/scene-text-telescope/model/tbsrn.py
"""
import math
import warnings
import numpy as np
import torch
from torch import nn
import string
warnings.filterwarnings("ignore")
from .tps_spatial_transformer import TPSSpatialTransformer
from .stn import STN as STNHead
from .tsrn import GruBlock, mish, UpsampleBLock
from pytorchocr.modeling.heads.sr_rensnet_transformer import Transformer, LayerNorm, \
PositionwiseFeedForward, MultiHeadedAttention
def positionalencoding2d(d_model, height, width):
"""
:param d_model: dimension of the model
:param height: height of the positions
:param width: width of the positions
:return: d_model*height*width position matrix
"""
if d_model % 4 != 0:
raise ValueError("Cannot use sin/cos positional encoding with "
"odd dimension (got dim={:d})".format(d_model))
pe = torch.zeros([d_model, height, width])
# Each dimension use half of d_model
d_model = int(d_model / 2)
div_term = torch.exp(torch.arange(0., d_model, 2) *
-(math.log(10000.0) / d_model))
pos_w = torch.arange(0., width, dtype=torch.float32).unsqueeze(1)
pos_h = torch.arange(0., height, dtype=torch.float32).unsqueeze(1)
pe[0:d_model:2, :, :] = torch.sin(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
pe[1:d_model:2, :, :] = torch.cos(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
pe[d_model::2, :, :] = torch.sin(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)
pe[d_model + 1::2, :, :] = torch.cos(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)
return pe
class FeatureEnhancer(nn.Module):
def __init__(self):
super(FeatureEnhancer, self).__init__()
self.multihead = MultiHeadedAttention(h=4, d_model=128, dropout=0.1)
self.mul_layernorm1 = LayerNorm(features=128)
self.pff = PositionwiseFeedForward(128, 128)
self.mul_layernorm3 = LayerNorm(features=128)
self.linear = nn.Linear(128, 64)
def forward(self, conv_feature):
'''
text : (batch, seq_len, embedding_size)
global_info: (batch, embedding_size, 1, 1)
conv_feature: (batch, channel, H, W)
'''
batch = conv_feature.shape[0]
if torch.cuda.is_available():
position2d = positionalencoding2d(64, 16, 64).float().cuda().unsqueeze(0).reshape([1, 64, 1024])
else:
position2d = positionalencoding2d(64, 16, 64).float().unsqueeze(0).reshape([1, 64, 1024])
position2d = position2d.repeat(batch, 1, 1)
conv_feature = torch.cat([conv_feature, position2d], 1) # batch, 128(64+64), 32, 128
result = conv_feature.permute(0, 2, 1).contiguous()
origin_result = result
result = self.mul_layernorm1(origin_result + self.multihead(result, result, result, mask=None)[0])
origin_result = result
result = self.mul_layernorm3(origin_result + self.pff(result))
result = self.linear(result)
return result.permute(0, 2, 1).contiguous()
def str_filt(str_, voc_type):
alpha_dict = {
'digit': string.digits,
'lower': string.digits + string.ascii_lowercase,
'upper': string.digits + string.ascii_letters,
'all': string.digits + string.ascii_letters + string.punctuation
}
if voc_type == 'lower':
str_ = str_.lower()
for char in str_:
if char not in alpha_dict[voc_type]:
str_ = str_.replace(char, '')
str_ = str_.lower()
return str_
class TBSRN(nn.Module):
def __init__(self,
in_channels=3,
scale_factor=2,
width=128,
height=32,
STN=True,
srb_nums=5,
mask=False,
hidden_units=32,
infer_mode=False):
super(TBSRN, self).__init__()
in_planes = 3
if mask:
in_planes = 4
assert math.log(scale_factor, 2) % 1 == 0
upsample_block_num = int(math.log(scale_factor, 2))
self.block1 = nn.Sequential(
nn.Conv2d(in_planes, 2 * hidden_units, kernel_size=9, padding=4),
nn.PReLU()
# nn.ReLU()
)
self.srb_nums = srb_nums
for i in range(srb_nums):
setattr(self, 'block%d' % (i + 2), RecurrentResidualBlock(2 * hidden_units))
setattr(self, 'block%d' % (srb_nums + 2),
nn.Sequential(
nn.Conv2d(2 * hidden_units, 2 * hidden_units, kernel_size=3, padding=1),
nn.BatchNorm2d(2 * hidden_units)
))
# self.non_local = NonLocalBlock2D(64, 64)
block_ = [UpsampleBLock(2 * hidden_units, 2) for _ in range(upsample_block_num)]
block_.append(nn.Conv2d(2 * hidden_units, in_planes, kernel_size=9, padding=4))
setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(*block_))
self.tps_inputsize = [height // scale_factor, width // scale_factor]
tps_outputsize = [height // scale_factor, width // scale_factor]
num_control_points = 20
tps_margins = [0.05, 0.05]
self.stn = STN
self.out_channels = in_channels
if self.stn:
self.tps = TPSSpatialTransformer(
output_image_size=tuple(tps_outputsize),
num_control_points=num_control_points,
margins=tuple(tps_margins))
self.stn_head = STNHead(
in_channels=in_planes,
num_ctrlpoints=num_control_points,
activation='none')
self.infer_mode = infer_mode
self.english_alphabet = '-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
self.english_dict = {}
for index in range(len(self.english_alphabet)):
self.english_dict[self.english_alphabet[index]] = index
transformer = Transformer(alphabet='-0123456789abcdefghijklmnopqrstuvwxyz')
self.transformer = transformer
for param in self.transformer.parameters():
param.trainable = False
def label_encoder(self, label):
batch = len(label)
length = [len(i) for i in label]
length_tensor = torch.Tensor(length).type(torch.int64)
max_length = max(length)
input_tensor = np.zeros((batch, max_length))
for i in range(batch):
for j in range(length[i] - 1):
input_tensor[i][j + 1] = self.english_dict[label[i][j]]
text_gt = []
for i in label:
for j in i:
text_gt.append(self.english_dict[j])
text_gt = torch.Tensor(text_gt).type(torch.int64)
input_tensor = torch.Tensor(input_tensor).type(torch.int64)
return length_tensor, input_tensor, text_gt
def forward(self, x):
output = {}
if self.infer_mode:
output["lr_img"] = x
y = x
else:
output["lr_img"] = x[0]
output["hr_img"] = x[1]
y = x[0]
if self.stn and self.training:
_, ctrl_points_x = self.stn_head(y)
y, _ = self.tps(y, ctrl_points_x)
block = {'1': self.block1(y)}
for i in range(self.srb_nums + 1):
block[str(i + 2)] = getattr(self,
'block%d' % (i + 2))(block[str(i + 1)])
block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3)) \
((block['1'] + block[str(self.srb_nums + 2)]))
sr_img = torch.tanh(block[str(self.srb_nums + 3)])
output["sr_img"] = sr_img
if self.training:
hr_img = x[1]
# add transformer
label = [str_filt(i, 'lower') + '-' for i in x[2]]
length_tensor, input_tensor, text_gt = self.label_encoder(label)
hr_pred, word_attention_map_gt, hr_correct_list = self.transformer(hr_img, length_tensor,
input_tensor)
sr_pred, word_attention_map_pred, sr_correct_list = self.transformer(sr_img, length_tensor,
input_tensor)
output["hr_img"] = hr_img
output["hr_pred"] = hr_pred
output["text_gt"] = text_gt
output["word_attention_map_gt"] = word_attention_map_gt
output["sr_pred"] = sr_pred
output["word_attention_map_pred"] = word_attention_map_pred
return output
class RecurrentResidualBlock(nn.Module):
def __init__(self, channels):
super(RecurrentResidualBlock, self).__init__()
self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
self.bn1 = nn.BatchNorm2d(channels)
self.gru1 = GruBlock(channels, channels)
# self.prelu = nn.ReLU()
self.prelu = mish()
self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
self.bn2 = nn.BatchNorm2d(channels)
self.gru2 = GruBlock(channels, channels)
self.feature_enhancer = FeatureEnhancer()
for p in self.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
def forward(self, x):
residual = self.conv1(x)
residual = self.bn1(residual)
residual = self.prelu(residual)
residual = self.conv2(residual)
residual = self.bn2(residual)
size = residual.shape
residual = residual.reshape([size[0], size[1], -1])
residual = self.feature_enhancer(residual)
residual = residual.reshape([size[0], size[1], size[2], size[3]])
return x + residual
\ No newline at end of file
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os, sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorchocr.modeling.common import Activation
# import paddle
# from paddle import nn, ParamAttr
# from paddle.nn import functional as F
import numpy as np
class ConvBNLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
groups=1,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self.conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=(kernel_size - 1) // 2,
groups=groups,
bias=False,
)
bn_name = "bn_" + name
self.bn = nn.BatchNorm2d(
out_channels, )
self.act = act
if act is not None:
self._act = Activation(act)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
if self.act is not None:
x = self._act(x)
return x
class LocalizationNetwork(nn.Module):
def __init__(self, in_channels, num_fiducial, loc_lr, model_name):
super(LocalizationNetwork, self).__init__()
self.F = num_fiducial
F = num_fiducial
if model_name == "large":
num_filters_list = [64, 128, 256, 512]
fc_dim = 256
else:
num_filters_list = [16, 32, 64, 128]
fc_dim = 64
# self.block_list = []
self.block_list = nn.Sequential()
for fno in range(0, len(num_filters_list)):
num_filters = num_filters_list[fno]
name = "loc_conv%d" % fno
# conv = self.add_sublayer(
# name,
# ConvBNLayer(
# in_channels=in_channels,
# out_channels=num_filters,
# kernel_size=3,
# act='relu',
# name=name))
conv = ConvBNLayer(
in_channels=in_channels,
out_channels=num_filters,
kernel_size=3,
act='relu',
name=name)
# self.block_list.append(conv)
self.block_list.add_module(name, conv)
if fno == len(num_filters_list) - 1:
pool = nn.AdaptiveAvgPool2d(1)
else:
# pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
in_channels = num_filters
# self.block_list.append(pool)
self.block_list.add_module('{}_pool'.format(name), pool)
name = "loc_fc1"
stdv = 1.0 / math.sqrt(num_filters_list[-1] * 1.0)
self.fc1 = nn.Linear(
in_channels,
fc_dim,
bias=True,
)
# Init fc2 in LocalizationNetwork
initial_bias = self.get_initial_fiducials()
initial_bias = initial_bias.reshape(-1)
name = "loc_fc2"
self.fc2 = nn.Linear(
fc_dim,
F * 2,
bias=True
)
self.out_channels = F * 2
def forward(self, x):
"""
Estimating parameters of geometric transformation
Args:
image: input
Return:
batch_C_prime: the matrix of the geometric transformation
"""
B = x.shape[0]
i = 0
for block in self.block_list:
x = block(x)
x = x.squeeze(dim=2).squeeze(dim=2)
x = self.fc1(x)
x = F.relu(x)
x = self.fc2(x)
x = x.reshape(shape=[-1, self.F, 2])
return x
def get_initial_fiducials(self):
""" see RARE paper Fig. 6 (a) """
F = self.F
ctrl_pts_x = np.linspace(-1.0, 1.0, int(F / 2))
ctrl_pts_y_top = np.linspace(0.0, -1.0, num=int(F / 2))
ctrl_pts_y_bottom = np.linspace(1.0, 0.0, num=int(F / 2))
ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
return initial_bias
class GridGenerator(nn.Module):
def __init__(self, in_channels, num_fiducial):
super(GridGenerator, self).__init__()
self.eps = 1e-6
self.F = num_fiducial
name = "ex_fc"
self.fc = nn.Linear(
in_channels,
6,
bias=True
)
def forward(self, batch_C_prime, I_r_size):
"""
Generate the grid for the grid_sampler.
Args:
batch_C_prime: the matrix of the geometric transformation
I_r_size: the shape of the input image
Return:
batch_P_prime: the grid for the grid_sampler
"""
C = self.build_C_paddle()
P = self.build_P_paddle(I_r_size)
inv_delta_C_tensor = self.build_inv_delta_C_paddle(C).type(torch.float32)
P_hat_tensor = self.build_P_hat_paddle(
C, torch.as_tensor(P)).type(torch.float32)
inv_delta_C_tensor.stop_gradient = True
P_hat_tensor.stop_gradient = True
batch_C_ex_part_tensor = self.get_expand_tensor(batch_C_prime)
batch_C_ex_part_tensor.stop_gradient = True
batch_C_prime_with_zeros = torch.cat(
[batch_C_prime, batch_C_ex_part_tensor], dim=1)
inv_delta_C_tensor = inv_delta_C_tensor.to(batch_C_prime_with_zeros.device)
batch_T = torch.matmul(inv_delta_C_tensor, batch_C_prime_with_zeros)
P_hat_tensor = P_hat_tensor.to(batch_T.device)
batch_P_prime = torch.matmul(P_hat_tensor, batch_T)
return batch_P_prime
def build_C_paddle(self):
""" Return coordinates of fiducial points in I_r; C """
F = self.F
ctrl_pts_x = torch.linspace(-1.0, 1.0, int(F / 2), dtype=torch.float64)
ctrl_pts_y_top = -1 * torch.ones([int(F / 2)], dtype=torch.float64)
ctrl_pts_y_bottom = torch.ones([int(F / 2)], dtype=torch.float64)
ctrl_pts_top = torch.stack([ctrl_pts_x, ctrl_pts_y_top], dim=1)
ctrl_pts_bottom = torch.stack([ctrl_pts_x, ctrl_pts_y_bottom], dim=1)
C = torch.cat([ctrl_pts_top, ctrl_pts_bottom], dim=0)
return C # F x 2
def build_P_paddle(self, I_r_size):
I_r_height, I_r_width = I_r_size
I_r_grid_x = (torch.arange(
-I_r_width, I_r_width, 2, dtype=torch.float64) + 1.0
) / torch.as_tensor(np.array([I_r_width]).astype(np.float64))
I_r_grid_y = (torch.arange(
-I_r_height, I_r_height, 2, dtype=torch.float64) + 1.0
) / torch.as_tensor(np.array([I_r_height]).astype(np.float64))
# P: self.I_r_width x self.I_r_height x 2
P = torch.stack(torch.meshgrid([I_r_grid_x, I_r_grid_y]), dim=2)
# P = paddle.transpose(P, perm=[1, 0, 2])
P = P.permute(1, 0, 2)
# n (= self.I_r_width x self.I_r_height) x 2
return P.reshape([-1, 2])
def build_inv_delta_C_paddle(self, C):
""" Return inv_delta_C which is needed to calculate T """
F = self.F
hat_C = torch.zeros((F, F), dtype=torch.float64) # F x F
for i in range(0, F):
for j in range(i, F):
if i == j:
hat_C[i, j] = 1
else:
r = torch.norm(C[i] - C[j])
hat_C[i, j] = r
hat_C[j, i] = r
hat_C = (hat_C**2) * torch.log(hat_C)
delta_C = torch.cat( # F+3 x F+3
[
torch.cat(
[torch.ones(
(F, 1), dtype=torch.float64), C, hat_C], dim=1), # F x F+3
torch.cat(
[
torch.zeros(
(2, 3), dtype=torch.float64), C.permute(1,0)
],
dim=1), # 2 x F+3
torch.cat(
[
torch.zeros(
(1, 3), dtype=torch.float64), torch.ones(
(1, F), dtype=torch.float64)
],
dim=1) # 1 x F+3
],
dim=0)
inv_delta_C = torch.inverse(delta_C)
return inv_delta_C # F+3 x F+3
def build_P_hat_paddle(self, C, P):
F = self.F
eps = self.eps
n = P.shape[0] # n (= self.I_r_width x self.I_r_height)
# P_tile: n x 2 -> n x 1 x 2 -> n x F x 2
# P_tile = paddle.tile(paddle.unsqueeze(P, axis=1), (1, F, 1))
P_tile = torch.unsqueeze(P, dim=1).repeat(1, F, 1)
C_tile = torch.unsqueeze(C, dim=0) # 1 x F x 2
P_diff = P_tile - C_tile # n x F x 2
# rbf_norm: n x F
rbf_norm = torch.norm(P_diff, p=2, dim=2, keepdim=False)
# rbf: n x F
# rbf = torch.mul(
# torch.square(rbf_norm), torch.log(rbf_norm + eps))
rbf = torch.mul(
rbf_norm**2, torch.log(rbf_norm + eps))
P_hat = torch.cat(
[torch.ones(
(n, 1), dtype=torch.float64), P, rbf], dim=1)
return P_hat # n x F+3
def get_expand_tensor(self, batch_C_prime):
B, H, C = batch_C_prime.shape
batch_C_prime = batch_C_prime.reshape([B, H * C])
batch_C_ex_part_tensor = self.fc(batch_C_prime)
batch_C_ex_part_tensor = batch_C_ex_part_tensor.reshape([-1, 3, 2])
return batch_C_ex_part_tensor
class TPS(nn.Module):
def __init__(self, in_channels, num_fiducial, loc_lr, model_name):
super(TPS, self).__init__()
self.loc_net = LocalizationNetwork(in_channels, num_fiducial, loc_lr,
model_name)
self.grid_generator = GridGenerator(self.loc_net.out_channels,
num_fiducial)
self.out_channels = in_channels
def forward(self, image):
image.stop_gradient = False
batch_C_prime = self.loc_net(image)
batch_P_prime = self.grid_generator(batch_C_prime, image.shape[2:])
batch_P_prime = batch_P_prime.reshape(
[-1, image.shape[2], image.shape[3], 2])
if torch.__version__ < '1.3.0':
batch_I_r = F.grid_sample(image, grid=batch_P_prime)
else:
batch_I_r = F.grid_sample(image, grid=batch_P_prime, align_corners=True)
return batch_I_r
"""
This code is refer from:
https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/tps_spatial_transformer.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
import itertools
def grid_sample(input, grid, canvas=None):
input.stop_gradient = False
output = F.grid_sample(input, grid, align_corners=True) if torch.__version__ >= '1.3.0' else F.grid_sample(input, grid)
if canvas is None:
return output
else:
# input_mask = paddle.ones(shape=input.shape)
input_mask = input.data.new(input.size()).fill_(1)
output_mask = F.grid_sample(input_mask, grid)
padded_output = output * output_mask + canvas * (1 - output_mask)
return padded_output
# phi(x1, x2) = r^2 * log(r), where r = ||x1 - x2||_2
def compute_partial_repr(input_points, control_points):
N = input_points.shape[0]
M = control_points.shape[0]
# pairwise_diff = input_points.view(N, 1, 2) - control_points.view(1, M, 2)
pairwise_diff = torch.reshape(
input_points, shape=[N, 1, 2]) - torch.reshape(
control_points, shape=[1, M, 2])
# original implementation, very slow
# pairwise_dist = torch.sum(pairwise_diff ** 2, dim = 2) # square of distance
pairwise_diff_square = pairwise_diff * pairwise_diff
pairwise_dist = pairwise_diff_square[:, :, 0] + pairwise_diff_square[:, :, 1]
repr_matrix = 0.5 * pairwise_dist * torch.log(pairwise_dist)
# fix numerical error for 0 * log(0), substitute all nan with 0
# mask = np.array(repr_matrix != repr_matrix)
# repr_matrix[mask] = 0
mask = repr_matrix != repr_matrix
repr_matrix.masked_fill_(mask, 0)
return repr_matrix
# output_ctrl_pts are specified, according to our task.
def build_output_control_points(num_control_points, margins):
margin_x, margin_y = margins
num_ctrl_pts_per_side = num_control_points // 2
ctrl_pts_x = np.linspace(margin_x, 1.0 - margin_x, num_ctrl_pts_per_side)
ctrl_pts_y_top = np.ones(num_ctrl_pts_per_side) * margin_y
ctrl_pts_y_bottom = np.ones(num_ctrl_pts_per_side) * (1.0 - margin_y)
ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
output_ctrl_pts_arr = np.concatenate(
[ctrl_pts_top, ctrl_pts_bottom], axis=0)
output_ctrl_pts = torch.Tensor(output_ctrl_pts_arr)
return output_ctrl_pts
class TPSSpatialTransformer(nn.Module):
def __init__(self,
output_image_size=None,
num_control_points=None,
margins=None):
super(TPSSpatialTransformer, self).__init__()
self.output_image_size = output_image_size
self.num_control_points = num_control_points
self.margins = margins
self.target_height, self.target_width = output_image_size
target_control_points = build_output_control_points(num_control_points,
margins)
N = num_control_points
# create padded kernel matrix
forward_kernel = torch.zeros(N + 3, N + 3)
target_control_partial_repr = compute_partial_repr(target_control_points, target_control_points)
forward_kernel[:N, :N].copy_(target_control_partial_repr)
forward_kernel[:N, -3].fill_(1)
forward_kernel[-3, :N].fill_(1)
forward_kernel[:N, -2:].copy_(target_control_points)
forward_kernel[-2:, :N].copy_(target_control_points.transpose(0, 1))
# compute inverse matrix
inverse_kernel = torch.inverse(forward_kernel)
# create target cordinate matrix
HW = self.target_height * self.target_width
target_coordinate = list(
itertools.product(
range(self.target_height), range(self.target_width)))
target_coordinate = torch.Tensor(target_coordinate) # HW x 2
Y, X = target_coordinate.split(1, dim = 1)
Y = Y / (self.target_height - 1)
X = X / (self.target_width - 1)
target_coordinate = torch.cat([X, Y], dim = 1) # convert from (y, x) to (x, y)
target_coordinate_partial_repr = compute_partial_repr(
target_coordinate, target_control_points)
target_coordinate_repr = torch.cat(
[
target_coordinate_partial_repr,
torch.ones(HW, 1),
target_coordinate
],
dim=1)
# register precomputed matrices
self.inverse_kernel = inverse_kernel
self.padding_matrix = torch.zeros(3, 2)
self.target_coordinate_repr = target_coordinate_repr
self.target_control_points = target_control_points
def forward(self, input, source_control_points):
assert source_control_points.ndimension() == 3
assert source_control_points.shape[1] == self.num_control_points
assert source_control_points.shape[2] == 2
batch_size = source_control_points.size(0)
Y = torch.cat([source_control_points, self.padding_matrix.expand(batch_size, 3, 2)], 1)
mapping_matrix = torch.matmul(self.inverse_kernel, Y)
source_coordinate = torch.matmul(self.target_coordinate_repr, mapping_matrix)
# grid = source_coordinate.view(-1, self.target_height, self.target_width, 2)
grid = torch.reshape(
source_coordinate,
shape=[-1, self.target_height, self.target_width, 2])
grid = torch.clamp(grid, 0, 1) # the source_control_points may be out of [0, 1].
# the input to grid_sample is normalized [-1, 1], but what we get is [0, 1]
grid = 2.0 * grid - 1.0
output_maps = grid_sample(input, grid, canvas=None)
return output_maps, source_coordinate
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/model/tsrn.py
"""
import math
import torch
import torch.nn.functional as F
from torch import nn
from collections import OrderedDict
import sys
import numpy as np
import warnings
import math, copy
import cv2
warnings.filterwarnings("ignore")
from .tps_spatial_transformer import TPSSpatialTransformer
from .stn import STN as STN_model
from pytorchocr.modeling.heads.sr_rensnet_transformer import Transformer
class TSRN(nn.Module):
def __init__(self,
in_channels,
scale_factor=2,
width=128,
height=32,
STN=False,
srb_nums=5,
mask=False,
hidden_units=32,
infer_mode=False,
**kwargs):
super(TSRN, self).__init__()
in_planes = 3
if mask:
in_planes = 4
assert math.log(scale_factor, 2) % 1 == 0
upsample_block_num = int(math.log(scale_factor, 2))
self.block1 = nn.Sequential(
nn.Conv2d(
in_planes, 2 * hidden_units, kernel_size=9, padding=4),
nn.PReLU())
self.srb_nums = srb_nums
for i in range(srb_nums):
setattr(self, 'block%d' % (i + 2),
RecurrentResidualBlock(2 * hidden_units))
setattr(
self,
'block%d' % (srb_nums + 2),
nn.Sequential(
nn.Conv2d(
2 * hidden_units,
2 * hidden_units,
kernel_size=3,
padding=1),
nn.BatchNorm2d(2 * hidden_units)))
block_ = [
UpsampleBLock(2 * hidden_units, 2)
for _ in range(upsample_block_num)
]
block_.append(
nn.Conv2d(2 * hidden_units, in_planes, kernel_size=9, padding=4)
)
setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(*block_))
self.tps_inputsize = [height // scale_factor, width // scale_factor]
tps_outputsize = [height // scale_factor, width // scale_factor]
num_control_points = 20
tps_margins = [0.05, 0.05]
self.stn = STN
if self.stn:
self.tps = TPSSpatialTransformer(
output_image_size=tuple(tps_outputsize),
num_control_points=num_control_points,
margins=tuple(tps_margins))
self.stn_head = STN_model(
in_channels=in_planes,
num_ctrlpoints=num_control_points,
activation='none')
self.out_channels = in_channels
self.r34_transformer = Transformer()
for param in self.r34_transformer.parameters():
param.trainable = False
self.infer_mode = infer_mode
def forward(self, x):
output = {}
if self.infer_mode:
output["lr_img"] = x
y = x
else:
output["lr_img"] = x[0]
output["hr_img"] = x[1]
y = x[0]
if self.stn and self.training:
_, ctrl_points_x = self.stn_head(y)
y, _ = self.tps(y, ctrl_points_x)
block = {'1': self.block1(y)}
for i in range(self.srb_nums + 1):
block[str(i + 2)] = getattr(self,
'block%d' % (i + 2))(block[str(i + 1)])
block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3)) \
((block['1'] + block[str(self.srb_nums + 2)]))
sr_img = torch.tanh(block[str(self.srb_nums + 3)])
output["sr_img"] = sr_img
if self.training:
hr_img = x[1]
length = x[2]
input_tensor = x[3]
# add transformer
sr_pred, word_attention_map_pred, _ = self.r34_transformer(
sr_img, length, input_tensor)
hr_pred, word_attention_map_gt, _ = self.r34_transformer(
hr_img, length, input_tensor)
output["hr_img"] = hr_img
output["hr_pred"] = hr_pred
output["word_attention_map_gt"] = word_attention_map_gt
output["sr_pred"] = sr_pred
output["word_attention_map_pred"] = word_attention_map_pred
return output
class RecurrentResidualBlock(nn.Module):
def __init__(self, channels):
super(RecurrentResidualBlock, self).__init__()
self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
self.bn1 = nn.BatchNorm2d(channels)
self.gru1 = GruBlock(channels, channels)
self.prelu = mish()
self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
self.bn2 = nn.BatchNorm2d(channels)
self.gru2 = GruBlock(channels, channels)
def forward(self, x):
residual = self.conv1(x)
residual = self.bn1(residual)
residual = self.prelu(residual)
residual = self.conv2(residual)
residual = self.bn2(residual)
residual = self.gru1(residual.permute(0, 1, 3, 2).contiguous()).permute(0, 1, 3, 2).contiguous()
return self.gru2(x + residual).contiguous()
class UpsampleBLock(nn.Module):
def __init__(self, in_channels, up_scale):
super(UpsampleBLock, self).__init__()
self.conv = nn.Conv2d(
in_channels, in_channels * up_scale**2, kernel_size=3, padding=1)
self.pixel_shuffle = nn.PixelShuffle(up_scale)
self.prelu = mish()
def forward(self, x):
x = self.conv(x)
x = self.pixel_shuffle(x)
x = self.prelu(x)
return x
class mish(nn.Module):
def __init__(self, ):
super(mish, self).__init__()
self.activated = True
def forward(self, x):
if self.activated:
x = x * (torch.tanh(F.softplus(x)))
return x
class GruBlock(nn.Module):
def __init__(self, in_channels, out_channels):
super(GruBlock, self).__init__()
assert out_channels % 2 == 0
self.conv1 = nn.Conv2d(
in_channels, out_channels, kernel_size=1, padding=0)
self.gru = nn.GRU(out_channels,
out_channels // 2,
bidirectional=True,
batch_first=True,
)
def forward(self, x):
# x: b, c, w, h
x = self.conv1(x)
x = x.permute(0, 2, 3, 1).contiguous() # b, w, h, c
batch_size, w, h, c = x.size()
x = x.view(batch_size * w, h, c) # b*w, h, c
x, _ = self.gru(x)
x = x.view(batch_size, w, h, c)
x = x.permute(0, 3, 1, 2).contiguous()
return x
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import copy
__all__ = ['build_post_process']
def build_post_process(config, global_config=None):
from .db_postprocess import DBPostProcess
from .east_postprocess import EASTPostProcess
from .sast_postprocess import SASTPostProcess
from .fce_postprocess import FCEPostProcess
from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, TableLabelDecode, \
NRTRLabelDecode, SARLabelDecode, ViTSTRLabelDecode, RFLLabelDecode
from .cls_postprocess import ClsPostProcess
from .pg_postprocess import PGPostProcess
from .rec_postprocess import CANLabelDecode
support_dict = [
'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode',
'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode', 'PGPostProcess',
'TableLabelDecode', 'NRTRLabelDecode', 'SARLabelDecode', 'FCEPostProcess',
'ViTSTRLabelDecode','CANLabelDecode', 'RFLLabelDecode'
]
if config['name'] == 'PSEPostProcess':
from .pse_postprocess import PSEPostProcess
support_dict.append('PSEPostProcess')
config = copy.deepcopy(config)
module_name = config.pop('name')
if global_config is not None:
config.update(global_config)
assert module_name in support_dict, Exception(
'post process only support {}, but got {}'.format(support_dict, module_name))
module_class = eval(module_name)(**config)
return module_class
\ No newline at end of file
import torch
class ClsPostProcess(object):
""" Convert between text-label and text-index """
def __init__(self, label_list, **kwargs):
super(ClsPostProcess, self).__init__()
self.label_list = label_list
def __call__(self, preds, label=None, *args, **kwargs):
if isinstance(preds, torch.Tensor):
preds = preds.cpu().numpy()
pred_idxs = preds.argmax(axis=1)
decode_out = [(self.label_list[idx], preds[i, idx])
for i, idx in enumerate(pred_idxs)]
if label is None:
return decode_out
label = [(self.label_list[idx], 1.0) for idx in label]
return decode_out, label
\ No newline at end of file
"""
This code is refered from:
https://github.com/WenmuZhou/DBNet.pytorch/blob/master/post_processing/seg_detector_representer.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import cv2
import torch
from shapely.geometry import Polygon
import pyclipper
class DBPostProcess(object):
"""
The post process for Differentiable Binarization (DB).
"""
def __init__(self,
thresh=0.3,
box_thresh=0.7,
max_candidates=1000,
unclip_ratio=2.0,
use_dilation=False,
score_mode="fast",
**kwargs):
self.thresh = thresh
self.box_thresh = box_thresh
self.max_candidates = max_candidates
self.unclip_ratio = unclip_ratio
self.min_size = 3
self.score_mode = score_mode
assert score_mode in [
"slow", "fast"
], "Score mode must be in [slow, fast] but got: {}".format(score_mode)
self.dilation_kernel = None if not use_dilation else np.array(
[[1, 1], [1, 1]])
def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
'''
_bitmap: single map with shape (1, H, W),
whose values are binarized as {0, 1}
'''
bitmap = _bitmap
height, width = bitmap.shape
outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST,
cv2.CHAIN_APPROX_SIMPLE)
if len(outs) == 3:
img, contours, _ = outs[0], outs[1], outs[2]
elif len(outs) == 2:
contours, _ = outs[0], outs[1]
num_contours = min(len(contours), self.max_candidates)
boxes = []
scores = []
for index in range(num_contours):
contour = contours[index]
points, sside = self.get_mini_boxes(contour)
if sside < self.min_size:
continue
points = np.array(points)
if self.score_mode == "fast":
score = self.box_score_fast(pred, points.reshape(-1, 2))
else:
score = self.box_score_slow(pred, contour)
if self.box_thresh > score:
continue
box = self.unclip(points).reshape(-1, 1, 2)
box, sside = self.get_mini_boxes(box)
if sside < self.min_size + 2:
continue
box = np.array(box)
box[:, 0] = np.clip(
np.round(box[:, 0] / width * dest_width), 0, dest_width)
box[:, 1] = np.clip(
np.round(box[:, 1] / height * dest_height), 0, dest_height)
boxes.append(box.astype(np.int16))
scores.append(score)
return np.array(boxes, dtype=np.int16), scores
def unclip(self, box):
unclip_ratio = self.unclip_ratio
poly = Polygon(box)
distance = poly.area * unclip_ratio / poly.length
offset = pyclipper.PyclipperOffset()
offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
expanded = np.array(offset.Execute(distance))
return expanded
def get_mini_boxes(self, contour):
bounding_box = cv2.minAreaRect(contour)
points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])
index_1, index_2, index_3, index_4 = 0, 1, 2, 3
if points[1][1] > points[0][1]:
index_1 = 0
index_4 = 1
else:
index_1 = 1
index_4 = 0
if points[3][1] > points[2][1]:
index_2 = 2
index_3 = 3
else:
index_2 = 3
index_3 = 2
box = [
points[index_1], points[index_2], points[index_3], points[index_4]
]
return box, min(bounding_box[1])
def box_score_fast(self, bitmap, _box):
'''
box_score_fast: use bbox mean score as the mean score
'''
h, w = bitmap.shape[:2]
box = _box.copy()
xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int64), 0, w - 1)
xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int64), 0, w - 1)
ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int64), 0, h - 1)
ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int64), 0, h - 1)
mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
box[:, 0] = box[:, 0] - xmin
box[:, 1] = box[:, 1] - ymin
cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
def box_score_slow(self, bitmap, contour):
'''
box_score_slow: use polyon mean score as the mean score
'''
h, w = bitmap.shape[:2]
contour = contour.copy()
contour = np.reshape(contour, (-1, 2))
xmin = np.clip(np.min(contour[:, 0]), 0, w - 1)
xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)
mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
contour[:, 0] = contour[:, 0] - xmin
contour[:, 1] = contour[:, 1] - ymin
cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1)
return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
def __call__(self, outs_dict, shape_list):
pred = outs_dict['maps']
if isinstance(pred, torch.Tensor):
pred = pred.cpu().numpy()
pred = pred[:, 0, :, :]
segmentation = pred > self.thresh
boxes_batch = []
for batch_index in range(pred.shape[0]):
src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
if self.dilation_kernel is not None:
mask = cv2.dilate(
np.array(segmentation[batch_index]).astype(np.uint8),
self.dilation_kernel)
else:
mask = segmentation[batch_index]
boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask,
src_w, src_h)
boxes_batch.append({'points': boxes})
return boxes_batch
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from .locality_aware_nms import nms_locality
import cv2
# import paddle
import torch
import os
import sys
class EASTPostProcess(object):
"""
The post process for EAST.
"""
def __init__(self,
score_thresh=0.8,
cover_thresh=0.1,
nms_thresh=0.2,
**kwargs):
self.score_thresh = score_thresh
self.cover_thresh = cover_thresh
self.nms_thresh = nms_thresh
# c++ la-nms is faster, but only support python 3.5
self.is_python35 = False
if sys.version_info.major == 3 and sys.version_info.minor == 5:
self.is_python35 = True
def restore_rectangle_quad(self, origin, geometry):
"""
Restore rectangle from quadrangle.
"""
# quad
origin_concat = np.concatenate(
(origin, origin, origin, origin), axis=1) # (n, 8)
pred_quads = origin_concat - geometry
pred_quads = pred_quads.reshape((-1, 4, 2)) # (n, 4, 2)
return pred_quads
def detect(self,
score_map,
geo_map,
score_thresh=0.8,
cover_thresh=0.1,
nms_thresh=0.2):
"""
restore text boxes from score map and geo map
"""
score_map = score_map[0]
geo_map = np.swapaxes(geo_map, 1, 0)
geo_map = np.swapaxes(geo_map, 1, 2)
# filter the score map
xy_text = np.argwhere(score_map > score_thresh)
if len(xy_text) == 0:
return []
# sort the text boxes via the y axis
xy_text = xy_text[np.argsort(xy_text[:, 0])]
# restore quad proposals
text_box_restored = self.restore_rectangle_quad(
xy_text[:, ::-1] * 4, geo_map[xy_text[:, 0], xy_text[:, 1], :])
boxes = np.zeros((text_box_restored.shape[0], 9), dtype=np.float32)
boxes[:, :8] = text_box_restored.reshape((-1, 8))
boxes[:, 8] = score_map[xy_text[:, 0], xy_text[:, 1]]
if self.is_python35:
import lanms
boxes = lanms.merge_quadrangle_n9(boxes, nms_thresh)
else:
boxes = nms_locality(boxes.astype(np.float64), nms_thresh)
if boxes.shape[0] == 0:
return []
# Here we filter some low score boxes by the average score map,
# this is different from the orginal paper.
for i, box in enumerate(boxes):
mask = np.zeros_like(score_map, dtype=np.uint8)
cv2.fillPoly(mask, box[:8].reshape(
(-1, 4, 2)).astype(np.int32) // 4, 1)
boxes[i, 8] = cv2.mean(score_map, mask)[0]
boxes = boxes[boxes[:, 8] > cover_thresh]
return boxes
def sort_poly(self, p):
"""
Sort polygons.
"""
min_axis = np.argmin(np.sum(p, axis=1))
p = p[[min_axis, (min_axis + 1) % 4, \
(min_axis + 2) % 4, (min_axis + 3) % 4]]
if abs(p[0, 0] - p[1, 0]) > abs(p[0, 1] - p[1, 1]):
return p
else:
return p[[0, 3, 2, 1]]
def __call__(self, outs_dict, shape_list):
score_list = outs_dict['f_score']
geo_list = outs_dict['f_geo']
if isinstance(score_list, torch.Tensor):
score_list = score_list.cpu().numpy()
geo_list = geo_list.cpu().numpy()
img_num = len(shape_list)
dt_boxes_list = []
for ino in range(img_num):
score = score_list[ino]
geo = geo_list[ino]
boxes = self.detect(
score_map=score,
geo_map=geo,
score_thresh=self.score_thresh,
cover_thresh=self.cover_thresh,
nms_thresh=self.nms_thresh)
boxes_norm = []
if len(boxes) > 0:
h, w = score.shape[1:]
src_h, src_w, ratio_h, ratio_w = shape_list[ino]
boxes = boxes[:, :8].reshape((-1, 4, 2))
boxes[:, :, 0] /= ratio_w
boxes[:, :, 1] /= ratio_h
for i_box, box in enumerate(boxes):
box = self.sort_poly(box.astype(np.int32))
if np.linalg.norm(box[0] - box[1]) < 5 \
or np.linalg.norm(box[3] - box[0]) < 5:
continue
boxes_norm.append(box)
dt_boxes_list.append({'points': np.array(boxes_norm)})
return dt_boxes_list
\ No newline at end of file
"""
This code is refer from:
https://github.com/open-mmlab/mmocr/blob/v0.3.0/mmocr/models/textdet/postprocess/wrapper.py
"""
import cv2
import torch
import numpy as np
from numpy.fft import ifft
from pytorchocr.utils.poly_nms import poly_nms, valid_boundary
def fill_hole(input_mask):
h, w = input_mask.shape
canvas = np.zeros((h + 2, w + 2), np.uint8)
canvas[1:h + 1, 1:w + 1] = input_mask.copy()
mask = np.zeros((h + 4, w + 4), np.uint8)
cv2.floodFill(canvas, mask, (0, 0), 1)
canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool)
return ~canvas | input_mask
def fourier2poly(fourier_coeff, num_reconstr_points=50):
""" Inverse Fourier transform
Args:
fourier_coeff (ndarray): Fourier coefficients shaped (n, 2k+1),
with n and k being candidates number and Fourier degree
respectively.
num_reconstr_points (int): Number of reconstructed polygon points.
Returns:
Polygons (ndarray): The reconstructed polygons shaped (n, n')
"""
a = np.zeros((len(fourier_coeff), num_reconstr_points), dtype='complex')
k = (len(fourier_coeff[0]) - 1) // 2
a[:, 0:k + 1] = fourier_coeff[:, k:]
a[:, -k:] = fourier_coeff[:, :k]
poly_complex = ifft(a) * num_reconstr_points
polygon = np.zeros((len(fourier_coeff), num_reconstr_points, 2))
polygon[:, :, 0] = poly_complex.real
polygon[:, :, 1] = poly_complex.imag
return polygon.astype('int32').reshape((len(fourier_coeff), -1))
class FCEPostProcess(object):
"""
The post process for FCENet.
"""
def __init__(self,
scales,
fourier_degree=5,
num_reconstr_points=50,
decoding_type='fcenet',
score_thr=0.3,
nms_thr=0.1,
alpha=1.0,
beta=1.0,
box_type='poly',
**kwargs):
self.scales = scales
self.fourier_degree = fourier_degree
self.num_reconstr_points = num_reconstr_points
self.decoding_type = decoding_type
self.score_thr = score_thr
self.nms_thr = nms_thr
self.alpha = alpha
self.beta = beta
self.box_type = box_type
def __call__(self, preds, shape_list):
score_maps = []
for key, value in preds.items():
if isinstance(value, torch.Tensor):
value = value.numpy()
cls_res = value[:, :4, :, :]
reg_res = value[:, 4:, :, :]
score_maps.append([cls_res, reg_res])
return self.get_boundary(score_maps, shape_list)
def resize_boundary(self, boundaries, scale_factor):
"""Rescale boundaries via scale_factor.
Args:
boundaries (list[list[float]]): The boundary list. Each boundary
with size 2k+1 with k>=4.
scale_factor(ndarray): The scale factor of size (4,).
Returns:
boundaries (list[list[float]]): The scaled boundaries.
"""
boxes = []
scores = []
for b in boundaries:
sz = len(b)
valid_boundary(b, True)
scores.append(b[-1])
b = (np.array(b[:sz - 1]) *
(np.tile(scale_factor[:2], int(
(sz - 1) / 2)).reshape(1, sz - 1))).flatten().tolist()
boxes.append(np.array(b).reshape([-1, 2]))
return np.array(boxes, dtype=np.float32), scores
def get_boundary(self, score_maps, shape_list):
assert len(score_maps) == len(self.scales)
boundaries = []
for idx, score_map in enumerate(score_maps):
scale = self.scales[idx]
boundaries = boundaries + self._get_boundary_single(score_map,
scale)
# nms
boundaries = poly_nms(boundaries, self.nms_thr)
boundaries, scores = self.resize_boundary(
boundaries, (1 / shape_list[0, 2:]).tolist()[::-1])
boxes_batch = [dict(points=boundaries, scores=scores)]
return boxes_batch
def _get_boundary_single(self, score_map, scale):
assert len(score_map) == 2
assert score_map[1].shape[1] == 4 * self.fourier_degree + 2
return self.fcenet_decode(
preds=score_map,
fourier_degree=self.fourier_degree,
num_reconstr_points=self.num_reconstr_points,
scale=scale,
alpha=self.alpha,
beta=self.beta,
box_type=self.box_type,
score_thr=self.score_thr,
nms_thr=self.nms_thr)
def fcenet_decode(self,
preds,
fourier_degree,
num_reconstr_points,
scale,
alpha=1.0,
beta=2.0,
box_type='poly',
score_thr=0.3,
nms_thr=0.1):
"""Decoding predictions of FCENet to instances.
Args:
preds (list(Tensor)): The head output tensors.
fourier_degree (int): The maximum Fourier transform degree k.
num_reconstr_points (int): The points number of the polygon
reconstructed from predicted Fourier coefficients.
scale (int): The down-sample scale of the prediction.
alpha (float) : The parameter to calculate final scores. Score_{final}
= (Score_{text region} ^ alpha)
* (Score_{text center region}^ beta)
beta (float) : The parameter to calculate final score.
box_type (str): Boundary encoding type 'poly' or 'quad'.
score_thr (float) : The threshold used to filter out the final
candidates.
nms_thr (float) : The threshold of nms.
Returns:
boundaries (list[list[float]]): The instance boundary and confidence
list.
"""
assert isinstance(preds, list)
assert len(preds) == 2
assert box_type in ['poly', 'quad']
cls_pred = preds[0][0]
tr_pred = cls_pred[0:2]
tcl_pred = cls_pred[2:]
reg_pred = preds[1][0].transpose([1, 2, 0])
x_pred = reg_pred[:, :, :2 * fourier_degree + 1]
y_pred = reg_pred[:, :, 2 * fourier_degree + 1:]
score_pred = (tr_pred[1]**alpha) * (tcl_pred[1]**beta)
tr_pred_mask = (score_pred) > score_thr
tr_mask = fill_hole(tr_pred_mask)
tr_contours, _ = cv2.findContours(
tr_mask.astype(np.uint8), cv2.RETR_TREE,
cv2.CHAIN_APPROX_SIMPLE) # opencv4
mask = np.zeros_like(tr_mask)
boundaries = []
for cont in tr_contours:
deal_map = mask.copy().astype(np.int8)
cv2.drawContours(deal_map, [cont], -1, 1, -1)
score_map = score_pred * deal_map
score_mask = score_map > 0
xy_text = np.argwhere(score_mask)
dxy = xy_text[:, 1] + xy_text[:, 0] * 1j
x, y = x_pred[score_mask], y_pred[score_mask]
c = x + y * 1j
c[:, fourier_degree] = c[:, fourier_degree] + dxy
c *= scale
polygons = fourier2poly(c, num_reconstr_points)
score = score_map[score_mask].reshape(-1, 1)
polygons = poly_nms(np.hstack((polygons, score)).tolist(), nms_thr)
boundaries = boundaries + polygons
boundaries = poly_nms(boundaries, nms_thr)
if box_type == 'quad':
new_boundaries = []
for boundary in boundaries:
poly = np.array(boundary[:-1]).reshape(-1, 2).astype(np.float32)
score = boundary[-1]
points = cv2.boxPoints(cv2.minAreaRect(poly))
points = np.int0(points)
new_boundaries.append(points.reshape(-1).tolist() + [score])
boundaries = new_boundaries
return boundaries
"""
Locality aware nms.
"""
import numpy as np
from shapely.geometry import Polygon
def intersection(g, p):
"""
Intersection.
"""
g = Polygon(g[:8].reshape((4, 2)))
p = Polygon(p[:8].reshape((4, 2)))
g = g.buffer(0)
p = p.buffer(0)
if not g.is_valid or not p.is_valid:
return 0
inter = Polygon(g).intersection(Polygon(p)).area
union = g.area + p.area - inter
if union == 0:
return 0
else:
return inter / union
def intersection_iog(g, p):
"""
Intersection_iog.
"""
g = Polygon(g[:8].reshape((4, 2)))
p = Polygon(p[:8].reshape((4, 2)))
if not g.is_valid or not p.is_valid:
return 0
inter = Polygon(g).intersection(Polygon(p)).area
#union = g.area + p.area - inter
union = p.area
if union == 0:
print("p_area is very small")
return 0
else:
return inter / union
def weighted_merge(g, p):
"""
Weighted merge.
"""
g[:8] = (g[8] * g[:8] + p[8] * p[:8]) / (g[8] + p[8])
g[8] = (g[8] + p[8])
return g
def standard_nms(S, thres):
"""
Standard nms.
"""
order = np.argsort(S[:, 8])[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
ovr = np.array([intersection(S[i], S[t]) for t in order[1:]])
inds = np.where(ovr <= thres)[0]
order = order[inds + 1]
return S[keep]
def standard_nms_inds(S, thres):
"""
Standard nms, retun inds.
"""
order = np.argsort(S[:, 8])[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
ovr = np.array([intersection(S[i], S[t]) for t in order[1:]])
inds = np.where(ovr <= thres)[0]
order = order[inds + 1]
return keep
def nms(S, thres):
"""
nms.
"""
order = np.argsort(S[:, 8])[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
ovr = np.array([intersection(S[i], S[t]) for t in order[1:]])
inds = np.where(ovr <= thres)[0]
order = order[inds + 1]
return keep
def soft_nms(boxes_in, Nt_thres=0.3, threshold=0.8, sigma=0.5, method=2):
"""
soft_nms
:para boxes_in, N x 9 (coords + score)
:para threshould, eliminate cases min score(0.001)
:para Nt_thres, iou_threshi
:para sigma, gaussian weght
:method, linear or gaussian
"""
boxes = boxes_in.copy()
N = boxes.shape[0]
if N is None or N < 1:
return np.array([])
pos, maxpos = 0, 0
weight = 0.0
inds = np.arange(N)
tbox, sbox = boxes[0].copy(), boxes[0].copy()
for i in range(N):
maxscore = boxes[i, 8]
maxpos = i
tbox = boxes[i].copy()
ti = inds[i]
pos = i + 1
#get max box
while pos < N:
if maxscore < boxes[pos, 8]:
maxscore = boxes[pos, 8]
maxpos = pos
pos = pos + 1
#add max box as a detection
boxes[i, :] = boxes[maxpos, :]
inds[i] = inds[maxpos]
#swap
boxes[maxpos, :] = tbox
inds[maxpos] = ti
tbox = boxes[i].copy()
pos = i + 1
#NMS iteration
while pos < N:
sbox = boxes[pos].copy()
ts_iou_val = intersection(tbox, sbox)
if ts_iou_val > 0:
if method == 1:
if ts_iou_val > Nt_thres:
weight = 1 - ts_iou_val
else:
weight = 1
elif method == 2:
weight = np.exp(-1.0 * ts_iou_val**2 / sigma)
else:
if ts_iou_val > Nt_thres:
weight = 0
else:
weight = 1
boxes[pos, 8] = weight * boxes[pos, 8]
#if box score falls below thresold, discard the box by
#swaping last box update N
if boxes[pos, 8] < threshold:
boxes[pos, :] = boxes[N - 1, :]
inds[pos] = inds[N - 1]
N = N - 1
pos = pos - 1
pos = pos + 1
return boxes[:N]
def nms_locality(polys, thres=0.3):
"""
locality aware nms of EAST
:param polys: a N*9 numpy array. first 8 coordinates, then prob
:return: boxes after nms
"""
S = []
p = None
for g in polys:
if p is not None and intersection(g, p) > thres:
p = weighted_merge(g, p)
else:
if p is not None:
S.append(p)
p = g
if p is not None:
S.append(p)
if len(S) == 0:
return np.array([])
return standard_nms(np.array(S), thres)
if __name__ == '__main__':
# 343,350,448,135,474,143,369,359
print(
Polygon(np.array([[343, 350], [448, 135], [474, 143], [369, 359]]))
.area)
\ No newline at end of file
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
__dir__ = os.path.dirname(__file__)
sys.path.append(__dir__)
sys.path.append(os.path.join(__dir__, '..'))
from pytorchocr.utils.e2e_utils.pgnet_pp_utils import PGNet_PostProcess
class PGPostProcess(object):
"""
The post process for PGNet.
"""
def __init__(self, character_dict_path, valid_set, score_thresh, mode,
**kwargs):
self.character_dict_path = character_dict_path
self.valid_set = valid_set
self.score_thresh = score_thresh
self.mode = mode
# c++ la-nms is faster, but only support python 3.5
self.is_python35 = False
if sys.version_info.major == 3 and sys.version_info.minor == 5:
self.is_python35 = True
def __call__(self, outs_dict, shape_list):
post = PGNet_PostProcess(self.character_dict_path, self.valid_set,
self.score_thresh, outs_dict, shape_list)
if self.mode == 'fast':
data = post.pg_postprocess_fast()
else:
data = post.pg_postprocess_slow()
return data
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .pse_postprocess import PSEPostProcess
\ No newline at end of file
## 编译
This code is refer from:
https://github.com/whai362/PSENet/blob/python3/models/post_processing/pse
```python
python3 setup.py build_ext --inplace
```
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import subprocess
python_path = sys.executable
ori_path = os.getcwd()
os.chdir('pytorchocr/postprocess/pse_postprocess/pse')
if subprocess.call(
'{} setup.py build_ext --inplace'.format(python_path), shell=True) != 0:
raise RuntimeError(
'Cannot compile pse: {}, if your system is windows, you need to install all the default components of `desktop development using C++` in visual studio 2019+'.
format(os.path.dirname(os.path.realpath(__file__))))
os.chdir(ori_path)
from .pse import pse
import numpy as np
import cv2
cimport numpy as np
cimport cython
cimport libcpp
cimport libcpp.pair
cimport libcpp.queue
from libcpp.pair cimport *
from libcpp.queue cimport *
@cython.boundscheck(False)
@cython.wraparound(False)
cdef np.ndarray[np.int32_t, ndim=2] _pse(np.ndarray[np.uint8_t, ndim=3] kernels,
np.ndarray[np.int32_t, ndim=2] label,
int kernel_num,
int label_num,
float min_area=0):
cdef np.ndarray[np.int32_t, ndim=2] pred
pred = np.zeros((label.shape[0], label.shape[1]), dtype=np.int32)
for label_idx in range(1, label_num):
if np.sum(label == label_idx) < min_area:
label[label == label_idx] = 0
cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] que = \
queue[libcpp.pair.pair[np.int16_t,np.int16_t]]()
cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] nxt_que = \
queue[libcpp.pair.pair[np.int16_t,np.int16_t]]()
cdef np.int16_t* dx = [-1, 1, 0, 0]
cdef np.int16_t* dy = [0, 0, -1, 1]
cdef np.int16_t tmpx, tmpy
points = np.array(np.where(label > 0)).transpose((1, 0))
for point_idx in range(points.shape[0]):
tmpx, tmpy = points[point_idx, 0], points[point_idx, 1]
que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy))
pred[tmpx, tmpy] = label[tmpx, tmpy]
cdef libcpp.pair.pair[np.int16_t,np.int16_t] cur
cdef int cur_label
for kernel_idx in range(kernel_num - 1, -1, -1):
while not que.empty():
cur = que.front()
que.pop()
cur_label = pred[cur.first, cur.second]
is_edge = True
for j in range(4):
tmpx = cur.first + dx[j]
tmpy = cur.second + dy[j]
if tmpx < 0 or tmpx >= label.shape[0] or tmpy < 0 or tmpy >= label.shape[1]:
continue
if kernels[kernel_idx, tmpx, tmpy] == 0 or pred[tmpx, tmpy] > 0:
continue
que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy))
pred[tmpx, tmpy] = cur_label
is_edge = False
if is_edge:
nxt_que.push(cur)
que, nxt_que = nxt_que, que
return pred
def pse(kernels, min_area):
kernel_num = kernels.shape[0]
label_num, label = cv2.connectedComponents(kernels[-1], connectivity=4)
return _pse(kernels[:-1], label, kernel_num, label_num, min_area)
\ No newline at end of file
from distutils.core import setup, Extension
from Cython.Build import cythonize
import numpy
setup(ext_modules=cythonize(Extension(
'pse',
sources=['pse.pyx'],
language='c++',
include_dirs=[numpy.get_include()],
library_dirs=[],
libraries=[],
extra_compile_args=['-O3'],
extra_link_args=[]
)))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment