Commit 1f6d3149 authored by LDOUBLEV's avatar LDOUBLEV
Browse files

Merge branch 'dygraph' of https://github.com/PaddlePaddle/PaddleOCR into fix_doc

parents 2b4f62f9 f14b79b4
......@@ -28,8 +28,10 @@ def build_backbone(config, model_type):
from .rec_mv1_enhance import MobileNetV1Enhance
from .rec_nrtr_mtb import MTB
from .rec_resnet_31 import ResNet31
from .rec_resnet_aster import ResNet_ASTER
support_dict = [
'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB', "ResNet31"
'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB',
"ResNet31", "ResNet_ASTER"
]
elif model_type == "e2e":
from .e2e_resnet_vd_pg import ResNet
......
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import sys
import math
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2D(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias_attr=False)
def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2D(
in_planes, out_planes, kernel_size=1, stride=stride, bias_attr=False)
def get_sinusoid_encoding(n_position, feat_dim, wave_length=10000):
# [n_position]
positions = paddle.arange(0, n_position)
# [feat_dim]
dim_range = paddle.arange(0, feat_dim)
dim_range = paddle.pow(wave_length, 2 * (dim_range // 2) / feat_dim)
# [n_position, feat_dim]
angles = paddle.unsqueeze(
positions, axis=1) / paddle.unsqueeze(
dim_range, axis=0)
angles = paddle.cast(angles, "float32")
angles[:, 0::2] = paddle.sin(angles[:, 0::2])
angles[:, 1::2] = paddle.cos(angles[:, 1::2])
return angles
class AsterBlock(nn.Layer):
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(AsterBlock, self).__init__()
self.conv1 = conv1x1(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2D(planes)
self.relu = nn.ReLU()
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2D(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet_ASTER(nn.Layer):
"""For aster or crnn"""
def __init__(self, with_lstm=True, n_group=1, in_channels=3):
super(ResNet_ASTER, self).__init__()
self.with_lstm = with_lstm
self.n_group = n_group
self.layer0 = nn.Sequential(
nn.Conv2D(
in_channels,
32,
kernel_size=(3, 3),
stride=1,
padding=1,
bias_attr=False),
nn.BatchNorm2D(32),
nn.ReLU())
self.inplanes = 32
self.layer1 = self._make_layer(32, 3, [2, 2]) # [16, 50]
self.layer2 = self._make_layer(64, 4, [2, 2]) # [8, 25]
self.layer3 = self._make_layer(128, 6, [2, 1]) # [4, 25]
self.layer4 = self._make_layer(256, 6, [2, 1]) # [2, 25]
self.layer5 = self._make_layer(512, 3, [2, 1]) # [1, 25]
if with_lstm:
self.rnn = nn.LSTM(512, 256, direction="bidirect", num_layers=2)
self.out_channels = 2 * 256
else:
self.out_channels = 512
def _make_layer(self, planes, blocks, stride):
downsample = None
if stride != [1, 1] or self.inplanes != planes:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes, stride), nn.BatchNorm2D(planes))
layers = []
layers.append(AsterBlock(self.inplanes, planes, stride, downsample))
self.inplanes = planes
for _ in range(1, blocks):
layers.append(AsterBlock(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x0 = self.layer0(x)
x1 = self.layer1(x0)
x2 = self.layer2(x1)
x3 = self.layer3(x2)
x4 = self.layer4(x3)
x5 = self.layer5(x4)
cnn_feat = x5.squeeze(2) # [N, c, w]
cnn_feat = paddle.transpose(cnn_feat, perm=[0, 2, 1])
if self.with_lstm:
rnn_feat, _ = self.rnn(cnn_feat)
return rnn_feat
else:
return cnn_feat
......@@ -29,13 +29,14 @@ def build_head(config):
from .rec_srn_head import SRNHead
from .rec_nrtr_head import Transformer
from .rec_sar_head import SARHead
from .rec_aster_head import AsterHead
# cls head
from .cls_head import ClsHead
support_dict = [
'DBHead', 'PSEHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead',
'AttentionHead', 'SRNHead', 'PGHead', 'Transformer',
'TableAttentionHead', 'SARHead'
'TableAttentionHead', 'SARHead', 'AsterHead'
]
#table head
......
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import paddle
from paddle import nn
from paddle.nn import functional as F
class AsterHead(nn.Layer):
def __init__(self,
in_channels,
out_channels,
sDim,
attDim,
max_len_labels,
time_step=25,
beam_width=5,
**kwargs):
super(AsterHead, self).__init__()
self.num_classes = out_channels
self.in_planes = in_channels
self.sDim = sDim
self.attDim = attDim
self.max_len_labels = max_len_labels
self.decoder = AttentionRecognitionHead(in_channels, out_channels, sDim,
attDim, max_len_labels)
self.time_step = time_step
self.embeder = Embedding(self.time_step, in_channels)
self.beam_width = beam_width
self.eos = self.num_classes - 1
def forward(self, x, targets=None, embed=None):
return_dict = {}
embedding_vectors = self.embeder(x)
if self.training:
rec_targets, rec_lengths, _ = targets
rec_pred = self.decoder([x, rec_targets, rec_lengths],
embedding_vectors)
return_dict['rec_pred'] = rec_pred
return_dict['embedding_vectors'] = embedding_vectors
else:
rec_pred, rec_pred_scores = self.decoder.beam_search(
x, self.beam_width, self.eos, embedding_vectors)
return_dict['rec_pred'] = rec_pred
return_dict['rec_pred_scores'] = rec_pred_scores
return_dict['embedding_vectors'] = embedding_vectors
return return_dict
class Embedding(nn.Layer):
def __init__(self, in_timestep, in_planes, mid_dim=4096, embed_dim=300):
super(Embedding, self).__init__()
self.in_timestep = in_timestep
self.in_planes = in_planes
self.embed_dim = embed_dim
self.mid_dim = mid_dim
self.eEmbed = nn.Linear(
in_timestep * in_planes,
self.embed_dim) # Embed encoder output to a word-embedding like
def forward(self, x):
x = paddle.reshape(x, [paddle.shape(x)[0], -1])
x = self.eEmbed(x)
return x
class AttentionRecognitionHead(nn.Layer):
"""
input: [b x 16 x 64 x in_planes]
output: probability sequence: [b x T x num_classes]
"""
def __init__(self, in_channels, out_channels, sDim, attDim, max_len_labels):
super(AttentionRecognitionHead, self).__init__()
self.num_classes = out_channels # this is the output classes. So it includes the <EOS>.
self.in_planes = in_channels
self.sDim = sDim
self.attDim = attDim
self.max_len_labels = max_len_labels
self.decoder = DecoderUnit(
sDim=sDim, xDim=in_channels, yDim=self.num_classes, attDim=attDim)
def forward(self, x, embed):
x, targets, lengths = x
batch_size = paddle.shape(x)[0]
# Decoder
state = self.decoder.get_initial_state(embed)
outputs = []
for i in range(max(lengths)):
if i == 0:
y_prev = paddle.full(
shape=[batch_size], fill_value=self.num_classes)
else:
y_prev = targets[:, i - 1]
output, state = self.decoder(x, state, y_prev)
outputs.append(output)
outputs = paddle.concat([_.unsqueeze(1) for _ in outputs], 1)
return outputs
# inference stage.
def sample(self, x):
x, _, _ = x
batch_size = x.size(0)
# Decoder
state = paddle.zeros([1, batch_size, self.sDim])
predicted_ids, predicted_scores = [], []
for i in range(self.max_len_labels):
if i == 0:
y_prev = paddle.full(
shape=[batch_size], fill_value=self.num_classes)
else:
y_prev = predicted
output, state = self.decoder(x, state, y_prev)
output = F.softmax(output, axis=1)
score, predicted = output.max(1)
predicted_ids.append(predicted.unsqueeze(1))
predicted_scores.append(score.unsqueeze(1))
predicted_ids = paddle.concat([predicted_ids, 1])
predicted_scores = paddle.concat([predicted_scores, 1])
# return predicted_ids.squeeze(), predicted_scores.squeeze()
return predicted_ids, predicted_scores
def beam_search(self, x, beam_width, eos, embed):
def _inflate(tensor, times, dim):
repeat_dims = [1] * tensor.dim()
repeat_dims[dim] = times
output = paddle.tile(tensor, repeat_dims)
return output
# https://github.com/IBM/pytorch-seq2seq/blob/fede87655ddce6c94b38886089e05321dc9802af/seq2seq/models/TopKDecoder.py
batch_size, l, d = x.shape
x = paddle.tile(
paddle.transpose(
x.unsqueeze(1), perm=[1, 0, 2, 3]), [beam_width, 1, 1, 1])
inflated_encoder_feats = paddle.reshape(
paddle.transpose(
x, perm=[1, 0, 2, 3]), [-1, l, d])
# Initialize the decoder
state = self.decoder.get_initial_state(embed, tile_times=beam_width)
pos_index = paddle.reshape(
paddle.arange(batch_size) * beam_width, shape=[-1, 1])
# Initialize the scores
sequence_scores = paddle.full(
shape=[batch_size * beam_width, 1], fill_value=-float('Inf'))
index = [i * beam_width for i in range(0, batch_size)]
sequence_scores[index] = 0.0
# Initialize the input vector
y_prev = paddle.full(
shape=[batch_size * beam_width], fill_value=self.num_classes)
# Store decisions for backtracking
stored_scores = list()
stored_predecessors = list()
stored_emitted_symbols = list()
for i in range(self.max_len_labels):
output, state = self.decoder(inflated_encoder_feats, state, y_prev)
state = paddle.unsqueeze(state, axis=0)
log_softmax_output = paddle.nn.functional.log_softmax(
output, axis=1)
sequence_scores = _inflate(sequence_scores, self.num_classes, 1)
sequence_scores += log_softmax_output
scores, candidates = paddle.topk(
paddle.reshape(sequence_scores, [batch_size, -1]),
beam_width,
axis=1)
# Reshape input = (bk, 1) and sequence_scores = (bk, 1)
y_prev = paddle.reshape(
candidates % self.num_classes, shape=[batch_size * beam_width])
sequence_scores = paddle.reshape(
scores, shape=[batch_size * beam_width, 1])
# Update fields for next timestep
pos_index = paddle.expand_as(pos_index, candidates)
predecessors = paddle.cast(
candidates / self.num_classes + pos_index, dtype='int64')
predecessors = paddle.reshape(
predecessors, shape=[batch_size * beam_width, 1])
state = paddle.index_select(
state, index=predecessors.squeeze(), axis=1)
# Update sequence socres and erase scores for <eos> symbol so that they aren't expanded
stored_scores.append(sequence_scores.clone())
y_prev = paddle.reshape(y_prev, shape=[-1, 1])
eos_prev = paddle.full_like(y_prev, fill_value=eos)
mask = eos_prev == y_prev
mask = paddle.nonzero(mask)
if mask.dim() > 0:
sequence_scores = sequence_scores.numpy()
mask = mask.numpy()
sequence_scores[mask] = -float('inf')
sequence_scores = paddle.to_tensor(sequence_scores)
# Cache results for backtracking
stored_predecessors.append(predecessors)
y_prev = paddle.squeeze(y_prev)
stored_emitted_symbols.append(y_prev)
# Do backtracking to return the optimal values
#====== backtrak ======#
# Initialize return variables given different types
p = list()
l = [[self.max_len_labels] * beam_width for _ in range(batch_size)
] # Placeholder for lengths of top-k sequences
# the last step output of the beams are not sorted
# thus they are sorted here
sorted_score, sorted_idx = paddle.topk(
paddle.reshape(
stored_scores[-1], shape=[batch_size, beam_width]),
beam_width)
# initialize the sequence scores with the sorted last step beam scores
s = sorted_score.clone()
batch_eos_found = [0] * batch_size # the number of EOS found
# in the backward loop below for each batch
t = self.max_len_labels - 1
# initialize the back pointer with the sorted order of the last step beams.
# add pos_index for indexing variable with b*k as the first dimension.
t_predecessors = paddle.reshape(
sorted_idx + pos_index.expand_as(sorted_idx),
shape=[batch_size * beam_width])
while t >= 0:
# Re-order the variables with the back pointer
current_symbol = paddle.index_select(
stored_emitted_symbols[t], index=t_predecessors, axis=0)
t_predecessors = paddle.index_select(
stored_predecessors[t].squeeze(), index=t_predecessors, axis=0)
eos_indices = stored_emitted_symbols[t] == eos
eos_indices = paddle.nonzero(eos_indices)
if eos_indices.dim() > 0:
for i in range(eos_indices.shape[0] - 1, -1, -1):
# Indices of the EOS symbol for both variables
# with b*k as the first dimension, and b, k for
# the first two dimensions
idx = eos_indices[i]
b_idx = int(idx[0] / beam_width)
# The indices of the replacing position
# according to the replacement strategy noted above
res_k_idx = beam_width - (batch_eos_found[b_idx] %
beam_width) - 1
batch_eos_found[b_idx] += 1
res_idx = b_idx * beam_width + res_k_idx
# Replace the old information in return variables
# with the new ended sequence information
t_predecessors[res_idx] = stored_predecessors[t][idx[0]]
current_symbol[res_idx] = stored_emitted_symbols[t][idx[0]]
s[b_idx, res_k_idx] = stored_scores[t][idx[0], 0]
l[b_idx][res_k_idx] = t + 1
# record the back tracked results
p.append(current_symbol)
t -= 1
# Sort and re-order again as the added ended sequences may change
# the order (very unlikely)
s, re_sorted_idx = s.topk(beam_width)
for b_idx in range(batch_size):
l[b_idx] = [
l[b_idx][k_idx.item()] for k_idx in re_sorted_idx[b_idx, :]
]
re_sorted_idx = paddle.reshape(
re_sorted_idx + pos_index.expand_as(re_sorted_idx),
[batch_size * beam_width])
# Reverse the sequences and re-order at the same time
# It is reversed because the backtracking happens in reverse time order
p = [
paddle.reshape(
paddle.index_select(step, re_sorted_idx, 0),
shape=[batch_size, beam_width, -1]) for step in reversed(p)
]
p = paddle.concat(p, -1)[:, 0, :]
return p, paddle.ones_like(p)
class AttentionUnit(nn.Layer):
def __init__(self, sDim, xDim, attDim):
super(AttentionUnit, self).__init__()
self.sDim = sDim
self.xDim = xDim
self.attDim = attDim
self.sEmbed = nn.Linear(sDim, attDim)
self.xEmbed = nn.Linear(xDim, attDim)
self.wEmbed = nn.Linear(attDim, 1)
def forward(self, x, sPrev):
batch_size, T, _ = x.shape # [b x T x xDim]
x = paddle.reshape(x, [-1, self.xDim]) # [(b x T) x xDim]
xProj = self.xEmbed(x) # [(b x T) x attDim]
xProj = paddle.reshape(xProj, [batch_size, T, -1]) # [b x T x attDim]
sPrev = sPrev.squeeze(0)
sProj = self.sEmbed(sPrev) # [b x attDim]
sProj = paddle.unsqueeze(sProj, 1) # [b x 1 x attDim]
sProj = paddle.expand(sProj,
[batch_size, T, self.attDim]) # [b x T x attDim]
sumTanh = paddle.tanh(sProj + xProj)
sumTanh = paddle.reshape(sumTanh, [-1, self.attDim])
vProj = self.wEmbed(sumTanh) # [(b x T) x 1]
vProj = paddle.reshape(vProj, [batch_size, T])
alpha = F.softmax(
vProj, axis=1) # attention weights for each sample in the minibatch
return alpha
class DecoderUnit(nn.Layer):
def __init__(self, sDim, xDim, yDim, attDim):
super(DecoderUnit, self).__init__()
self.sDim = sDim
self.xDim = xDim
self.yDim = yDim
self.attDim = attDim
self.emdDim = attDim
self.attention_unit = AttentionUnit(sDim, xDim, attDim)
self.tgt_embedding = nn.Embedding(
yDim + 1, self.emdDim, weight_attr=nn.initializer.Normal(
std=0.01)) # the last is used for <BOS>
self.gru = nn.GRUCell(input_size=xDim + self.emdDim, hidden_size=sDim)
self.fc = nn.Linear(
sDim,
yDim,
weight_attr=nn.initializer.Normal(std=0.01),
bias_attr=nn.initializer.Constant(value=0))
self.embed_fc = nn.Linear(300, self.sDim)
def get_initial_state(self, embed, tile_times=1):
assert embed.shape[1] == 300
state = self.embed_fc(embed) # N * sDim
if tile_times != 1:
state = state.unsqueeze(1)
trans_state = paddle.transpose(state, perm=[1, 0, 2])
state = paddle.tile(trans_state, repeat_times=[tile_times, 1, 1])
trans_state = paddle.transpose(state, perm=[1, 0, 2])
state = paddle.reshape(trans_state, shape=[-1, self.sDim])
state = state.unsqueeze(0) # 1 * N * sDim
return state
def forward(self, x, sPrev, yPrev):
# x: feature sequence from the image decoder.
batch_size, T, _ = x.shape
alpha = self.attention_unit(x, sPrev)
context = paddle.squeeze(paddle.matmul(alpha.unsqueeze(1), x), axis=1)
yPrev = paddle.cast(yPrev, dtype="int64")
yProj = self.tgt_embedding(yPrev)
concat_context = paddle.concat([yProj, context], 1)
concat_context = paddle.squeeze(concat_context, 1)
sPrev = paddle.squeeze(sPrev, 0)
output, state = self.gru(concat_context, sPrev)
output = paddle.squeeze(output, axis=1)
output = self.fc(output)
return output, state
\ No newline at end of file
......@@ -38,6 +38,7 @@ class CTCHead(nn.Layer):
out_channels,
fc_decay=0.0004,
mid_channels=None,
return_feats=False,
**kwargs):
super(CTCHead, self).__init__()
if mid_channels is None:
......@@ -66,14 +67,22 @@ class CTCHead(nn.Layer):
bias_attr=bias_attr2)
self.out_channels = out_channels
self.mid_channels = mid_channels
self.return_feats = return_feats
def forward(self, x, targets=None):
if self.mid_channels is None:
predicts = self.fc(x)
else:
predicts = self.fc1(x)
predicts = self.fc2(predicts)
x = self.fc1(x)
predicts = self.fc2(x)
if self.return_feats:
result = (x, predicts)
else:
result = predicts
if not self.training:
predicts = F.softmax(predicts, axis=2)
return predicts
result = predicts
return result
......@@ -17,8 +17,9 @@ __all__ = ['build_transform']
def build_transform(config):
from .tps import TPS
from .stn import STN_ON
support_dict = ['TPS']
support_dict = ['TPS', 'STN_ON']
module_name = config.pop('name')
assert module_name in support_dict, Exception(
......
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn, ParamAttr
from paddle.nn import functional as F
import numpy as np
from .tps_spatial_transformer import TPSSpatialTransformer
def conv3x3_block(in_channels, out_channels, stride=1):
n = 3 * 3 * out_channels
w = math.sqrt(2. / n)
conv_layer = nn.Conv2D(
in_channels,
out_channels,
kernel_size=3,
stride=stride,
padding=1,
weight_attr=nn.initializer.Normal(
mean=0.0, std=w),
bias_attr=nn.initializer.Constant(0))
block = nn.Sequential(conv_layer, nn.BatchNorm2D(out_channels), nn.ReLU())
return block
class STN(nn.Layer):
def __init__(self, in_channels, num_ctrlpoints, activation='none'):
super(STN, self).__init__()
self.in_channels = in_channels
self.num_ctrlpoints = num_ctrlpoints
self.activation = activation
self.stn_convnet = nn.Sequential(
conv3x3_block(in_channels, 32), #32x64
nn.MaxPool2D(
kernel_size=2, stride=2),
conv3x3_block(32, 64), #16x32
nn.MaxPool2D(
kernel_size=2, stride=2),
conv3x3_block(64, 128), # 8*16
nn.MaxPool2D(
kernel_size=2, stride=2),
conv3x3_block(128, 256), # 4*8
nn.MaxPool2D(
kernel_size=2, stride=2),
conv3x3_block(256, 256), # 2*4,
nn.MaxPool2D(
kernel_size=2, stride=2),
conv3x3_block(256, 256)) # 1*2
self.stn_fc1 = nn.Sequential(
nn.Linear(
2 * 256,
512,
weight_attr=nn.initializer.Normal(0, 0.001),
bias_attr=nn.initializer.Constant(0)),
nn.BatchNorm1D(512),
nn.ReLU())
fc2_bias = self.init_stn()
self.stn_fc2 = nn.Linear(
512,
num_ctrlpoints * 2,
weight_attr=nn.initializer.Constant(0.0),
bias_attr=nn.initializer.Assign(fc2_bias))
def init_stn(self):
margin = 0.01
sampling_num_per_side = int(self.num_ctrlpoints / 2)
ctrl_pts_x = np.linspace(margin, 1. - margin, sampling_num_per_side)
ctrl_pts_y_top = np.ones(sampling_num_per_side) * margin
ctrl_pts_y_bottom = np.ones(sampling_num_per_side) * (1 - margin)
ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
ctrl_points = np.concatenate(
[ctrl_pts_top, ctrl_pts_bottom], axis=0).astype(np.float32)
if self.activation == 'none':
pass
elif self.activation == 'sigmoid':
ctrl_points = -np.log(1. / ctrl_points - 1.)
ctrl_points = paddle.to_tensor(ctrl_points)
fc2_bias = paddle.reshape(
ctrl_points, shape=[ctrl_points.shape[0] * ctrl_points.shape[1]])
return fc2_bias
def forward(self, x):
x = self.stn_convnet(x)
batch_size, _, h, w = x.shape
x = paddle.reshape(x, shape=(batch_size, -1))
img_feat = self.stn_fc1(x)
x = self.stn_fc2(0.1 * img_feat)
if self.activation == 'sigmoid':
x = F.sigmoid(x)
x = paddle.reshape(x, shape=[-1, self.num_ctrlpoints, 2])
return img_feat, x
class STN_ON(nn.Layer):
def __init__(self, in_channels, tps_inputsize, tps_outputsize,
num_control_points, tps_margins, stn_activation):
super(STN_ON, self).__init__()
self.tps = TPSSpatialTransformer(
output_image_size=tuple(tps_outputsize),
num_control_points=num_control_points,
margins=tuple(tps_margins))
self.stn_head = STN(in_channels=in_channels,
num_ctrlpoints=num_control_points,
activation=stn_activation)
self.tps_inputsize = tps_inputsize
self.out_channels = in_channels
def forward(self, image):
stn_input = paddle.nn.functional.interpolate(
image, self.tps_inputsize, mode="bilinear", align_corners=True)
stn_img_feat, ctrl_points = self.stn_head(stn_input)
x, _ = self.tps(image, ctrl_points)
return x
......@@ -231,7 +231,8 @@ class GridGenerator(nn.Layer):
""" Return inv_delta_C which is needed to calculate T """
F = self.F
hat_eye = paddle.eye(F, dtype='float64') # F x F
hat_C = paddle.norm(C.reshape([1, F, 2]) - C.reshape([F, 1, 2]), axis=2) + hat_eye
hat_C = paddle.norm(
C.reshape([1, F, 2]) - C.reshape([F, 1, 2]), axis=2) + hat_eye
hat_C = (hat_C**2) * paddle.log(hat_C)
delta_C = paddle.concat( # F+3 x F+3
[
......
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn, ParamAttr
from paddle.nn import functional as F
import numpy as np
import itertools
def grid_sample(input, grid, canvas=None):
input.stop_gradient = False
output = F.grid_sample(input, grid)
if canvas is None:
return output
else:
input_mask = paddle.ones(shape=input.shape)
output_mask = F.grid_sample(input_mask, grid)
padded_output = output * output_mask + canvas * (1 - output_mask)
return padded_output
# phi(x1, x2) = r^2 * log(r), where r = ||x1 - x2||_2
def compute_partial_repr(input_points, control_points):
N = input_points.shape[0]
M = control_points.shape[0]
pairwise_diff = paddle.reshape(
input_points, shape=[N, 1, 2]) - paddle.reshape(
control_points, shape=[1, M, 2])
# original implementation, very slow
# pairwise_dist = torch.sum(pairwise_diff ** 2, dim = 2) # square of distance
pairwise_diff_square = pairwise_diff * pairwise_diff
pairwise_dist = pairwise_diff_square[:, :, 0] + pairwise_diff_square[:, :,
1]
repr_matrix = 0.5 * pairwise_dist * paddle.log(pairwise_dist)
# fix numerical error for 0 * log(0), substitute all nan with 0
mask = repr_matrix != repr_matrix
repr_matrix[mask] = 0
return repr_matrix
# output_ctrl_pts are specified, according to our task.
def build_output_control_points(num_control_points, margins):
margin_x, margin_y = margins
num_ctrl_pts_per_side = num_control_points // 2
ctrl_pts_x = np.linspace(margin_x, 1.0 - margin_x, num_ctrl_pts_per_side)
ctrl_pts_y_top = np.ones(num_ctrl_pts_per_side) * margin_y
ctrl_pts_y_bottom = np.ones(num_ctrl_pts_per_side) * (1.0 - margin_y)
ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
output_ctrl_pts_arr = np.concatenate(
[ctrl_pts_top, ctrl_pts_bottom], axis=0)
output_ctrl_pts = paddle.to_tensor(output_ctrl_pts_arr)
return output_ctrl_pts
class TPSSpatialTransformer(nn.Layer):
def __init__(self,
output_image_size=None,
num_control_points=None,
margins=None):
super(TPSSpatialTransformer, self).__init__()
self.output_image_size = output_image_size
self.num_control_points = num_control_points
self.margins = margins
self.target_height, self.target_width = output_image_size
target_control_points = build_output_control_points(num_control_points,
margins)
N = num_control_points
# create padded kernel matrix
forward_kernel = paddle.zeros(shape=[N + 3, N + 3])
target_control_partial_repr = compute_partial_repr(
target_control_points, target_control_points)
target_control_partial_repr = paddle.cast(target_control_partial_repr,
forward_kernel.dtype)
forward_kernel[:N, :N] = target_control_partial_repr
forward_kernel[:N, -3] = 1
forward_kernel[-3, :N] = 1
target_control_points = paddle.cast(target_control_points,
forward_kernel.dtype)
forward_kernel[:N, -2:] = target_control_points
forward_kernel[-2:, :N] = paddle.transpose(
target_control_points, perm=[1, 0])
# compute inverse matrix
inverse_kernel = paddle.inverse(forward_kernel)
# create target cordinate matrix
HW = self.target_height * self.target_width
target_coordinate = list(
itertools.product(
range(self.target_height), range(self.target_width)))
target_coordinate = paddle.to_tensor(target_coordinate) # HW x 2
Y, X = paddle.split(
target_coordinate, target_coordinate.shape[1], axis=1)
Y = Y / (self.target_height - 1)
X = X / (self.target_width - 1)
target_coordinate = paddle.concat(
[X, Y], axis=1) # convert from (y, x) to (x, y)
target_coordinate_partial_repr = compute_partial_repr(
target_coordinate, target_control_points)
target_coordinate_repr = paddle.concat(
[
target_coordinate_partial_repr, paddle.ones(shape=[HW, 1]),
target_coordinate
],
axis=1)
# register precomputed matrices
self.inverse_kernel = inverse_kernel
self.padding_matrix = paddle.zeros(shape=[3, 2])
self.target_coordinate_repr = target_coordinate_repr
self.target_control_points = target_control_points
def forward(self, input, source_control_points):
assert source_control_points.ndimension() == 3
assert source_control_points.shape[1] == self.num_control_points
assert source_control_points.shape[2] == 2
batch_size = paddle.shape(source_control_points)[0]
self.padding_matrix = paddle.expand(
self.padding_matrix, shape=[batch_size, 3, 2])
Y = paddle.concat([source_control_points, self.padding_matrix], 1)
mapping_matrix = paddle.matmul(self.inverse_kernel, Y)
source_coordinate = paddle.matmul(self.target_coordinate_repr,
mapping_matrix)
grid = paddle.reshape(
source_coordinate,
shape=[-1, self.target_height, self.target_width, 2])
grid = paddle.clip(grid, 0,
1) # the source_control_points may be out of [0, 1].
# the input to grid_sample is normalized [-1, 1], but what we get is [0, 1]
grid = 2.0 * grid - 1.0
output_maps = grid_sample(input, grid, canvas=None)
return output_maps, source_coordinate
......@@ -127,3 +127,34 @@ class RMSProp(object):
grad_clip=self.grad_clip,
parameters=parameters)
return opt
class Adadelta(object):
def __init__(self,
learning_rate=0.001,
epsilon=1e-08,
rho=0.95,
parameter_list=None,
weight_decay=None,
grad_clip=None,
name=None,
**kwargs):
self.learning_rate = learning_rate
self.epsilon = epsilon
self.rho = rho
self.parameter_list = parameter_list
self.learning_rate = learning_rate
self.weight_decay = weight_decay
self.grad_clip = grad_clip
self.name = name
def __call__(self, parameters):
opt = optim.Adadelta(
learning_rate=self.learning_rate,
epsilon=self.epsilon,
rho=self.rho,
weight_decay=self.weight_decay,
grad_clip=self.grad_clip,
name=self.name,
parameters=parameters)
return opt
......@@ -18,17 +18,21 @@ from __future__ import print_function
from __future__ import unicode_literals
import copy
import platform
__all__ = ['build_post_process']
from .db_postprocess import DBPostProcess, DistillationDBPostProcess
from .east_postprocess import EASTPostProcess
from .sast_postprocess import SASTPostProcess
from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, DistillationCTCLabelDecode, NRTRLabelDecode, \
TableLabelDecode, SARLabelDecode
from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, DistillationCTCLabelDecode, \
TableLabelDecode, NRTRLabelDecode, SARLabelDecode , SEEDLabelDecode
from .cls_postprocess import ClsPostProcess
from .pg_postprocess import PGPostProcess
from .pse_postprocess import PSEPostProcess
if platform.system() != "Windows":
# pse is not support in Windows
from .pse_postprocess import PSEPostProcess
def build_post_process(config, global_config=None):
......@@ -36,7 +40,8 @@ def build_post_process(config, global_config=None):
'DBPostProcess', 'PSEPostProcess', 'EASTPostProcess', 'SASTPostProcess',
'CTCLabelDecode', 'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode',
'PGPostProcess', 'DistillationCTCLabelDecode', 'TableLabelDecode',
'DistillationDBPostProcess', 'NRTRLabelDecode', 'SARLabelDecode'
'DistillationDBPostProcess', 'NRTRLabelDecode', 'SARLabelDecode',
'SEEDLabelDecode'
]
config = copy.deepcopy(config)
......
......@@ -111,6 +111,8 @@ class CTCLabelDecode(BaseRecLabelDecode):
character_type, use_space_char)
def __call__(self, preds, label=None, *args, **kwargs):
if isinstance(preds, tuple):
preds = preds[-1]
if isinstance(preds, paddle.Tensor):
preds = preds.numpy()
preds_idx = preds.argmax(axis=2)
......@@ -308,6 +310,87 @@ class AttnLabelDecode(BaseRecLabelDecode):
return idx
class SEEDLabelDecode(BaseRecLabelDecode):
""" Convert between text-label and text-index """
def __init__(self,
character_dict_path=None,
character_type='ch',
use_space_char=False,
**kwargs):
super(SEEDLabelDecode, self).__init__(character_dict_path,
character_type, use_space_char)
def add_special_char(self, dict_character):
self.beg_str = "sos"
self.end_str = "eos"
dict_character = dict_character + [self.end_str]
return dict_character
def get_ignored_tokens(self):
end_idx = self.get_beg_end_flag_idx("eos")
return [end_idx]
def get_beg_end_flag_idx(self, beg_or_end):
if beg_or_end == "sos":
idx = np.array(self.dict[self.beg_str])
elif beg_or_end == "eos":
idx = np.array(self.dict[self.end_str])
else:
assert False, "unsupport type %s in get_beg_end_flag_idx" % beg_or_end
return idx
def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
""" convert text-index into text-label. """
result_list = []
[end_idx] = self.get_ignored_tokens()
batch_size = len(text_index)
for batch_idx in range(batch_size):
char_list = []
conf_list = []
for idx in range(len(text_index[batch_idx])):
if int(text_index[batch_idx][idx]) == int(end_idx):
break
if is_remove_duplicate:
# only for predict
if idx > 0 and text_index[batch_idx][idx - 1] == text_index[
batch_idx][idx]:
continue
char_list.append(self.character[int(text_index[batch_idx][
idx])])
if text_prob is not None:
conf_list.append(text_prob[batch_idx][idx])
else:
conf_list.append(1)
text = ''.join(char_list)
result_list.append((text, np.mean(conf_list)))
return result_list
def __call__(self, preds, label=None, *args, **kwargs):
"""
text = self.decode(text)
if label is None:
return text
else:
label = self.decode(label, is_remove_duplicate=False)
return text, label
"""
preds_idx = preds["rec_pred"]
if isinstance(preds_idx, paddle.Tensor):
preds_idx = preds_idx.numpy()
if "rec_pred_scores" in preds:
preds_idx = preds["rec_pred"]
preds_prob = preds["rec_pred_scores"]
else:
preds_idx = preds["rec_pred"].argmax(axis=2)
preds_prob = preds["rec_pred"].max(axis=2)
text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False)
if label is None:
return text
label = self.decode(label, is_remove_duplicate=False)
return text, label
class SRNLabelDecode(BaseRecLabelDecode):
""" Convert between text-label and text-index """
......
......@@ -11,4 +11,5 @@ opencv-contrib-python==4.4.0.46
cython
lxml
premailer
openpyxl
\ No newline at end of file
openpyxl
fasttext==0.9.1
\ No newline at end of file
......@@ -54,8 +54,7 @@ def main():
config['Architecture']["Head"]['out_channels'] = char_num
model = build_model(config['Architecture'])
use_srn = config['Architecture']['algorithm'] == "SRN"
use_sar = config['Architecture']['algorithm'] == "SAR"
extra_input = config['Architecture']['algorithm'] in ["SRN", "SAR"]
if "model_type" in config['Architecture'].keys():
model_type = config['Architecture']['model_type']
else:
......@@ -72,7 +71,7 @@ def main():
# start eval
metric = program.eval(model, valid_dataloader, post_process_class,
eval_class, model_type, use_srn, use_sar)
eval_class, model_type, extra_input)
logger.info('metric eval ***************')
for k, v in metric.items():
logger.info('{}:{}'.format(k, v))
......
......@@ -49,6 +49,12 @@ def export_single_model(model, arch_config, save_path, logger):
]
]
model = to_static(model, input_spec=other_shape)
elif arch_config["algorithm"] == "SAR":
other_shape = [
paddle.static.InputSpec(
shape=[None, 3, 48, 160], dtype="float32"),
]
model = to_static(model, input_spec=other_shape)
else:
infer_shape = [3, -1, -1]
if arch_config["model_type"] == "rec":
......
......@@ -68,6 +68,13 @@ class TextRecognizer(object):
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == "SAR":
postprocess_params = {
'name': 'SARLabelDecode',
"character_type": args.rec_char_type,
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.input_tensor, self.output_tensors, self.config = \
utility.create_predictor(args, 'rec', logger)
......@@ -194,6 +201,41 @@ class TextRecognizer(object):
return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2)
def resize_norm_img_sar(self, img, image_shape,
width_downsample_ratio=0.25):
imgC, imgH, imgW_min, imgW_max = image_shape
h = img.shape[0]
w = img.shape[1]
valid_ratio = 1.0
# make sure new_width is an integral multiple of width_divisor.
width_divisor = int(1 / width_downsample_ratio)
# resize
ratio = w / float(h)
resize_w = math.ceil(imgH * ratio)
if resize_w % width_divisor != 0:
resize_w = round(resize_w / width_divisor) * width_divisor
if imgW_min is not None:
resize_w = max(imgW_min, resize_w)
if imgW_max is not None:
valid_ratio = min(1.0, 1.0 * resize_w / imgW_max)
resize_w = min(imgW_max, resize_w)
resized_image = cv2.resize(img, (resize_w, imgH))
resized_image = resized_image.astype('float32')
# norm
if image_shape[0] == 1:
resized_image = resized_image / 255
resized_image = resized_image[np.newaxis, :]
else:
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
resize_shape = resized_image.shape
padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32)
padding_im[:, :, 0:resize_w] = resized_image
pad_shape = padding_im.shape
return padding_im, resize_shape, pad_shape, valid_ratio
def __call__(self, img_list):
img_num = len(img_list)
# Calculate the aspect ratio of all text bars
......@@ -216,11 +258,19 @@ class TextRecognizer(object):
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for ino in range(beg_img_no, end_img_no):
if self.rec_algorithm != "SRN":
if self.rec_algorithm != "SRN" and self.rec_algorithm != "SAR":
norm_img = self.resize_norm_img(img_list[indices[ino]],
max_wh_ratio)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
elif self.rec_algorithm == "SAR":
norm_img, _, _, valid_ratio = self.resize_norm_img_sar(
img_list[indices[ino]], self.rec_image_shape)
norm_img = norm_img[np.newaxis, :]
valid_ratio = np.expand_dims(valid_ratio, axis=0)
valid_ratios = []
valid_ratios.append(valid_ratio)
norm_img_batch.append(norm_img)
else:
norm_img = self.process_image_srn(
img_list[indices[ino]], self.rec_image_shape, 8, 25)
......@@ -266,6 +316,25 @@ class TextRecognizer(object):
if self.benchmark:
self.autolog.times.stamp()
preds = {"predict": outputs[2]}
elif self.rec_algorithm == "SAR":
valid_ratios = np.concatenate(valid_ratios)
inputs = [
norm_img_batch,
valid_ratios,
]
input_names = self.predictor.get_input_names()
for i in range(len(input_names)):
input_tensor = self.predictor.get_input_handle(input_names[
i])
input_tensor.copy_from_cpu(inputs[i])
self.predictor.run()
outputs = []
for output_tensor in self.output_tensors:
output = output_tensor.copy_to_cpu()
outputs.append(output)
if self.benchmark:
self.autolog.times.stamp()
preds = outputs[0]
else:
self.input_tensor.copy_from_cpu(norm_img_batch)
self.predictor.run()
......
......@@ -186,12 +186,13 @@ def train(config,
model.train()
use_srn = config['Architecture']['algorithm'] == "SRN"
use_nrtr = config['Architecture']['algorithm'] == "NRTR"
use_sar = config['Architecture']['algorithm'] == 'SAR'
extra_input = config['Architecture'][
'algorithm'] in ["SRN", "NRTR", "SAR", "SEED"]
try:
model_type = config['Architecture']['model_type']
except:
model_type = None
algorithm = config['Architecture']['algorithm']
if 'start_epoch' in best_model_dict:
start_epoch = best_model_dict['start_epoch']
......@@ -215,7 +216,7 @@ def train(config,
images = batch[0]
if use_srn:
model_average = True
if use_srn or model_type == 'table' or use_nrtr or use_sar:
if model_type == 'table' or extra_input:
preds = model(images, data=batch[1:])
else:
preds = model(images)
......@@ -279,8 +280,7 @@ def train(config,
post_process_class,
eval_class,
model_type,
use_srn=use_srn,
use_sar=use_sar)
extra_input=extra_input)
cur_metric_str = 'cur metric, {}'.format(', '.join(
['{}: {}'.format(k, v) for k, v in cur_metric.items()]))
logger.info(cur_metric_str)
......@@ -351,9 +351,8 @@ def eval(model,
valid_dataloader,
post_process_class,
eval_class,
model_type,
use_srn=False,
use_sar=False):
model_type=None,
extra_input=False):
model.eval()
with paddle.no_grad():
total_frame = 0.0
......@@ -366,7 +365,7 @@ def eval(model,
break
images = batch[0]
start = time.time()
if use_srn or model_type == 'table' or use_sar:
if model_type == 'table' or extra_input:
preds = model(images, data=batch[1:])
else:
preds = model(images)
......@@ -395,6 +394,18 @@ def preprocess(is_train=False):
config = load_config(FLAGS.config)
merge_config(FLAGS.opt)
if is_train:
# save_config
save_model_dir = config['Global']['save_model_dir']
os.makedirs(save_model_dir, exist_ok=True)
with open(os.path.join(save_model_dir, 'config.yml'), 'w') as f:
yaml.dump(
dict(config), f, default_flow_style=False, sort_keys=False)
log_file = '{}/train.log'.format(save_model_dir)
else:
log_file = None
logger = get_logger(name='root', log_file=log_file)
# check if set use_gpu=True in paddlepaddle cpu version
use_gpu = config['Global']['use_gpu']
check_gpu(use_gpu)
......@@ -402,24 +413,19 @@ def preprocess(is_train=False):
alg = config['Architecture']['algorithm']
assert alg in [
'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN',
'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE'
]
'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE',
'SEED']
windows_not_support_list = ['PSE']
if platform.system() == "Windows" and alg in windows_not_support_list:
logger.warning('{} is not support in Windows now'.format(
windows_not_support_list))
sys.exit()
device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu'
device = paddle.set_device(device)
config['Global']['distributed'] = dist.get_world_size() != 1
if is_train:
# save_config
save_model_dir = config['Global']['save_model_dir']
os.makedirs(save_model_dir, exist_ok=True)
with open(os.path.join(save_model_dir, 'config.yml'), 'w') as f:
yaml.dump(
dict(config), f, default_flow_style=False, sort_keys=False)
log_file = '{}/train.log'.format(save_model_dir)
else:
log_file = None
logger = get_logger(name='root', log_file=log_file)
if config['Global']['use_visualdl']:
from visualdl import LogWriter
save_model_dir = config['Global']['save_model_dir']
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment