"test/magic_number_division.cpp" did not exist on "6dfb92bbef33b4caea55f6b4ed7c449927ae771c"
Unverified Commit 7c09c97d authored by zhoujun's avatar zhoujun Committed by GitHub
Browse files

Merge pull request #4 from PaddlePaddle/develop

merge paddleocr
parents c1d19ce2 5ee40948
...@@ -58,6 +58,10 @@ class RecModel(object): ...@@ -58,6 +58,10 @@ class RecModel(object):
self.loss_type = global_params['loss_type'] self.loss_type = global_params['loss_type']
self.image_shape = global_params['image_shape'] self.image_shape = global_params['image_shape']
self.max_text_length = global_params['max_text_length'] self.max_text_length = global_params['max_text_length']
if "num_heads" in global_params:
self.num_heads = global_params["num_heads"]
else:
self.num_heads = None
def create_feed(self, mode): def create_feed(self, mode):
image_shape = deepcopy(self.image_shape) image_shape = deepcopy(self.image_shape)
...@@ -77,6 +81,48 @@ class RecModel(object): ...@@ -77,6 +81,48 @@ class RecModel(object):
lod_level=1) lod_level=1)
feed_list = [image, label_in, label_out] feed_list = [image, label_in, label_out]
labels = {'label_in': label_in, 'label_out': label_out} labels = {'label_in': label_in, 'label_out': label_out}
elif self.loss_type == "srn":
encoder_word_pos = fluid.data(
name="encoder_word_pos",
shape=[
-1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)),
1
],
dtype="int64")
gsrm_word_pos = fluid.data(
name="gsrm_word_pos",
shape=[-1, self.max_text_length, 1],
dtype="int64")
gsrm_slf_attn_bias1 = fluid.data(
name="gsrm_slf_attn_bias1",
shape=[
-1, self.num_heads, self.max_text_length,
self.max_text_length
],
dtype="float32")
gsrm_slf_attn_bias2 = fluid.data(
name="gsrm_slf_attn_bias2",
shape=[
-1, self.num_heads, self.max_text_length,
self.max_text_length
],
dtype="float32")
lbl_weight = fluid.layers.data(
name="lbl_weight", shape=[-1, 1], dtype='int64')
label = fluid.data(
name='label', shape=[-1, 1], dtype='int32', lod_level=1)
feed_list = [
image, label, encoder_word_pos, gsrm_word_pos,
gsrm_slf_attn_bias1, gsrm_slf_attn_bias2, lbl_weight
]
labels = {
'label': label,
'encoder_word_pos': encoder_word_pos,
'gsrm_word_pos': gsrm_word_pos,
'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1,
'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2,
'lbl_weight': lbl_weight
}
else: else:
label = fluid.data( label = fluid.data(
name='label', shape=[None, 1], dtype='int32', lod_level=1) name='label', shape=[None, 1], dtype='int32', lod_level=1)
...@@ -88,6 +134,8 @@ class RecModel(object): ...@@ -88,6 +134,8 @@ class RecModel(object):
use_double_buffer=True, use_double_buffer=True,
iterable=False) iterable=False)
else: else:
labels = None
loader = None
if self.char_type == "ch" and self.infer_img: if self.char_type == "ch" and self.infer_img:
image_shape[-1] = -1 image_shape[-1] = -1
if self.tps != None: if self.tps != None:
...@@ -98,8 +146,42 @@ class RecModel(object): ...@@ -98,8 +146,42 @@ class RecModel(object):
) )
image_shape = deepcopy(self.image_shape) image_shape = deepcopy(self.image_shape)
image = fluid.data(name='image', shape=image_shape, dtype='float32') image = fluid.data(name='image', shape=image_shape, dtype='float32')
labels = None if self.loss_type == "srn":
loader = None encoder_word_pos = fluid.data(
name="encoder_word_pos",
shape=[
-1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)),
1
],
dtype="int64")
gsrm_word_pos = fluid.data(
name="gsrm_word_pos",
shape=[-1, self.max_text_length, 1],
dtype="int64")
gsrm_slf_attn_bias1 = fluid.data(
name="gsrm_slf_attn_bias1",
shape=[
-1, self.num_heads, self.max_text_length,
self.max_text_length
],
dtype="float32")
gsrm_slf_attn_bias2 = fluid.data(
name="gsrm_slf_attn_bias2",
shape=[
-1, self.num_heads, self.max_text_length,
self.max_text_length
],
dtype="float32")
feed_list = [
image, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2
]
labels = {
'encoder_word_pos': encoder_word_pos,
'gsrm_word_pos': gsrm_word_pos,
'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1,
'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2
}
return image, labels, loader return image, labels, loader
def __call__(self, mode): def __call__(self, mode):
...@@ -117,13 +199,27 @@ class RecModel(object): ...@@ -117,13 +199,27 @@ class RecModel(object):
label = labels['label_out'] label = labels['label_out']
else: else:
label = labels['label'] label = labels['label']
outputs = {'total_loss':loss, 'decoded_out':\ if self.loss_type == 'srn':
decoded_out, 'label':label} total_loss, img_loss, word_loss = self.loss(predicts, labels)
outputs = {
'total_loss': total_loss,
'img_loss': img_loss,
'word_loss': word_loss,
'decoded_out': decoded_out,
'label': label
}
else:
outputs = {'total_loss':loss, 'decoded_out':\
decoded_out, 'label':label}
return loader, outputs return loader, outputs
elif mode == "export": elif mode == "export":
predict = predicts['predict'] predict = predicts['predict']
if self.loss_type == "ctc": if self.loss_type == "ctc":
predict = fluid.layers.softmax(predict) predict = fluid.layers.softmax(predict)
if self.loss_type == "srn":
raise Exception(
"Warning! SRN does not support export model currently")
return [image, {'decoded_out': decoded_out, 'predicts': predict}] return [image, {'decoded_out': decoded_out, 'predicts': predict}]
else: else:
predict = predicts['predict'] predict = predicts['predict']
......
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ["ResNet"]
class ResNet(object):
def __init__(self, params):
"""
the Resnet backbone network for detection module.
Args:
params(dict): the super parameters for network build
"""
self.layers = params['layers']
supported_layers = [18, 34, 50, 101, 152]
assert self.layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, self.layers)
self.is_3x3 = True
def __call__(self, input):
layers = self.layers
is_3x3 = self.is_3x3
# if layers == 18:
# depth = [2, 2, 2, 2]
# elif layers == 34 or layers == 50:
# depth = [3, 4, 6, 3]
# elif layers == 101:
# depth = [3, 4, 23, 3]
# elif layers == 152:
# depth = [3, 8, 36, 3]
# elif layers == 200:
# depth = [3, 12, 48, 3]
# num_filters = [64, 128, 256, 512]
# outs = []
if layers == 18:
depth = [2, 2, 2, 2]#, 3, 3]
elif layers == 34 or layers == 50:
#depth = [3, 4, 6, 3]#, 3, 3]
depth = [3, 4, 6, 3, 3]#, 3]
elif layers == 101:
depth = [3, 4, 23, 3]#, 3, 3]
elif layers == 152:
depth = [3, 8, 36, 3]#, 3, 3]
num_filters = [64, 128, 256, 512, 512]#, 512]
blocks = {}
idx = 'block_0'
blocks[idx] = input
if is_3x3 == False:
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
else:
conv = self.conv_bn_layer(
input=input,
num_filters=32,
filter_size=3,
stride=2,
act='relu',
name='conv1_1')
conv = self.conv_bn_layer(
input=conv,
num_filters=32,
filter_size=3,
stride=1,
act='relu',
name='conv1_2')
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv1_3')
idx = 'block_1'
blocks[idx] = conv
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
if layers >= 50:
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152, 200] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
if_first=block == i == 0,
name=conv_name)
# outs.append(conv)
idx = 'block_' + str(block + 2)
blocks[idx] = conv
else:
for block in range(len(depth)):
for i in range(depth[block]):
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.basic_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
if_first=block == i == 0,
name=conv_name)
# outs.append(conv)
idx = 'block_' + str(block + 2)
blocks[idx] = conv
# return outs
return blocks
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def conv_bn_layer_new(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
pool = fluid.layers.pool2d(
input=input,
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='avg',
ceil_mode=True)
conv = fluid.layers.conv2d(
input=pool,
num_filters=num_filters,
filter_size=filter_size,
stride=1,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def shortcut(self, input, ch_out, stride, name, if_first=False):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
if if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return self.conv_bn_layer_new(
input, ch_out, 1, stride, name=name)
elif if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters, stride, name, if_first):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_branch2c")
short = self.shortcut(
input,
num_filters * 4,
stride,
if_first=if_first,
name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def basic_block(self, input, num_filters, stride, name, if_first):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
stride=stride,
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
act=None,
name=name + "_branch2b")
short = self.shortcut(
input,
num_filters,
stride,
if_first=if_first,
name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ["ResNet", "ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"]
Trainable = True
w_nolr = fluid.ParamAttr(
trainable = Trainable)
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class ResNet():
def __init__(self, params):
self.layers = params['layers']
self.params = train_parameters
def __call__(self, input):
layers = self.layers
supported_layers = [18, 34, 50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
stride_list = [(2,2),(2,2),(1,1),(1,1)]
num_filters = [64, 128, 256, 512]
conv = self.conv_bn_layer(
input=input, num_filters=64, filter_size=7, stride=2, act='relu', name="conv1")
F = []
if layers >= 50:
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=stride_list[block] if i == 0 else 1, name=conv_name)
F.append(conv)
base = F[-1]
for i in [-2, -3]:
b, c, w, h = F[i].shape
if (w,h) == base.shape[2:]:
base = base
else:
base = fluid.layers.conv2d_transpose( input=base, num_filters=c,filter_size=4, stride=2,
padding=1,act=None,
param_attr=w_nolr,
bias_attr=w_nolr)
base = fluid.layers.batch_norm(base, act = "relu", param_attr=w_nolr, bias_attr=w_nolr)
base = fluid.layers.concat([base, F[i]], axis=1)
base = fluid.layers.conv2d(base, num_filters=c, filter_size=1, param_attr=w_nolr, bias_attr=w_nolr)
base = fluid.layers.conv2d(base, num_filters=c, filter_size=3,padding = 1, param_attr=w_nolr, bias_attr=w_nolr)
base = fluid.layers.batch_norm(base, act = "relu", param_attr=w_nolr, bias_attr=w_nolr)
base = fluid.layers.conv2d(base, num_filters=512, filter_size=1,bias_attr=w_nolr,param_attr=w_nolr)
return base
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size= 2 if stride==(1,1) else filter_size,
dilation = 2 if stride==(1,1) else 1,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights",trainable = Trainable),
bias_attr=False,
name=name + '.conv2d.output.1')
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(input=conv,
act=act,
name=bn_name + '.output.1',
param_attr=ParamAttr(name=bn_name + '_scale',trainable = Trainable),
bias_attr=ParamAttr(bn_name + '_offset',trainable = Trainable),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance', )
def shortcut(self, input, ch_out, stride, is_first, name):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1 or is_first == True:
if stride == (1,1):
return self.conv_bn_layer(input, ch_out, 1, 1, name=name)
else: #stride == (2,2)
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters, stride, name):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu', name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 4, filter_size=1, act=None, name=name + "_branch2c")
short = self.shortcut(input, num_filters * 4, stride, is_first=False, name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu', name=name + ".add.output.5")
def basic_block(self, input, num_filters, stride, is_first, name):
conv0 = self.conv_bn_layer(input=input, num_filters=num_filters, filter_size=3, act='relu', stride=stride,
name=name + "_branch2a")
conv1 = self.conv_bn_layer(input=conv0, num_filters=num_filters, filter_size=3, act=None,
name=name + "_branch2b")
short = self.shortcut(input, num_filters, stride, is_first, name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from ..common_functions import conv_bn_layer, deconv_bn_layer
from collections import OrderedDict
class SASTHead(object):
"""
SAST:
see arxiv: https://arxiv.org/abs/1908.05498
args:
params(dict): the super parameters for network build
"""
def __init__(self, params):
self.model_name = params['model_name']
self.with_cab = params['with_cab']
def FPN_Up_Fusion(self, blocks):
"""
blocks{}: contain block_2, block_3, block_4, block_5, block_6, block_7 with
1/4, 1/8, 1/16, 1/32, 1/64, 1/128 resolution.
"""
f = [blocks['block_6'], blocks['block_5'], blocks['block_4'], blocks['block_3'], blocks['block_2']]
num_outputs = [256, 256, 192, 192, 128]
g = [None, None, None, None, None]
h = [None, None, None, None, None]
for i in range(5):
h[i] = conv_bn_layer(input=f[i], num_filters=num_outputs[i],
filter_size=1, stride=1, act=None, name='fpn_up_h'+str(i))
for i in range(4):
if i == 0:
g[i] = deconv_bn_layer(input=h[i], num_filters=num_outputs[i + 1], act=None, name='fpn_up_g0')
print("g[{}] shape: {}".format(i, g[i].shape))
else:
g[i] = fluid.layers.elementwise_add(x=g[i - 1], y=h[i])
g[i] = fluid.layers.relu(g[i])
#g[i] = conv_bn_layer(input=g[i], num_filters=num_outputs[i],
# filter_size=1, stride=1, act='relu')
g[i] = conv_bn_layer(input=g[i], num_filters=num_outputs[i],
filter_size=3, stride=1, act='relu', name='fpn_up_g%d_1'%i)
g[i] = deconv_bn_layer(input=g[i], num_filters=num_outputs[i + 1], act=None, name='fpn_up_g%d_2'%i)
print("g[{}] shape: {}".format(i, g[i].shape))
g[4] = fluid.layers.elementwise_add(x=g[3], y=h[4])
g[4] = fluid.layers.relu(g[4])
g[4] = conv_bn_layer(input=g[4], num_filters=num_outputs[4],
filter_size=3, stride=1, act='relu', name='fpn_up_fusion_1')
g[4] = conv_bn_layer(input=g[4], num_filters=num_outputs[4],
filter_size=1, stride=1, act=None, name='fpn_up_fusion_2')
return g[4]
def FPN_Down_Fusion(self, blocks):
"""
blocks{}: contain block_2, block_3, block_4, block_5, block_6, block_7 with
1/4, 1/8, 1/16, 1/32, 1/64, 1/128 resolution.
"""
f = [blocks['block_0'], blocks['block_1'], blocks['block_2']]
num_outputs = [32, 64, 128]
g = [None, None, None]
h = [None, None, None]
for i in range(3):
h[i] = conv_bn_layer(input=f[i], num_filters=num_outputs[i],
filter_size=3, stride=1, act=None, name='fpn_down_h'+str(i))
for i in range(2):
if i == 0:
g[i] = conv_bn_layer(input=h[i], num_filters=num_outputs[i+1], filter_size=3, stride=2, act=None, name='fpn_down_g0')
else:
g[i] = fluid.layers.elementwise_add(x=g[i - 1], y=h[i])
g[i] = fluid.layers.relu(g[i])
g[i] = conv_bn_layer(input=g[i], num_filters=num_outputs[i], filter_size=3, stride=1, act='relu', name='fpn_down_g%d_1'%i)
g[i] = conv_bn_layer(input=g[i], num_filters=num_outputs[i+1], filter_size=3, stride=2, act=None, name='fpn_down_g%d_2'%i)
# print("g[{}] shape: {}".format(i, g[i].shape))
g[2] = fluid.layers.elementwise_add(x=g[1], y=h[2])
g[2] = fluid.layers.relu(g[2])
g[2] = conv_bn_layer(input=g[2], num_filters=num_outputs[2],
filter_size=3, stride=1, act='relu', name='fpn_down_fusion_1')
g[2] = conv_bn_layer(input=g[2], num_filters=num_outputs[2],
filter_size=1, stride=1, act=None, name='fpn_down_fusion_2')
return g[2]
def SAST_Header1(self, f_common):
"""Detector header."""
#f_score
f_score = conv_bn_layer(input=f_common, num_filters=64, filter_size=1, stride=1, act='relu', name='f_score1')
f_score = conv_bn_layer(input=f_score, num_filters=64, filter_size=3, stride=1, act='relu', name='f_score2')
f_score = conv_bn_layer(input=f_score, num_filters=128, filter_size=1, stride=1, act='relu', name='f_score3')
f_score = conv_bn_layer(input=f_score, num_filters=1, filter_size=3, stride=1, name='f_score4')
f_score = fluid.layers.sigmoid(f_score)
# print("f_score shape: {}".format(f_score.shape))
#f_boder
f_border = conv_bn_layer(input=f_common, num_filters=64, filter_size=1, stride=1, act='relu', name='f_border1')
f_border = conv_bn_layer(input=f_border, num_filters=64, filter_size=3, stride=1, act='relu', name='f_border2')
f_border = conv_bn_layer(input=f_border, num_filters=128, filter_size=1, stride=1, act='relu', name='f_border3')
f_border = conv_bn_layer(input=f_border, num_filters=4, filter_size=3, stride=1, name='f_border4')
# print("f_border shape: {}".format(f_border.shape))
return f_score, f_border
def SAST_Header2(self, f_common):
"""Detector header."""
#f_tvo
f_tvo = conv_bn_layer(input=f_common, num_filters=64, filter_size=1, stride=1, act='relu', name='f_tvo1')
f_tvo = conv_bn_layer(input=f_tvo, num_filters=64, filter_size=3, stride=1, act='relu', name='f_tvo2')
f_tvo = conv_bn_layer(input=f_tvo, num_filters=128, filter_size=1, stride=1, act='relu', name='f_tvo3')
f_tvo = conv_bn_layer(input=f_tvo, num_filters=8, filter_size=3, stride=1, name='f_tvo4')
# print("f_tvo shape: {}".format(f_tvo.shape))
#f_tco
f_tco = conv_bn_layer(input=f_common, num_filters=64, filter_size=1, stride=1, act='relu', name='f_tco1')
f_tco = conv_bn_layer(input=f_tco, num_filters=64, filter_size=3, stride=1, act='relu', name='f_tco2')
f_tco = conv_bn_layer(input=f_tco, num_filters=128, filter_size=1, stride=1, act='relu', name='f_tco3')
f_tco = conv_bn_layer(input=f_tco, num_filters=2, filter_size=3, stride=1, name='f_tco4')
# print("f_tco shape: {}".format(f_tco.shape))
return f_tvo, f_tco
def cross_attention(self, f_common):
"""
"""
f_shape = fluid.layers.shape(f_common)
f_theta = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, act='relu', name='f_theta')
f_phi = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, act='relu', name='f_phi')
f_g = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, act='relu', name='f_g')
### horizon
fh_theta = f_theta
fh_phi = f_phi
fh_g = f_g
#flatten
fh_theta = fluid.layers.transpose(fh_theta, [0, 2, 3, 1])
fh_theta = fluid.layers.reshape(fh_theta, [f_shape[0] * f_shape[2], f_shape[3], 128])
fh_phi = fluid.layers.transpose(fh_phi, [0, 2, 3, 1])
fh_phi = fluid.layers.reshape(fh_phi, [f_shape[0] * f_shape[2], f_shape[3], 128])
fh_g = fluid.layers.transpose(fh_g, [0, 2, 3, 1])
fh_g = fluid.layers.reshape(fh_g, [f_shape[0] * f_shape[2], f_shape[3], 128])
#correlation
fh_attn = fluid.layers.matmul(fh_theta, fluid.layers.transpose(fh_phi, [0, 2, 1]))
#scale
fh_attn = fh_attn / (128 ** 0.5)
fh_attn = fluid.layers.softmax(fh_attn)
#weighted sum
fh_weight = fluid.layers.matmul(fh_attn, fh_g)
fh_weight = fluid.layers.reshape(fh_weight, [f_shape[0], f_shape[2], f_shape[3], 128])
# print("fh_weight: {}".format(fh_weight.shape))
fh_weight = fluid.layers.transpose(fh_weight, [0, 3, 1, 2])
fh_weight = conv_bn_layer(input=fh_weight, num_filters=128, filter_size=1, stride=1, name='fh_weight')
#short cut
fh_sc = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, name='fh_sc')
f_h = fluid.layers.relu(fh_weight + fh_sc)
######
#vertical
fv_theta = fluid.layers.transpose(f_theta, [0, 1, 3, 2])
fv_phi = fluid.layers.transpose(f_phi, [0, 1, 3, 2])
fv_g = fluid.layers.transpose(f_g, [0, 1, 3, 2])
#flatten
fv_theta = fluid.layers.transpose(fv_theta, [0, 2, 3, 1])
fv_theta = fluid.layers.reshape(fv_theta, [f_shape[0] * f_shape[3], f_shape[2], 128])
fv_phi = fluid.layers.transpose(fv_phi, [0, 2, 3, 1])
fv_phi = fluid.layers.reshape(fv_phi, [f_shape[0] * f_shape[3], f_shape[2], 128])
fv_g = fluid.layers.transpose(fv_g, [0, 2, 3, 1])
fv_g = fluid.layers.reshape(fv_g, [f_shape[0] * f_shape[3], f_shape[2], 128])
#correlation
fv_attn = fluid.layers.matmul(fv_theta, fluid.layers.transpose(fv_phi, [0, 2, 1]))
#scale
fv_attn = fv_attn / (128 ** 0.5)
fv_attn = fluid.layers.softmax(fv_attn)
#weighted sum
fv_weight = fluid.layers.matmul(fv_attn, fv_g)
fv_weight = fluid.layers.reshape(fv_weight, [f_shape[0], f_shape[3], f_shape[2], 128])
# print("fv_weight: {}".format(fv_weight.shape))
fv_weight = fluid.layers.transpose(fv_weight, [0, 3, 2, 1])
fv_weight = conv_bn_layer(input=fv_weight, num_filters=128, filter_size=1, stride=1, name='fv_weight')
#short cut
fv_sc = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, name='fv_sc')
f_v = fluid.layers.relu(fv_weight + fv_sc)
######
f_attn = fluid.layers.concat([f_h, f_v], axis=1)
f_attn = conv_bn_layer(input=f_attn, num_filters=128, filter_size=1, stride=1, act='relu', name='f_attn')
return f_attn
def __call__(self, blocks, with_cab=False):
# for k, v in blocks.items():
# print(k, v.shape)
#down fpn
f_down = self.FPN_Down_Fusion(blocks)
# print("f_down shape: {}".format(f_down.shape))
#up fpn
f_up = self.FPN_Up_Fusion(blocks)
# print("f_up shape: {}".format(f_up.shape))
#fusion
f_common = fluid.layers.elementwise_add(x=f_down, y=f_up)
f_common = fluid.layers.relu(f_common)
# print("f_common: {}".format(f_common.shape))
if self.with_cab:
# print('enhence f_common with CAB.')
f_common = self.cross_attention(f_common)
f_score, f_border= self.SAST_Header1(f_common)
f_tvo, f_tco = self.SAST_Header2(f_common)
predicts = OrderedDict()
predicts['f_score'] = f_score
predicts['f_border'] = f_border
predicts['f_tvo'] = f_tvo
predicts['f_tco'] = f_tco
return predicts
\ No newline at end of file
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
import numpy as np
from .self_attention.model import wrap_encoder
from .self_attention.model import wrap_encoder_forFeature
gradient_clip = 10
class SRNPredict(object):
def __init__(self, params):
super(SRNPredict, self).__init__()
self.char_num = params['char_num']
self.max_length = params['max_text_length']
self.num_heads = params['num_heads']
self.num_encoder_TUs = params['num_encoder_TUs']
self.num_decoder_TUs = params['num_decoder_TUs']
self.hidden_dims = params['hidden_dims']
def pvam(self, inputs, others):
b, c, h, w = inputs.shape
conv_features = fluid.layers.reshape(x=inputs, shape=[-1, c, h * w])
conv_features = fluid.layers.transpose(x=conv_features, perm=[0, 2, 1])
#===== Transformer encoder =====
b, t, c = conv_features.shape
encoder_word_pos = others["encoder_word_pos"]
gsrm_word_pos = others["gsrm_word_pos"]
enc_inputs = [conv_features, encoder_word_pos, None]
word_features = wrap_encoder_forFeature(
src_vocab_size=-1,
max_length=t,
n_layer=self.num_encoder_TUs,
n_head=self.num_heads,
d_key=int(self.hidden_dims / self.num_heads),
d_value=int(self.hidden_dims / self.num_heads),
d_model=self.hidden_dims,
d_inner_hid=self.hidden_dims,
prepostprocess_dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
preprocess_cmd="n",
postprocess_cmd="da",
weight_sharing=True,
enc_inputs=enc_inputs, )
fluid.clip.set_gradient_clip(
fluid.clip.GradientClipByValue(gradient_clip))
#===== Parallel Visual Attention Module =====
b, t, c = word_features.shape
word_features = fluid.layers.fc(word_features, c, num_flatten_dims=2)
word_features_ = fluid.layers.reshape(word_features, [-1, 1, t, c])
word_features_ = fluid.layers.expand(word_features_,
[1, self.max_length, 1, 1])
word_pos_feature = fluid.layers.embedding(gsrm_word_pos,
[self.max_length, c])
word_pos_ = fluid.layers.reshape(word_pos_feature,
[-1, self.max_length, 1, c])
word_pos_ = fluid.layers.expand(word_pos_, [1, 1, t, 1])
temp = fluid.layers.elementwise_add(
word_features_, word_pos_, act='tanh')
attention_weight = fluid.layers.fc(input=temp,
size=1,
num_flatten_dims=3,
bias_attr=False)
attention_weight = fluid.layers.reshape(
x=attention_weight, shape=[-1, self.max_length, t])
attention_weight = fluid.layers.softmax(input=attention_weight, axis=-1)
pvam_features = fluid.layers.matmul(attention_weight,
word_features) #[b, max_length, c]
return pvam_features
def gsrm(self, pvam_features, others):
#===== GSRM Visual-to-semantic embedding block =====
b, t, c = pvam_features.shape
word_out = fluid.layers.fc(
input=fluid.layers.reshape(pvam_features, [-1, c]),
size=self.char_num,
act="softmax")
#word_out.stop_gradient = True
word_ids = fluid.layers.argmax(word_out, axis=1)
word_ids.stop_gradient = True
word_ids = fluid.layers.reshape(x=word_ids, shape=[-1, t, 1])
#===== GSRM Semantic reasoning block =====
"""
This module is achieved through bi-transformers,
ngram_feature1 is the froward one, ngram_fetaure2 is the backward one
"""
pad_idx = self.char_num
gsrm_word_pos = others["gsrm_word_pos"]
gsrm_slf_attn_bias1 = others["gsrm_slf_attn_bias1"]
gsrm_slf_attn_bias2 = others["gsrm_slf_attn_bias2"]
def prepare_bi(word_ids):
"""
prepare bi for gsrm
word1 for forward; word2 for backward
"""
word1 = fluid.layers.cast(word_ids, "float32")
word1 = fluid.layers.pad(word1, [0, 0, 1, 0, 0, 0],
pad_value=1.0 * pad_idx)
word1 = fluid.layers.cast(word1, "int64")
word1 = word1[:, :-1, :]
word2 = word_ids
return word1, word2
word1, word2 = prepare_bi(word_ids)
word1.stop_gradient = True
word2.stop_gradient = True
enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1]
enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2]
gsrm_feature1 = wrap_encoder(
src_vocab_size=self.char_num + 1,
max_length=self.max_length,
n_layer=self.num_decoder_TUs,
n_head=self.num_heads,
d_key=int(self.hidden_dims / self.num_heads),
d_value=int(self.hidden_dims / self.num_heads),
d_model=self.hidden_dims,
d_inner_hid=self.hidden_dims,
prepostprocess_dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
preprocess_cmd="n",
postprocess_cmd="da",
weight_sharing=True,
enc_inputs=enc_inputs_1, )
gsrm_feature2 = wrap_encoder(
src_vocab_size=self.char_num + 1,
max_length=self.max_length,
n_layer=self.num_decoder_TUs,
n_head=self.num_heads,
d_key=int(self.hidden_dims / self.num_heads),
d_value=int(self.hidden_dims / self.num_heads),
d_model=self.hidden_dims,
d_inner_hid=self.hidden_dims,
prepostprocess_dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
preprocess_cmd="n",
postprocess_cmd="da",
weight_sharing=True,
enc_inputs=enc_inputs_2, )
gsrm_feature2 = fluid.layers.pad(gsrm_feature2, [0, 0, 0, 1, 0, 0],
pad_value=0.)
gsrm_feature2 = gsrm_feature2[:, 1:, ]
gsrm_features = gsrm_feature1 + gsrm_feature2
b, t, c = gsrm_features.shape
gsrm_out = fluid.layers.matmul(
x=gsrm_features,
y=fluid.default_main_program().global_block().var(
"src_word_emb_table"),
transpose_y=True)
b, t, c = gsrm_out.shape
gsrm_out = fluid.layers.softmax(input=fluid.layers.reshape(gsrm_out,
[-1, c]))
return gsrm_features, word_out, gsrm_out
def vsfd(self, pvam_features, gsrm_features):
#===== Visual-Semantic Fusion Decoder Module =====
b, t, c1 = pvam_features.shape
b, t, c2 = gsrm_features.shape
combine_features_ = fluid.layers.concat(
[pvam_features, gsrm_features], axis=2)
img_comb_features_ = fluid.layers.reshape(
x=combine_features_, shape=[-1, c1 + c2])
img_comb_features_map = fluid.layers.fc(input=img_comb_features_,
size=c1,
act="sigmoid")
img_comb_features_map = fluid.layers.reshape(
x=img_comb_features_map, shape=[-1, t, c1])
combine_features = img_comb_features_map * pvam_features + (
1.0 - img_comb_features_map) * gsrm_features
img_comb_features = fluid.layers.reshape(
x=combine_features, shape=[-1, c1])
fc_out = fluid.layers.fc(input=img_comb_features,
size=self.char_num,
act="softmax")
return fc_out
def __call__(self, inputs, others, mode=None):
pvam_features = self.pvam(inputs, others)
gsrm_features, word_out, gsrm_out = self.gsrm(pvam_features, others)
final_out = self.vsfd(pvam_features, gsrm_features)
_, decoded_out = fluid.layers.topk(input=final_out, k=1)
predicts = {
'predict': final_out,
'decoded_out': decoded_out,
'word_out': word_out,
'gsrm_out': gsrm_out
}
return predicts
This diff is collapsed.
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
class SASTLoss(object):
"""
SAST Loss function
"""
def __init__(self, params=None):
super(SASTLoss, self).__init__()
def __call__(self, predicts, labels):
"""
tcl_pos: N x 128 x 3
tcl_mask: N x 128 x 1
tcl_label: N x X list or LoDTensor
"""
f_score = predicts['f_score']
f_border = predicts['f_border']
f_tvo = predicts['f_tvo']
f_tco = predicts['f_tco']
l_score = labels['input_score']
l_border = labels['input_border']
l_mask = labels['input_mask']
l_tvo = labels['input_tvo']
l_tco = labels['input_tco']
#score_loss
intersection = fluid.layers.reduce_sum(f_score * l_score * l_mask)
union = fluid.layers.reduce_sum(f_score * l_mask) + fluid.layers.reduce_sum(l_score * l_mask)
score_loss = 1.0 - 2 * intersection / (union + 1e-5)
#border loss
l_border_split, l_border_norm = fluid.layers.split(l_border, num_or_sections=[4, 1], dim=1)
f_border_split = f_border
l_border_norm_split = fluid.layers.expand(x=l_border_norm, expand_times=[1, 4, 1, 1])
l_border_score = fluid.layers.expand(x=l_score, expand_times=[1, 4, 1, 1])
l_border_mask = fluid.layers.expand(x=l_mask, expand_times=[1, 4, 1, 1])
border_diff = l_border_split - f_border_split
abs_border_diff = fluid.layers.abs(border_diff)
border_sign = abs_border_diff < 1.0
border_sign = fluid.layers.cast(border_sign, dtype='float32')
border_sign.stop_gradient = True
border_in_loss = 0.5 * abs_border_diff * abs_border_diff * border_sign + \
(abs_border_diff - 0.5) * (1.0 - border_sign)
border_out_loss = l_border_norm_split * border_in_loss
border_loss = fluid.layers.reduce_sum(border_out_loss * l_border_score * l_border_mask) / \
(fluid.layers.reduce_sum(l_border_score * l_border_mask) + 1e-5)
#tvo_loss
l_tvo_split, l_tvo_norm = fluid.layers.split(l_tvo, num_or_sections=[8, 1], dim=1)
f_tvo_split = f_tvo
l_tvo_norm_split = fluid.layers.expand(x=l_tvo_norm, expand_times=[1, 8, 1, 1])
l_tvo_score = fluid.layers.expand(x=l_score, expand_times=[1, 8, 1, 1])
l_tvo_mask = fluid.layers.expand(x=l_mask, expand_times=[1, 8, 1, 1])
#
tvo_geo_diff = l_tvo_split - f_tvo_split
abs_tvo_geo_diff = fluid.layers.abs(tvo_geo_diff)
tvo_sign = abs_tvo_geo_diff < 1.0
tvo_sign = fluid.layers.cast(tvo_sign, dtype='float32')
tvo_sign.stop_gradient = True
tvo_in_loss = 0.5 * abs_tvo_geo_diff * abs_tvo_geo_diff * tvo_sign + \
(abs_tvo_geo_diff - 0.5) * (1.0 - tvo_sign)
tvo_out_loss = l_tvo_norm_split * tvo_in_loss
tvo_loss = fluid.layers.reduce_sum(tvo_out_loss * l_tvo_score * l_tvo_mask) / \
(fluid.layers.reduce_sum(l_tvo_score * l_tvo_mask) + 1e-5)
#tco_loss
l_tco_split, l_tco_norm = fluid.layers.split(l_tco, num_or_sections=[2, 1], dim=1)
f_tco_split = f_tco
l_tco_norm_split = fluid.layers.expand(x=l_tco_norm, expand_times=[1, 2, 1, 1])
l_tco_score = fluid.layers.expand(x=l_score, expand_times=[1, 2, 1, 1])
l_tco_mask = fluid.layers.expand(x=l_mask, expand_times=[1, 2, 1, 1])
#
tco_geo_diff = l_tco_split - f_tco_split
abs_tco_geo_diff = fluid.layers.abs(tco_geo_diff)
tco_sign = abs_tco_geo_diff < 1.0
tco_sign = fluid.layers.cast(tco_sign, dtype='float32')
tco_sign.stop_gradient = True
tco_in_loss = 0.5 * abs_tco_geo_diff * abs_tco_geo_diff * tco_sign + \
(abs_tco_geo_diff - 0.5) * (1.0 - tco_sign)
tco_out_loss = l_tco_norm_split * tco_in_loss
tco_loss = fluid.layers.reduce_sum(tco_out_loss * l_tco_score * l_tco_mask) / \
(fluid.layers.reduce_sum(l_tco_score * l_tco_mask) + 1e-5)
# total loss
tvo_lw, tco_lw = 1.5, 1.5
score_lw, border_lw = 1.0, 1.0
total_loss = score_loss * score_lw + border_loss * border_lw + \
tvo_loss * tvo_lw + tco_loss * tco_lw
losses = {'total_loss':total_loss, "score_loss":score_loss,\
"border_loss":border_loss, 'tvo_loss':tvo_loss, 'tco_loss':tco_loss}
return losses
\ No newline at end of file
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
class SRNLoss(object):
def __init__(self, params):
super(SRNLoss, self).__init__()
self.char_num = params['char_num']
def __call__(self, predicts, others):
predict = predicts['predict']
word_predict = predicts['word_out']
gsrm_predict = predicts['gsrm_out']
label = others['label']
lbl_weight = others['lbl_weight']
casted_label = fluid.layers.cast(x=label, dtype='int64')
cost_word = fluid.layers.cross_entropy(
input=word_predict, label=casted_label)
cost_gsrm = fluid.layers.cross_entropy(
input=gsrm_predict, label=casted_label)
cost_vsfd = fluid.layers.cross_entropy(
input=predict, label=casted_label)
cost_word = fluid.layers.reshape(
x=fluid.layers.reduce_sum(cost_word), shape=[1])
cost_gsrm = fluid.layers.reshape(
x=fluid.layers.reduce_sum(cost_gsrm), shape=[1])
cost_vsfd = fluid.layers.reshape(
x=fluid.layers.reduce_sum(cost_vsfd), shape=[1])
sum_cost = fluid.layers.sum(
[cost_word, cost_vsfd * 2.0, cost_gsrm * 0.15])
return [sum_cost, cost_vsfd, cost_word]
...@@ -65,3 +65,44 @@ def AdamDecay(params, parameter_list=None): ...@@ -65,3 +65,44 @@ def AdamDecay(params, parameter_list=None):
regularization=L2Decay(regularization_coeff=l2_decay), regularization=L2Decay(regularization_coeff=l2_decay),
parameter_list=parameter_list) parameter_list=parameter_list)
return optimizer return optimizer
def RMSProp(params, parameter_list=None):
"""
define optimizer function
args:
params(dict): the super parameters
parameter_list (list): list of Variable names to update to minimize loss
return:
"""
base_lr = params.get("base_lr", 0.001)
l2_decay = params.get("l2_decay", 0.00005)
if 'decay' in params:
supported_decay_mode = ["cosine_decay", "piecewise_decay"]
params = params['decay']
decay_mode = params['function']
assert decay_mode in supported_decay_mode, "Supported decay mode is {}, but got {}".format(
supported_decay_mode, decay_mode)
if decay_mode == "cosine_decay":
step_each_epoch = params['step_each_epoch']
total_epoch = params['total_epoch']
base_lr = fluid.layers.cosine_decay(
learning_rate=base_lr,
step_each_epoch=step_each_epoch,
epochs=total_epoch)
elif decay_mode == "piecewise_decay":
boundaries = params["boundaries"]
decay_rate = params["decay_rate"]
values = [
base_lr * decay_rate**idx
for idx in range(len(boundaries) + 1)
]
base_lr = fluid.layers.piecewise_decay(boundaries, values)
optimizer = fluid.optimizer.RMSProp(
learning_rate=base_lr,
regularization=fluid.regularizer.L2Decay(regularization_coeff=l2_decay))
return optimizer
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
__dir__ = os.path.dirname(__file__)
sys.path.append(__dir__)
sys.path.append(os.path.join(__dir__, '..'))
import numpy as np
from .locality_aware_nms import nms_locality
# import lanms
import cv2
import time
class SASTPostProcess(object):
"""
The post process for SAST.
"""
def __init__(self, params):
self.score_thresh = params.get('score_thresh', 0.5)
self.nms_thresh = params.get('nms_thresh', 0.2)
self.sample_pts_num = params.get('sample_pts_num', 2)
self.shrink_ratio_of_width = params.get('shrink_ratio_of_width', 0.3)
self.expand_scale = params.get('expand_scale', 1.0)
self.tcl_map_thresh = 0.5
# c++ la-nms is faster, but only support python 3.5
self.is_python35 = False
if sys.version_info.major == 3 and sys.version_info.minor == 5:
self.is_python35 = True
def point_pair2poly(self, point_pair_list):
"""
Transfer vertical point_pairs into poly point in clockwise.
"""
# constract poly
point_num = len(point_pair_list) * 2
point_list = [0] * point_num
for idx, point_pair in enumerate(point_pair_list):
point_list[idx] = point_pair[0]
point_list[point_num - 1 - idx] = point_pair[1]
return np.array(point_list).reshape(-1, 2)
def shrink_quad_along_width(self, quad, begin_width_ratio=0., end_width_ratio=1.):
"""
Generate shrink_quad_along_width.
"""
ratio_pair = np.array([[begin_width_ratio], [end_width_ratio]], dtype=np.float32)
p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair
p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair
return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]])
def expand_poly_along_width(self, poly, shrink_ratio_of_width=0.3):
"""
expand poly along width.
"""
point_num = poly.shape[0]
left_quad = np.array([poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32)
left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \
(np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6)
left_quad_expand = self.shrink_quad_along_width(left_quad, left_ratio, 1.0)
right_quad = np.array([poly[point_num // 2 - 2], poly[point_num // 2 - 1],
poly[point_num // 2], poly[point_num // 2 + 1]], dtype=np.float32)
right_ratio = 1.0 + \
shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \
(np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6)
right_quad_expand = self.shrink_quad_along_width(right_quad, 0.0, right_ratio)
poly[0] = left_quad_expand[0]
poly[-1] = left_quad_expand[-1]
poly[point_num // 2 - 1] = right_quad_expand[1]
poly[point_num // 2] = right_quad_expand[2]
return poly
def restore_quad(self, tcl_map, tcl_map_thresh, tvo_map):
"""Restore quad."""
xy_text = np.argwhere(tcl_map[:, :, 0] > tcl_map_thresh)
xy_text = xy_text[:, ::-1] # (n, 2)
# Sort the text boxes via the y axis
xy_text = xy_text[np.argsort(xy_text[:, 1])]
scores = tcl_map[xy_text[:, 1], xy_text[:, 0], 0]
scores = scores[:, np.newaxis]
# Restore
point_num = int(tvo_map.shape[-1] / 2)
assert point_num == 4
tvo_map = tvo_map[xy_text[:, 1], xy_text[:, 0], :]
xy_text_tile = np.tile(xy_text, (1, point_num)) # (n, point_num * 2)
quads = xy_text_tile - tvo_map
return scores, quads, xy_text
def quad_area(self, quad):
"""
compute area of a quad.
"""
edge = [
(quad[1][0] - quad[0][0]) * (quad[1][1] + quad[0][1]),
(quad[2][0] - quad[1][0]) * (quad[2][1] + quad[1][1]),
(quad[3][0] - quad[2][0]) * (quad[3][1] + quad[2][1]),
(quad[0][0] - quad[3][0]) * (quad[0][1] + quad[3][1])
]
return np.sum(edge) / 2.
def nms(self, dets):
if self.is_python35:
import lanms
dets = lanms.merge_quadrangle_n9(dets, self.nms_thresh)
else:
dets = nms_locality(dets, self.nms_thresh)
return dets
def cluster_by_quads_tco(self, tcl_map, tcl_map_thresh, quads, tco_map):
"""
Cluster pixels in tcl_map based on quads.
"""
instance_count = quads.shape[0] + 1 # contain background
instance_label_map = np.zeros(tcl_map.shape[:2], dtype=np.int32)
if instance_count == 1:
return instance_count, instance_label_map
# predict text center
xy_text = np.argwhere(tcl_map[:, :, 0] > tcl_map_thresh)
n = xy_text.shape[0]
xy_text = xy_text[:, ::-1] # (n, 2)
tco = tco_map[xy_text[:, 1], xy_text[:, 0], :] # (n, 2)
pred_tc = xy_text - tco
# get gt text center
m = quads.shape[0]
gt_tc = np.mean(quads, axis=1) # (m, 2)
pred_tc_tile = np.tile(pred_tc[:, np.newaxis, :], (1, m, 1)) # (n, m, 2)
gt_tc_tile = np.tile(gt_tc[np.newaxis, :, :], (n, 1, 1)) # (n, m, 2)
dist_mat = np.linalg.norm(pred_tc_tile - gt_tc_tile, axis=2) # (n, m)
xy_text_assign = np.argmin(dist_mat, axis=1) + 1 # (n,)
instance_label_map[xy_text[:, 1], xy_text[:, 0]] = xy_text_assign
return instance_count, instance_label_map
def estimate_sample_pts_num(self, quad, xy_text):
"""
Estimate sample points number.
"""
eh = (np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[1] - quad[2])) / 2.0
ew = (np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[2] - quad[3])) / 2.0
dense_sample_pts_num = max(2, int(ew))
dense_xy_center_line = xy_text[np.linspace(0, xy_text.shape[0] - 1, dense_sample_pts_num,
endpoint=True, dtype=np.float32).astype(np.int32)]
dense_xy_center_line_diff = dense_xy_center_line[1:] - dense_xy_center_line[:-1]
estimate_arc_len = np.sum(np.linalg.norm(dense_xy_center_line_diff, axis=1))
sample_pts_num = max(2, int(estimate_arc_len / eh))
return sample_pts_num
def detect_sast(self, tcl_map, tvo_map, tbo_map, tco_map, ratio_w, ratio_h, src_w, src_h,
shrink_ratio_of_width=0.3, tcl_map_thresh=0.5, offset_expand=1.0, out_strid=4.0):
"""
first resize the tcl_map, tvo_map and tbo_map to the input_size, then restore the polys
"""
# restore quad
scores, quads, xy_text = self.restore_quad(tcl_map, tcl_map_thresh, tvo_map)
dets = np.hstack((quads, scores)).astype(np.float32, copy=False)
dets = self.nms(dets)
if dets.shape[0] == 0:
return []
quads = dets[:, :-1].reshape(-1, 4, 2)
# Compute quad area
quad_areas = []
for quad in quads:
quad_areas.append(-self.quad_area(quad))
# instance segmentation
# instance_count, instance_label_map = cv2.connectedComponents(tcl_map.astype(np.uint8), connectivity=8)
instance_count, instance_label_map = self.cluster_by_quads_tco(tcl_map, tcl_map_thresh, quads, tco_map)
# restore single poly with tcl instance.
poly_list = []
for instance_idx in range(1, instance_count):
xy_text = np.argwhere(instance_label_map == instance_idx)[:, ::-1]
quad = quads[instance_idx - 1]
q_area = quad_areas[instance_idx - 1]
if q_area < 5:
continue
#
len1 = float(np.linalg.norm(quad[0] -quad[1]))
len2 = float(np.linalg.norm(quad[1] -quad[2]))
min_len = min(len1, len2)
if min_len < 3:
continue
# filter small CC
if xy_text.shape[0] <= 0:
continue
# filter low confidence instance
xy_text_scores = tcl_map[xy_text[:, 1], xy_text[:, 0], 0]
if np.sum(xy_text_scores) / quad_areas[instance_idx - 1] < 0.1:
# if np.sum(xy_text_scores) / quad_areas[instance_idx - 1] < 0.05:
continue
# sort xy_text
left_center_pt = np.array([[(quad[0, 0] + quad[-1, 0]) / 2.0,
(quad[0, 1] + quad[-1, 1]) / 2.0]]) # (1, 2)
right_center_pt = np.array([[(quad[1, 0] + quad[2, 0]) / 2.0,
(quad[1, 1] + quad[2, 1]) / 2.0]]) # (1, 2)
proj_unit_vec = (right_center_pt - left_center_pt) / \
(np.linalg.norm(right_center_pt - left_center_pt) + 1e-6)
proj_value = np.sum(xy_text * proj_unit_vec, axis=1)
xy_text = xy_text[np.argsort(proj_value)]
# Sample pts in tcl map
if self.sample_pts_num == 0:
sample_pts_num = self.estimate_sample_pts_num(quad, xy_text)
else:
sample_pts_num = self.sample_pts_num
xy_center_line = xy_text[np.linspace(0, xy_text.shape[0] - 1, sample_pts_num,
endpoint=True, dtype=np.float32).astype(np.int32)]
point_pair_list = []
for x, y in xy_center_line:
# get corresponding offset
offset = tbo_map[y, x, :].reshape(2, 2)
if offset_expand != 1.0:
offset_length = np.linalg.norm(offset, axis=1, keepdims=True)
expand_length = np.clip(offset_length * (offset_expand - 1), a_min=0.5, a_max=3.0)
offset_detal = offset / offset_length * expand_length
offset = offset + offset_detal
# original point
ori_yx = np.array([y, x], dtype=np.float32)
point_pair = (ori_yx + offset)[:, ::-1]* out_strid / np.array([ratio_w, ratio_h]).reshape(-1, 2)
point_pair_list.append(point_pair)
# ndarry: (x, 2), expand poly along width
detected_poly = self.point_pair2poly(point_pair_list)
detected_poly = self.expand_poly_along_width(detected_poly, shrink_ratio_of_width)
detected_poly[:, 0] = np.clip(detected_poly[:, 0], a_min=0, a_max=src_w)
detected_poly[:, 1] = np.clip(detected_poly[:, 1], a_min=0, a_max=src_h)
poly_list.append(detected_poly)
return poly_list
def __call__(self, outs_dict, ratio_list):
score_list = outs_dict['f_score']
border_list = outs_dict['f_border']
tvo_list = outs_dict['f_tvo']
tco_list = outs_dict['f_tco']
img_num = len(ratio_list)
poly_lists = []
for ino in range(img_num):
p_score = score_list[ino].transpose((1,2,0))
p_border = border_list[ino].transpose((1,2,0))
p_tvo = tvo_list[ino].transpose((1,2,0))
p_tco = tco_list[ino].transpose((1,2,0))
# print(p_score.shape, p_border.shape, p_tvo.shape, p_tco.shape)
ratio_h, ratio_w, src_h, src_w = ratio_list[ino]
poly_list = self.detect_sast(p_score, p_tvo, p_border, p_tco, ratio_w, ratio_h, src_w, src_h,
shrink_ratio_of_width=self.shrink_ratio_of_width,
tcl_map_thresh=self.tcl_map_thresh, offset_expand=self.expand_scale)
poly_lists.append(poly_list)
return poly_lists
...@@ -25,6 +25,9 @@ class CharacterOps(object): ...@@ -25,6 +25,9 @@ class CharacterOps(object):
def __init__(self, config): def __init__(self, config):
self.character_type = config['character_type'] self.character_type = config['character_type']
self.loss_type = config['loss_type'] self.loss_type = config['loss_type']
self.max_text_len = config['max_text_length']
if self.loss_type == "srn" and self.character_type != "en":
raise Exception("SRN can only support in character_type == en")
if self.character_type == "en": if self.character_type == "en":
self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
dict_character = list(self.character_str) dict_character = list(self.character_str)
...@@ -54,6 +57,8 @@ class CharacterOps(object): ...@@ -54,6 +57,8 @@ class CharacterOps(object):
self.end_str = "eos" self.end_str = "eos"
if self.loss_type == "attention": if self.loss_type == "attention":
dict_character = [self.beg_str, self.end_str] + dict_character dict_character = [self.beg_str, self.end_str] + dict_character
elif self.loss_type == "srn":
dict_character = dict_character + [self.beg_str, self.end_str]
self.dict = {} self.dict = {}
for i, char in enumerate(dict_character): for i, char in enumerate(dict_character):
self.dict[char] = i self.dict[char] = i
...@@ -147,6 +152,39 @@ def cal_predicts_accuracy(char_ops, ...@@ -147,6 +152,39 @@ def cal_predicts_accuracy(char_ops,
return acc, acc_num, img_num return acc, acc_num, img_num
def cal_predicts_accuracy_srn(char_ops,
preds,
labels,
max_text_len,
is_debug=False):
acc_num = 0
img_num = 0
total_len = preds.shape[0]
img_num = int(total_len / max_text_len)
for i in range(img_num):
cur_label = []
cur_pred = []
for j in range(max_text_len):
if labels[j + i * max_text_len] != 37: #0
cur_label.append(labels[j + i * max_text_len][0])
else:
break
for j in range(max_text_len + 1):
if j < len(cur_label) and preds[j + i * max_text_len][
0] != cur_label[j]:
break
elif j == len(cur_label) and j == max_text_len:
acc_num += 1
break
elif j == len(cur_label) and preds[j + i * max_text_len][0] == 37:
acc_num += 1
break
acc = acc_num * 1.0 / img_num
return acc, acc_num, img_num
def convert_rec_attention_infer_res(preds): def convert_rec_attention_infer_res(preds):
img_num = preds.shape[0] img_num = preds.shape[0]
target_lod = [0] target_lod = [0]
......
File mode changed from 100755 to 100644
...@@ -88,8 +88,8 @@ class DetectionIoUEvaluator(object): ...@@ -88,8 +88,8 @@ class DetectionIoUEvaluator(object):
points = gt[n]['points'] points = gt[n]['points']
# transcription = gt[n]['text'] # transcription = gt[n]['text']
dontCare = gt[n]['ignore'] dontCare = gt[n]['ignore']
points = Polygon(points) # points = Polygon(points)
points = points.buffer(0) # points = points.buffer(0)
if not Polygon(points).is_valid or not Polygon(points).is_simple: if not Polygon(points).is_valid or not Polygon(points).is_simple:
continue continue
...@@ -105,8 +105,8 @@ class DetectionIoUEvaluator(object): ...@@ -105,8 +105,8 @@ class DetectionIoUEvaluator(object):
for n in range(len(pred)): for n in range(len(pred)):
points = pred[n]['points'] points = pred[n]['points']
points = Polygon(points) # points = Polygon(points)
points = points.buffer(0) # points = points.buffer(0)
if not Polygon(points).is_valid or not Polygon(points).is_simple: if not Polygon(points).is_valid or not Polygon(points).is_simple:
continue continue
......
...@@ -29,7 +29,7 @@ FORMAT = '%(asctime)s-%(levelname)s: %(message)s' ...@@ -29,7 +29,7 @@ FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT) logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from ppocr.utils.character import cal_predicts_accuracy from ppocr.utils.character import cal_predicts_accuracy, cal_predicts_accuracy_srn
from ppocr.utils.character import convert_rec_label_to_lod from ppocr.utils.character import convert_rec_label_to_lod
from ppocr.utils.character import convert_rec_attention_infer_res from ppocr.utils.character import convert_rec_attention_infer_res
from ppocr.utils.utility import create_module from ppocr.utils.utility import create_module
...@@ -60,22 +60,60 @@ def eval_rec_run(exe, config, eval_info_dict, mode): ...@@ -60,22 +60,60 @@ def eval_rec_run(exe, config, eval_info_dict, mode):
for ino in range(img_num): for ino in range(img_num):
img_list.append(data[ino][0]) img_list.append(data[ino][0])
label_list.append(data[ino][1]) label_list.append(data[ino][1])
img_list = np.concatenate(img_list, axis=0)
outs = exe.run(eval_info_dict['program'], \ if config['Global']['loss_type'] != "srn":
img_list = np.concatenate(img_list, axis=0)
outs = exe.run(eval_info_dict['program'], \
feed={'image': img_list}, \ feed={'image': img_list}, \
fetch_list=eval_info_dict['fetch_varname_list'], \ fetch_list=eval_info_dict['fetch_varname_list'], \
return_numpy=False) return_numpy=False)
preds = np.array(outs[0]) preds = np.array(outs[0])
if preds.shape[1] != 1:
preds, preds_lod = convert_rec_attention_infer_res(preds) if config['Global']['loss_type'] == "attention":
preds, preds_lod = convert_rec_attention_infer_res(preds)
else:
preds_lod = outs[0].lod()[0]
labels, labels_lod = convert_rec_label_to_lod(label_list)
acc, acc_num, sample_num = cal_predicts_accuracy(
char_ops, preds, preds_lod, labels, labels_lod,
is_remove_duplicate)
else: else:
preds_lod = outs[0].lod()[0] encoder_word_pos_list = []
labels, labels_lod = convert_rec_label_to_lod(label_list) gsrm_word_pos_list = []
acc, acc_num, sample_num = cal_predicts_accuracy( gsrm_slf_attn_bias1_list = []
char_ops, preds, preds_lod, labels, labels_lod, is_remove_duplicate) gsrm_slf_attn_bias2_list = []
for ino in range(img_num):
encoder_word_pos_list.append(data[ino][2])
gsrm_word_pos_list.append(data[ino][3])
gsrm_slf_attn_bias1_list.append(data[ino][4])
gsrm_slf_attn_bias2_list.append(data[ino][5])
img_list = np.concatenate(img_list, axis=0)
label_list = np.concatenate(label_list, axis=0)
encoder_word_pos_list = np.concatenate(
encoder_word_pos_list, axis=0).astype(np.int64)
gsrm_word_pos_list = np.concatenate(
gsrm_word_pos_list, axis=0).astype(np.int64)
gsrm_slf_attn_bias1_list = np.concatenate(
gsrm_slf_attn_bias1_list, axis=0).astype(np.float32)
gsrm_slf_attn_bias2_list = np.concatenate(
gsrm_slf_attn_bias2_list, axis=0).astype(np.float32)
labels = label_list
outs = exe.run(eval_info_dict['program'], \
feed={'image': img_list, 'encoder_word_pos': encoder_word_pos_list,
'gsrm_word_pos': gsrm_word_pos_list, 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1_list,
'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2_list}, \
fetch_list=eval_info_dict['fetch_varname_list'], \
return_numpy=False)
preds = np.array(outs[0])
acc, acc_num, sample_num = cal_predicts_accuracy_srn(
char_ops, preds, labels, config['Global']['max_text_length'])
total_acc_num += acc_num total_acc_num += acc_num
total_sample_num += sample_num total_sample_num += sample_num
logger.info("eval batch id: {}, acc: {}".format(total_batch_num, acc)) #logger.info("eval batch id: {}, acc: {}".format(total_batch_num, acc))
total_batch_num += 1 total_batch_num += 1
avg_acc = total_acc_num * 1.0 / total_sample_num avg_acc = total_acc_num * 1.0 / total_sample_num
metrics = {'avg_acc': avg_acc, "total_acc_num": total_acc_num, \ metrics = {'avg_acc': avg_acc, "total_acc_num": total_acc_num, \
......
...@@ -40,7 +40,8 @@ class TextRecognizer(object): ...@@ -40,7 +40,8 @@ class TextRecognizer(object):
char_ops_params = { char_ops_params = {
"character_type": args.rec_char_type, "character_type": args.rec_char_type,
"character_dict_path": args.rec_char_dict_path, "character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char "use_space_char": args.use_space_char,
"max_text_length": args.max_text_length
} }
if self.rec_algorithm != "RARE": if self.rec_algorithm != "RARE":
char_ops_params['loss_type'] = 'ctc' char_ops_params['loss_type'] = 'ctc'
......
...@@ -59,6 +59,7 @@ def parse_args(): ...@@ -59,6 +59,7 @@ def parse_args():
parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320") parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320")
parser.add_argument("--rec_char_type", type=str, default='ch') parser.add_argument("--rec_char_type", type=str, default='ch')
parser.add_argument("--rec_batch_num", type=int, default=30) parser.add_argument("--rec_batch_num", type=int, default=30)
parser.add_argument("--max_text_length", type=int, default=25)
parser.add_argument( parser.add_argument(
"--rec_char_dict_path", "--rec_char_dict_path",
type=str, type=str,
......
...@@ -64,7 +64,6 @@ def main(): ...@@ -64,7 +64,6 @@ def main():
exe = fluid.Executor(place) exe = fluid.Executor(place)
rec_model = create_module(config['Architecture']['function'])(params=config) rec_model = create_module(config['Architecture']['function'])(params=config)
startup_prog = fluid.Program() startup_prog = fluid.Program()
eval_prog = fluid.Program() eval_prog = fluid.Program()
with fluid.program_guard(eval_prog, startup_prog): with fluid.program_guard(eval_prog, startup_prog):
...@@ -86,10 +85,36 @@ def main(): ...@@ -86,10 +85,36 @@ def main():
for i in range(max_img_num): for i in range(max_img_num):
logger.info("infer_img:%s" % infer_list[i]) logger.info("infer_img:%s" % infer_list[i])
img = next(blobs) img = next(blobs)
predict = exe.run(program=eval_prog, if loss_type != "srn":
feed={"image": img}, predict = exe.run(program=eval_prog,
fetch_list=fetch_varname_list, feed={"image": img},
return_numpy=False) fetch_list=fetch_varname_list,
return_numpy=False)
else:
encoder_word_pos_list = []
gsrm_word_pos_list = []
gsrm_slf_attn_bias1_list = []
gsrm_slf_attn_bias2_list = []
encoder_word_pos_list.append(img[1])
gsrm_word_pos_list.append(img[2])
gsrm_slf_attn_bias1_list.append(img[3])
gsrm_slf_attn_bias2_list.append(img[4])
encoder_word_pos_list = np.concatenate(
encoder_word_pos_list, axis=0).astype(np.int64)
gsrm_word_pos_list = np.concatenate(
gsrm_word_pos_list, axis=0).astype(np.int64)
gsrm_slf_attn_bias1_list = np.concatenate(
gsrm_slf_attn_bias1_list, axis=0).astype(np.float32)
gsrm_slf_attn_bias2_list = np.concatenate(
gsrm_slf_attn_bias2_list, axis=0).astype(np.float32)
predict = exe.run(program=eval_prog, \
feed={'image': img[0], 'encoder_word_pos': encoder_word_pos_list,
'gsrm_word_pos': gsrm_word_pos_list, 'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1_list,
'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2_list}, \
fetch_list=fetch_varname_list, \
return_numpy=False)
if loss_type == "ctc": if loss_type == "ctc":
preds = np.array(predict[0]) preds = np.array(predict[0])
preds = preds.reshape(-1) preds = preds.reshape(-1)
...@@ -114,7 +139,18 @@ def main(): ...@@ -114,7 +139,18 @@ def main():
score = np.mean(probs[0, 1:end_pos[1]]) score = np.mean(probs[0, 1:end_pos[1]])
preds = preds.reshape(-1) preds = preds.reshape(-1)
preds_text = char_ops.decode(preds) preds_text = char_ops.decode(preds)
elif loss_type == "srn":
cur_pred = []
preds = np.array(predict[0])
preds = preds.reshape(-1)
probs = np.array(predict[1])
ind = np.argmax(probs, axis=1)
valid_ind = np.where(preds != 37)[0]
if len(valid_ind) == 0:
continue
score = np.mean(probs[valid_ind, ind[valid_ind]])
preds = preds[:valid_ind[-1] + 1]
preds_text = char_ops.decode(preds)
logger.info("\t index: {}".format(preds)) logger.info("\t index: {}".format(preds))
logger.info("\t word : {}".format(preds_text)) logger.info("\t word : {}".format(preds_text))
logger.info("\t score: {}".format(score)) logger.info("\t score: {}".format(score))
......
...@@ -32,7 +32,8 @@ from eval_utils.eval_det_utils import eval_det_run ...@@ -32,7 +32,8 @@ from eval_utils.eval_det_utils import eval_det_run
from eval_utils.eval_rec_utils import eval_rec_run from eval_utils.eval_rec_utils import eval_rec_run
from ppocr.utils.save_load import save_model from ppocr.utils.save_load import save_model
import numpy as np import numpy as np
from ppocr.utils.character import cal_predicts_accuracy, CharacterOps from ppocr.utils.character import cal_predicts_accuracy, cal_predicts_accuracy_srn, CharacterOps
class ArgsParser(ArgumentParser): class ArgsParser(ArgumentParser):
def __init__(self): def __init__(self):
...@@ -81,10 +82,8 @@ default_config = {'Global': {'debug': False, }} ...@@ -81,10 +82,8 @@ default_config = {'Global': {'debug': False, }}
def load_config(file_path): def load_config(file_path):
""" """
Load config from yml/yaml file. Load config from yml/yaml file.
Args: Args:
file_path (str): Path of the config file to be loaded. file_path (str): Path of the config file to be loaded.
Returns: global config Returns: global config
""" """
merge_config(default_config) merge_config(default_config)
...@@ -103,10 +102,8 @@ def load_config(file_path): ...@@ -103,10 +102,8 @@ def load_config(file_path):
def merge_config(config): def merge_config(config):
""" """
Merge config into global config. Merge config into global config.
Args: Args:
config (dict): Config to be merged. config (dict): Config to be merged.
Returns: global config Returns: global config
""" """
for key, value in config.items(): for key, value in config.items():
...@@ -157,13 +154,11 @@ def build(config, main_prog, startup_prog, mode): ...@@ -157,13 +154,11 @@ def build(config, main_prog, startup_prog, mode):
3. create a model 3. create a model
4. create fetchs 4. create fetchs
5. create an optimizer 5. create an optimizer
Args: Args:
config(dict): config config(dict): config
main_prog(): main program main_prog(): main program
startup_prog(): startup program startup_prog(): startup program
is_train(bool): train or valid is_train(bool): train or valid
Returns: Returns:
dataloader(): a bridge between the model and the data dataloader(): a bridge between the model and the data
fetchs(dict): dict of model outputs(included loss and measures) fetchs(dict): dict of model outputs(included loss and measures)
...@@ -176,8 +171,16 @@ def build(config, main_prog, startup_prog, mode): ...@@ -176,8 +171,16 @@ def build(config, main_prog, startup_prog, mode):
fetch_name_list = list(outputs.keys()) fetch_name_list = list(outputs.keys())
fetch_varname_list = [outputs[v].name for v in fetch_name_list] fetch_varname_list = [outputs[v].name for v in fetch_name_list]
opt_loss_name = None opt_loss_name = None
model_average = None
img_loss_name = None
word_loss_name = None
if mode == "train": if mode == "train":
opt_loss = outputs['total_loss'] opt_loss = outputs['total_loss']
# srn loss
#img_loss = outputs['img_loss']
#word_loss = outputs['word_loss']
#img_loss_name = img_loss.name
#word_loss_name = word_loss.name
opt_params = config['Optimizer'] opt_params = config['Optimizer']
optimizer = create_module(opt_params['function'])(opt_params) optimizer = create_module(opt_params['function'])(opt_params)
optimizer.minimize(opt_loss) optimizer.minimize(opt_loss)
...@@ -185,7 +188,17 @@ def build(config, main_prog, startup_prog, mode): ...@@ -185,7 +188,17 @@ def build(config, main_prog, startup_prog, mode):
global_lr = optimizer._global_learning_rate() global_lr = optimizer._global_learning_rate()
fetch_name_list.insert(0, "lr") fetch_name_list.insert(0, "lr")
fetch_varname_list.insert(0, global_lr.name) fetch_varname_list.insert(0, global_lr.name)
return (dataloader, fetch_name_list, fetch_varname_list, opt_loss_name) if "loss_type" in config["Global"]:
if config['Global']["loss_type"] == 'srn':
model_average = fluid.optimizer.ModelAverage(
config['Global']['average_window'],
min_average_window=config['Global'][
'min_average_window'],
max_average_window=config['Global'][
'max_average_window'])
return (dataloader, fetch_name_list, fetch_varname_list, opt_loss_name,
model_average)
def build_export(config, main_prog, startup_prog): def build_export(config, main_prog, startup_prog):
...@@ -329,14 +342,20 @@ def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict): ...@@ -329,14 +342,20 @@ def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict):
lr = np.mean(np.array(train_outs[fetch_map['lr']])) lr = np.mean(np.array(train_outs[fetch_map['lr']]))
preds_idx = fetch_map['decoded_out'] preds_idx = fetch_map['decoded_out']
preds = np.array(train_outs[preds_idx]) preds = np.array(train_outs[preds_idx])
preds_lod = train_outs[preds_idx].lod()[0]
labels_idx = fetch_map['label'] labels_idx = fetch_map['label']
labels = np.array(train_outs[labels_idx]) labels = np.array(train_outs[labels_idx])
labels_lod = train_outs[labels_idx].lod()[0]
acc, acc_num, img_num = cal_predicts_accuracy( if config['Global']['loss_type'] != 'srn':
config['Global']['char_ops'], preds, preds_lod, labels, preds_lod = train_outs[preds_idx].lod()[0]
labels_lod) labels_lod = train_outs[labels_idx].lod()[0]
acc, acc_num, img_num = cal_predicts_accuracy(
config['Global']['char_ops'], preds, preds_lod, labels,
labels_lod)
else:
acc, acc_num, img_num = cal_predicts_accuracy_srn(
config['Global']['char_ops'], preds, labels,
config['Global']['max_text_length'])
t2 = time.time() t2 = time.time()
train_batch_elapse = t2 - t1 train_batch_elapse = t2 - t1
stats = {'loss': loss, 'acc': acc} stats = {'loss': loss, 'acc': acc}
...@@ -350,6 +369,9 @@ def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict): ...@@ -350,6 +369,9 @@ def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict):
if train_batch_id > 0 and\ if train_batch_id > 0 and\
train_batch_id % eval_batch_step == 0: train_batch_id % eval_batch_step == 0:
model_average = train_info_dict['model_average']
if model_average != None:
model_average.apply(exe)
metrics = eval_rec_run(exe, config, eval_info_dict, "eval") metrics = eval_rec_run(exe, config, eval_info_dict, "eval")
eval_acc = metrics['avg_acc'] eval_acc = metrics['avg_acc']
eval_sample_num = metrics['total_sample_num'] eval_sample_num = metrics['total_sample_num']
...@@ -375,6 +397,7 @@ def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict): ...@@ -375,6 +397,7 @@ def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict):
save_model(train_info_dict['train_program'], save_path) save_model(train_info_dict['train_program'], save_path)
return return
def preprocess(): def preprocess():
FLAGS = ArgsParser().parse_args() FLAGS = ArgsParser().parse_args()
config = load_config(FLAGS.config) config = load_config(FLAGS.config)
...@@ -386,15 +409,15 @@ def preprocess(): ...@@ -386,15 +409,15 @@ def preprocess():
check_gpu(use_gpu) check_gpu(use_gpu)
alg = config['Global']['algorithm'] alg = config['Global']['algorithm']
assert alg in ['EAST', 'DB', 'Rosetta', 'CRNN', 'STARNet', 'RARE'] assert alg in ['EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN']
if alg in ['Rosetta', 'CRNN', 'STARNet', 'RARE']: if alg in ['Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN']:
config['Global']['char_ops'] = CharacterOps(config['Global']) config['Global']['char_ops'] = CharacterOps(config['Global'])
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
startup_program = fluid.Program() startup_program = fluid.Program()
train_program = fluid.Program() train_program = fluid.Program()
if alg in ['EAST', 'DB']: if alg in ['EAST', 'DB', 'SAST']:
train_alg_type = 'det' train_alg_type = 'det'
else: else:
train_alg_type = 'rec' train_alg_type = 'rec'
......
...@@ -52,6 +52,7 @@ def main(): ...@@ -52,6 +52,7 @@ def main():
train_fetch_name_list = train_build_outputs[1] train_fetch_name_list = train_build_outputs[1]
train_fetch_varname_list = train_build_outputs[2] train_fetch_varname_list = train_build_outputs[2]
train_opt_loss_name = train_build_outputs[3] train_opt_loss_name = train_build_outputs[3]
model_average = train_build_outputs[-1]
eval_program = fluid.Program() eval_program = fluid.Program()
eval_build_outputs = program.build( eval_build_outputs = program.build(
...@@ -85,7 +86,8 @@ def main(): ...@@ -85,7 +86,8 @@ def main():
'train_program':train_program,\ 'train_program':train_program,\
'reader':train_loader,\ 'reader':train_loader,\
'fetch_name_list':train_fetch_name_list,\ 'fetch_name_list':train_fetch_name_list,\
'fetch_varname_list':train_fetch_varname_list} 'fetch_varname_list':train_fetch_varname_list,\
'model_average': model_average}
eval_info_dict = {'program':eval_program,\ eval_info_dict = {'program':eval_program,\
'reader':eval_reader,\ 'reader':eval_reader,\
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment