Commit aad3093a authored by WenmuZhou
Browse files

dygraph first commit

parent 10f7e519
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ["ResNet"]
class ResNet(object):
    """ResNet backbone for the detection module (static-graph / fluid API).

    Uses a "deep stem" of three stacked 3x3 convs instead of a single 7x7
    conv, and for the 34/50-layer configs appends an extra 5th residual
    stage. Every intermediate feature map is returned in a dict so the
    detection neck can pick the scales it needs.
    """

    def __init__(self, params):
        """
        Args:
            params(dict): the super parameters for network build;
                must contain 'layers' (one of 18/34/50/101/152).
        """
        self.layers = params['layers']
        supported_layers = [18, 34, 50, 101, 152]
        assert self.layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, self.layers)
        # True: three-conv 3x3 stem; False: classic single 7x7 stem.
        self.is_3x3 = True

    def __call__(self, input):
        """Build the network.

        Args:
            input: NCHW image variable.

        Returns:
            dict: 'block_0' is the raw input, 'block_1' the stem output,
            and 'block_k' (k >= 2) the output of residual stage k-2.
        """
        layers = self.layers
        is_3x3 = self.is_3x3
        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            # This detection variant appends an extra 3-block stage.
            depth = [3, 4, 6, 3, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        num_filters = [64, 128, 256, 512, 512]
        blocks = {}
        blocks['block_0'] = input
        if not is_3x3:
            conv = self.conv_bn_layer(
                input=input,
                num_filters=64,
                filter_size=7,
                stride=2,
                act='relu')
        else:
            conv = self.conv_bn_layer(
                input=input,
                num_filters=32,
                filter_size=3,
                stride=2,
                act='relu',
                name='conv1_1')
            conv = self.conv_bn_layer(
                input=conv,
                num_filters=32,
                filter_size=3,
                stride=1,
                act='relu',
                name='conv1_2')
            conv = self.conv_bn_layer(
                input=conv,
                num_filters=64,
                filter_size=3,
                stride=1,
                act='relu',
                name='conv1_3')
        blocks['block_1'] = conv
        conv = fluid.layers.pool2d(
            input=conv,
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max')
        if layers >= 50:
            for block in range(len(depth)):
                for i in range(depth[block]):
                    # Deep nets name the long stage-3 blocks "res4b<i>"
                    # to match released pretrained weight names.
                    if layers in [101, 152, 200] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)
                    conv = self.bottleneck_block(
                        input=conv,
                        num_filters=num_filters[block],
                        stride=2 if i == 0 and block != 0 else 1,
                        if_first=block == i == 0,
                        name=conv_name)
                    # Overwritten each iteration; final value is the
                    # stage output.
                    blocks['block_' + str(block + 2)] = conv
        else:
            for block in range(len(depth)):
                for i in range(depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    conv = self.basic_block(
                        input=conv,
                        num_filters=num_filters[block],
                        stride=2 if i == 0 and block != 0 else 1,
                        if_first=block == i == 0,
                        name=conv_name)
                    blocks['block_' + str(block + 2)] = conv
        return blocks

    def conv_bn_layer(self,
                      input,
                      num_filters,
                      filter_size,
                      stride=1,
                      groups=1,
                      act=None,
                      name=None):
        """Bias-free conv with 'same'-style padding, followed by BatchNorm."""
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            # e.g. "res2a_branch2a" -> "bn2a_branch2a" (pretrained naming).
            bn_name = "bn" + name[3:]
        return fluid.layers.batch_norm(
            input=conv,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def conv_bn_layer_new(self,
                          input,
                          num_filters,
                          filter_size,
                          stride=1,
                          groups=1,
                          act=None,
                          name=None):
        """Downsample path used by shortcuts: 2x2 avg-pool, then a
        stride-1 conv + BN (the ResNet-vd trick).

        NOTE(review): the `stride` parameter is accepted but unused (the
        pool is fixed at 2x2 and the conv stride at 1) — kept for
        signature compatibility with conv_bn_layer.
        """
        pool = fluid.layers.pool2d(
            input=input,
            pool_size=2,
            pool_stride=2,
            pool_padding=0,
            pool_type='avg',
            ceil_mode=True)
        conv = fluid.layers.conv2d(
            input=pool,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=1,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        return fluid.layers.batch_norm(
            input=conv,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def shortcut(self, input, ch_out, stride, name, if_first=False):
        """Identity when channels and stride already match, otherwise a
        1x1 projection (pooled vd-style projection except on the very
        first block)."""
        ch_in = input.shape[1]
        if ch_in != ch_out or stride != 1:
            if if_first:
                return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
            else:
                return self.conv_bn_layer_new(
                    input, ch_out, 1, stride, name=name)
        elif if_first:
            return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
        else:
            return input

    def bottleneck_block(self, input, num_filters, stride, name, if_first):
        """1x1 reduce -> 3x3 (carries the stride) -> 1x1 expand (x4),
        plus residual shortcut; ReLU applied after the add."""
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=1,
            act='relu',
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        conv2 = self.conv_bn_layer(
            input=conv1,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None,
            name=name + "_branch2c")
        short = self.shortcut(
            input,
            num_filters * 4,
            stride,
            if_first=if_first,
            name=name + "_branch1")
        return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')

    def basic_block(self, input, num_filters, stride, name, if_first):
        """Two 3x3 convs with a residual shortcut (ResNet-18/34 block)."""
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=3,
            act='relu',
            stride=stride,
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            act=None,
            name=name + "_branch2b")
        short = self.shortcut(
            input,
            num_filters,
            stride,
            if_first=if_first,
            name=name + "_branch1")
        return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import nn
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
from ppocr.modeling.backbones.det_mobilenet_v3 import ResidualUnit, ConvBNLayer, make_divisible
__all__ = [
'MobileNetV3', 'MobileNetV3_small_x0_35', 'MobileNetV3_small_x0_5',
'MobileNetV3_small_x0_75', 'MobileNetV3_small_x1_0',
'MobileNetV3_small_x1_25', 'MobileNetV3_large_x0_35',
'MobileNetV3_large_x0_5', 'MobileNetV3_large_x0_75',
'MobileNetV3_large_x1_0', 'MobileNetV3_large_x1_25'
]
__all__ = ['MobileNetV3']
class MobileNetV3():
def __init__(self, params):
self.scale = params.get("scale", 0.5)
model_name = params.get("model_name", "small")
large_stride = params.get("large_stride", [1, 2, 2, 2])
small_stride = params.get("small_stride", [2, 2, 2, 2])
class MobileNetV3(nn.Layer):
def __init__(self,
in_channels=3,
model_name='small',
scale=0.5,
large_stride=None,
small_stride=None,
**kwargs):
super(MobileNetV3, self).__init__()
if small_stride is None:
small_stride = [2, 2, 2, 2]
if large_stride is None:
large_stride = [1, 2, 2, 2]
assert isinstance(large_stride, list), "large_stride type must " \
"be list but got {}".format(type(large_stride))
"be list but got {}".format(type(large_stride))
assert isinstance(small_stride, list), "small_stride type must " \
"be list but got {}".format(type(small_stride))
"be list but got {}".format(type(small_stride))
assert len(large_stride) == 4, "large_stride length must be " \
"4 but got {}".format(len(large_stride))
"4 but got {}".format(len(large_stride))
assert len(small_stride) == 4, "small_stride length must be " \
"4 but got {}".format(len(small_stride))
"4 but got {}".format(len(small_stride))
self.inplanes = 16
if model_name == "large":
self.cfg = [
cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, False, 'relu', large_stride[0]],
[3, 64, 24, False, 'relu', (large_stride[1], 1)],
......@@ -65,10 +61,9 @@ class MobileNetV3():
[5, 960, 160, True, 'hard_swish', 1],
[5, 960, 160, True, 'hard_swish', 1],
]
self.cls_ch_squeeze = 960
self.cls_ch_expand = 1280
cls_ch_squeeze = 960
elif model_name == "small":
self.cfg = [
cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, True, 'relu', (small_stride[0], 1)],
[3, 72, 24, False, 'relu', (small_stride[1], 1)],
......@@ -82,186 +77,72 @@ class MobileNetV3():
[5, 576, 96, True, 'hard_swish', 1],
[5, 576, 96, True, 'hard_swish', 1],
]
self.cls_ch_squeeze = 576
self.cls_ch_expand = 1280
cls_ch_squeeze = 576
else:
raise NotImplementedError("mode[" + model_name +
"_model] is not implemented!")
supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
assert self.scale in supported_scale, \
"supported scales are {} but input scale is {}".format(supported_scale, self.scale)
def __call__(self, input):
scale = self.scale
inplanes = self.inplanes
cfg = self.cfg
cls_ch_squeeze = self.cls_ch_squeeze
cls_ch_expand = self.cls_ch_expand
#conv1
conv = self.conv_bn_layer(
input,
filter_size=3,
num_filters=self.make_divisible(inplanes * scale),
assert scale in supported_scale, \
"supported scales are {} but input scale is {}".format(supported_scale, scale)
inplanes = 16
# conv1
self.conv1 = ConvBNLayer(
in_channels=in_channels,
out_channels=make_divisible(inplanes * scale),
kernel_size=3,
stride=2,
padding=1,
num_groups=1,
groups=1,
if_act=True,
act='hard_swish',
name='conv1')
i = 0
inplanes = self.make_divisible(inplanes * scale)
for layer_cfg in cfg:
conv = self.residual_unit(
input=conv,
num_in_filter=inplanes,
num_mid_filter=self.make_divisible(scale * layer_cfg[1]),
num_out_filter=self.make_divisible(scale * layer_cfg[2]),
act=layer_cfg[4],
stride=layer_cfg[5],
filter_size=layer_cfg[0],
use_se=layer_cfg[3],
name='conv' + str(i + 2))
inplanes = self.make_divisible(scale * layer_cfg[2])
block_list = []
inplanes = make_divisible(inplanes * scale)
for (k, exp, c, se, nl, s) in cfg:
block_list.append(
ResidualUnit(
in_channels=inplanes,
mid_channels=make_divisible(scale * exp),
out_channels=make_divisible(scale * c),
kernel_size=k,
stride=s,
use_se=se,
act=nl,
name='conv' + str(i + 2)))
inplanes = make_divisible(scale * c)
i += 1
self.blocks = nn.Sequential(*block_list)
conv = self.conv_bn_layer(
input=conv,
filter_size=1,
num_filters=self.make_divisible(scale * cls_ch_squeeze),
self.conv2 = ConvBNLayer(
in_channels=inplanes,
out_channels=make_divisible(scale * cls_ch_squeeze),
kernel_size=1,
stride=1,
padding=0,
num_groups=1,
groups=1,
if_act=True,
act='hard_swish',
name='conv_last')
conv = fluid.layers.pool2d(
input=conv,
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='max')
return conv
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride,
padding,
num_groups=1,
if_act=True,
act=None,
name=None,
use_cudnn=True,
res_last_bn_init=False):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(name=name + '_weights'),
bias_attr=False)
bn_name = name + '_bn'
bn = fluid.layers.batch_norm(
input=conv,
param_attr=ParamAttr(
name=bn_name + "_scale",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0)),
bias_attr=ParamAttr(
name=bn_name + "_offset",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0)),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
if if_act:
if act == 'relu':
bn = fluid.layers.relu(bn)
elif act == 'hard_swish':
bn = fluid.layers.hard_swish(bn)
return bn
def make_divisible(self, v, divisor=8, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
def se_block(self, input, num_out_filter, ratio=4, name=None):
num_mid_filter = num_out_filter // ratio
pool = fluid.layers.pool2d(
input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
conv1 = fluid.layers.conv2d(
input=pool,
filter_size=1,
num_filters=num_mid_filter,
act='relu',
param_attr=ParamAttr(name=name + '_1_weights'),
bias_attr=ParamAttr(name=name + '_1_offset'))
conv2 = fluid.layers.conv2d(
input=conv1,
filter_size=1,
num_filters=num_out_filter,
act='hard_sigmoid',
param_attr=ParamAttr(name=name + '_2_weights'),
bias_attr=ParamAttr(name=name + '_2_offset'))
scale = fluid.layers.elementwise_mul(x=input, y=conv2, axis=0)
return scale
def residual_unit(self,
input,
num_in_filter,
num_mid_filter,
num_out_filter,
stride,
filter_size,
act=None,
use_se=False,
name=None):
conv0 = self.conv_bn_layer(
input=input,
filter_size=1,
num_filters=num_mid_filter,
stride=1,
padding=0,
if_act=True,
act=act,
name=name + '_expand')
conv1 = self.conv_bn_layer(
input=conv0,
filter_size=filter_size,
num_filters=num_mid_filter,
stride=stride,
padding=int((filter_size - 1) // 2),
if_act=True,
act=act,
num_groups=num_mid_filter,
use_cudnn=False,
name=name + '_depthwise')
if use_se:
conv1 = self.se_block(
input=conv1, num_out_filter=num_mid_filter, name=name + '_se')
conv2 = self.conv_bn_layer(
input=conv1,
filter_size=1,
num_filters=num_out_filter,
stride=1,
padding=0,
if_act=False,
name=name + '_linear',
res_last_bn_init=True)
if num_in_filter != num_out_filter or stride != 1:
return conv2
else:
return fluid.layers.elementwise_add(x=input, y=conv2, act=None)
self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
self.out_channels = make_divisible(scale * cls_ch_squeeze)
def forward(self, x):
x = self.conv1(x)
x = self.blocks(x)
x = self.conv2(x)
x = self.pool(x)
return x
if __name__ == '__main__':
    # Smoke test: push a zero image (N=1, C=3, H=32, W=320) through the
    # small MobileNetV3 backbone in dygraph mode and print the output shape.
    import paddle
    paddle.disable_static()
    x = paddle.zeros((1, 3, 32, 320))
    # NOTE(review): paddle.to_variable was the 2.0-beta API; newer Paddle
    # releases use paddle.to_tensor — left unchanged for this code's era.
    x = paddle.to_variable(x)
    net = MobileNetV3(model_name='small', small_stride=[1, 2, 2, 2])
    y = net(x)
    print(y.shape)
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"ResNet", "ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"
]
# Global switch: whether backbone parameters are updated during training.
Trainable = True
# Shared ParamAttr reused by the FPN fusion layers below; "nolr" suggests
# it was once a zero-learning-rate attr — as written it only carries the
# trainable flag. TODO confirm intent.
w_nolr = fluid.ParamAttr(trainable=Trainable)
# Default ImageNet-style training hyper-parameters (input normalization
# stats and a piecewise-decay LR schedule); stored on the backbone as
# self.params.
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
# ResNet backbone with an FPN-style top-down fusion head (static-graph /
# fluid API), used by the SRN recognition model.
# NOTE(review): the original indentation of this block was lost in this
# dump; statements are preserved byte-for-byte, with comments added. In
# particular, whether F.append(conv) sat inside the per-block loop or at
# stage level cannot be recovered from here — confirm against upstream
# before re-indenting.
class ResNet():
def __init__(self, params):
# 'layers' selects the ResNet depth (one of 18/34/50/101/152).
self.layers = params['layers']
# Module-level default training hyper-parameters.
self.params = train_parameters
def __call__(self, input):
layers = self.layers
supported_layers = [18, 34, 50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
# Residual blocks per stage for each depth.
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
# First-block stride per stage (bottleneck path only); the last two
# stages keep spatial resolution.
stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)]
num_filters = [64, 128, 256, 512]
# Stem: a single 7x7 stride-2 conv + BN + ReLU.
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name="conv1")
# F collects feature maps consumed by the top-down fusion below.
F = []
if layers >= 50:
for block in range(len(depth)):
for i in range(depth[block]):
# Deep nets name stage-3 blocks "res4b<i>" to match
# released pretrained weight names.
if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=stride_list[block] if i == 0 else 1,
name=conv_name)
F.append(conv)
else:
for block in range(len(depth)):
for i in range(depth[block]):
conv_name = "res" + str(block + 2) + chr(97 + i)
# Basic-block path halves only the width on stage entry
# (text images are short in height).
if i == 0 and block != 0:
stride = (2, 1)
else:
stride = (1, 1)
conv = self.basic_block(
input=conv,
num_filters=num_filters[block],
stride=stride,
if_first=block == i == 0,
name=conv_name)
F.append(conv)
# Top-down fusion: start from the deepest map and merge the two
# previous entries of F into it.
base = F[-1]
for i in [-2, -3]:
# NOTE(review): names suggest (w, h) but NCHW shape order is
# (batch, channel, height, width) — behavior only relies on the
# equality test, so the naming is harmless.
b, c, w, h = F[i].shape
if (w, h) == base.shape[2:]:
# Same resolution: no upsampling needed (no-op kept as-is).
base = base
else:
# Upsample 2x with a learned deconv, then BN + ReLU.
base = fluid.layers.conv2d_transpose(
input=base,
num_filters=c,
filter_size=4,
stride=2,
padding=1,
act=None,
param_attr=w_nolr,
bias_attr=w_nolr)
base = fluid.layers.batch_norm(
base, act="relu", param_attr=w_nolr, bias_attr=w_nolr)
# Concat along channels, then 1x1 + 3x3 convs to re-mix.
base = fluid.layers.concat([base, F[i]], axis=1)
base = fluid.layers.conv2d(
base,
num_filters=c,
filter_size=1,
param_attr=w_nolr,
bias_attr=w_nolr)
base = fluid.layers.conv2d(
base,
num_filters=c,
filter_size=3,
padding=1,
param_attr=w_nolr,
bias_attr=w_nolr)
base = fluid.layers.batch_norm(
base, act="relu", param_attr=w_nolr, bias_attr=w_nolr)
# Project the fused map to a fixed 512 channels.
base = fluid.layers.conv2d(
base,
num_filters=512,
filter_size=1,
bias_attr=w_nolr,
param_attr=w_nolr)
return base
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
# When stride == (1, 1) the conv is swapped for a 2x2 kernel with
# dilation 2 — presumably to widen the receptive field without
# downsampling; note padding still uses the original filter_size.
# TODO confirm this against the SRN reference implementation.
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=2 if stride == (1, 1) else filter_size,
dilation=2 if stride == (1, 1) else 1,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(
name=name + "_weights", trainable=Trainable),
bias_attr=False,
name=name + '.conv2d.output.1')
if name == "conv1":
bn_name = "bn_" + name
else:
# e.g. "res2a_branch2a" -> "bn2a_branch2a" (pretrained naming).
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
name=bn_name + '.output.1',
param_attr=ParamAttr(
name=bn_name + '_scale', trainable=Trainable),
bias_attr=ParamAttr(
bn_name + '_offset', trainable=Trainable),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance', )
def shortcut(self, input, ch_out, stride, is_first, name):
# 1x1 projection whenever channels/stride change or on the first
# block; identity otherwise. (Style nit: `is_first == True` could be
# just `is_first` — left unchanged.)
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1 or is_first == True:
if stride == (1, 1):
return self.conv_bn_layer(input, ch_out, 1, 1, name=name)
else: #stride == (2,2)
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters, stride, name, if_first):
# 1x1 reduce -> 3x3 (carries stride) -> 1x1 expand (x4) + shortcut.
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_branch2c")
short = self.shortcut(
input,
num_filters * 4,
stride,
is_first=False,
name=name + "_branch1")
return fluid.layers.elementwise_add(
x=short, y=conv2, act='relu', name=name + ".add.output.5")
def basic_block(self, input, num_filters, stride, is_first, name):
# Two 3x3 convs with a residual shortcut (ResNet-18/34 block).
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
stride=stride,
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
act=None,
name=name + "_branch2b")
short = self.shortcut(
input, num_filters, stride, is_first, name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from paddle import nn, ParamAttr
from paddle.nn import functional as F
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ["ResNet"]
__all__ = [
"ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd",
"ResNet152_vd", "ResNet200_vd"
]
class ResNet(nn.Layer):
def __init__(self, in_channels=3, layers=34):
super(ResNet, self).__init__()
supported_layers = {
18: {
'depth': [2, 2, 2, 2],
'block_class': BasicBlock
},
34: {
'depth': [3, 4, 6, 3],
'block_class': BasicBlock
},
50: {
'depth': [3, 4, 6, 3],
'block_class': BottleneckBlock
},
101: {
'depth': [3, 4, 23, 3],
'block_class': BottleneckBlock
},
152: {
'depth': [3, 8, 36, 3],
'block_class': BottleneckBlock
},
200: {
'depth': [3, 12, 48, 3],
'block_class': BottleneckBlock
}
}
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers.keys(), layers)
is_3x3 = True
class ResNet():
def __init__(self, params):
self.layers = params['layers']
self.is_3x3 = True
supported_layers = [18, 34, 50, 101, 152, 200]
assert self.layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, self.layers)
def __call__(self, input):
is_3x3 = self.is_3x3
layers = self.layers
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
elif layers == 200:
depth = [3, 12, 48, 3]
num_filters = [64, 128, 256, 512]
depth = supported_layers[layers]['depth']
block_class = supported_layers[layers]['block_class']
conv = []
if is_3x3 == False:
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=1,
act='relu')
conv.append(
ConvBNLayer(
in_channels=in_channels,
out_channels=64,
kernel_size=7,
stride=1,
act='relu'))
else:
conv = self.conv_bn_layer(
input=input,
num_filters=32,
filter_size=3,
stride=1,
act='relu',
name='conv1_1')
conv = self.conv_bn_layer(
input=conv,
num_filters=32,
filter_size=3,
stride=1,
act='relu',
name='conv1_2')
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv1_3')
conv.append(
ConvBNLayer(
in_channels=in_channels,
out_channels=32,
kernel_size=3,
stride=1,
act='relu',
name='conv1_1'))
conv.append(
ConvBNLayer(
in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
act='relu',
name='conv1_2'))
conv.append(
ConvBNLayer(
in_channels=32,
out_channels=64,
kernel_size=3,
stride=1,
act='relu',
name='conv1_3'))
self.conv1 = nn.Sequential(*conv)
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
self.pool = nn.MaxPool2d(
kernel_size=3,
stride=2,
padding=1, )
if layers >= 50:
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152, 200] and block == 2:
block_list = []
in_ch = 64
for block_index in range(len(depth)):
for i in range(depth[block_index]):
if layers >= 50:
if layers in [101, 152, 200] and block_index == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
conv_name = "res" + str(block_index + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
if i == 0 and block != 0:
stride = (2, 1)
else:
stride = (1, 1)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=stride,
if_first=block == i == 0,
name=conv_name)
else:
for block in range(len(depth)):
for i in range(depth[block]):
conv_name = "res" + str(block + 2) + chr(97 + i)
if i == 0 and block != 0:
stride = (2, 1)
conv_name = "res" + str(block_index +
2) + "b" + str(i)
else:
stride = (1, 1)
conv = self.basic_block(
input=conv,
num_filters=num_filters[block],
conv_name = "res" + str(block_index + 2) + chr(97 + i)
else:
conv_name = "res" + str(block_index + 2) + chr(97 + i)
if i == 0 and block_index != 0:
stride = (2, 1)
else:
stride = (1, 1)
block_list.append(
block_class(
in_channels=in_ch,
out_channels=num_filters[block_index],
stride=stride,
if_first=block == i == 0,
name=conv_name)
if_first=block_index == i == 0,
name=conv_name))
in_ch = block_list[-1].out_channels
self.block_list = nn.Sequential(*block_list)
self.add_sublayer(sublayer=self.block_list, name="block_list")
self.pool_out = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
self.out_channels = in_ch
conv = fluid.layers.pool2d(
input=conv,
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='max')
def forward(self, x):
x = self.conv1(x)
x = self.pool(x)
x = self.block_list(x)
x = self.pool_out(x)
return x
return conv
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
groups=1,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self.conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=(filter_size - 1) // 2,
padding=(kernel_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
weight_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
self.bn = nn.BatchNorm(
num_channels=out_channels,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + "_mean",
moving_variance_name=bn_name + "_variance")
def __call__(self, x):
x = self.conv(x)
x = self.bn(x)
return x
def conv_bn_layer_new(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
pool = fluid.layers.pool2d(
input=input,
pool_size=stride,
pool_stride=stride,
pool_padding=0,
pool_type='avg',
ceil_mode=True)
conv = fluid.layers.conv2d(
input=pool,
num_filters=num_filters,
filter_size=filter_size,
class ConvBNLayerNew(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
groups=1,
act=None,
name=None):
super(ConvBNLayerNew, self).__init__()
self.pool = nn.AvgPool2d(
kernel_size=stride, stride=stride, padding=0, ceil_mode=True)
self.conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=1,
padding=(filter_size - 1) // 2,
padding=(kernel_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
weight_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
self.bn = nn.BatchNorm(
num_channels=out_channels,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + "_mean",
moving_variance_name=bn_name + "_variance")
def __call__(self, x):
x = self.pool(x)
x = self.conv(x)
x = self.bn(x)
return x
class ShortCut(nn.Layer):
def __init__(self, in_channels, out_channels, stride, name, if_first=False):
super(ShortCut, self).__init__()
self.use_conv = True
def shortcut(self, input, ch_out, stride, name, if_first=False):
ch_in = input.shape[1]
if ch_in != ch_out or stride[0] != 1:
if in_channels != out_channels or stride[0] != 1:
if if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
self.conv = ConvBNLayer(
in_channels, out_channels, 1, stride, name=name)
else:
return self.conv_bn_layer_new(
input, ch_out, 1, stride, name=name)
self.conv = ConvBNLayerNew(
in_channels, out_channels, 1, stride, name=name)
elif if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
self.conv = ConvBNLayer(
in_channels, out_channels, 1, stride, name=name)
else:
return input
self.use_conv = False
def bottleneck_block(self, input, num_filters, stride, name, if_first):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
def forward(self, x):
if self.use_conv:
x = self.conv(x)
return x
class BottleneckBlock(nn.Layer):
def __init__(self, in_channels, out_channels, stride, name, if_first):
super(BottleneckBlock, self).__init__()
self.conv0 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
act='relu',
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
self.conv1 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 4,
filter_size=1,
self.conv2 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels * 4,
kernel_size=1,
act=None,
name=name + "_branch2c")
short = self.shortcut(
input,
num_filters * 4,
stride,
self.short = ShortCut(
in_channels=in_channels,
out_channels=out_channels * 4,
stride=stride,
if_first=if_first,
name=name + "_branch1")
self.out_channels = out_channels * 4
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def forward(self, x):
y = self.conv0(x)
y = self.conv1(y)
y = self.conv2(y)
y = y + self.short(x)
y = F.relu(y)
return y
# NOTE(review): diff residue — the removed static-graph `basic_block`
# is interleaved with the new dygraph `BasicBlock`; the `conv0/conv1/short =`
# fragments below belong to the old API, not to this class.
def basic_block(self, input, num_filters, stride, name, if_first):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
class BasicBlock(nn.Layer):
# ResNet basic block: two 3x3 convs (first carries the stride) plus a
# ShortCut branch; ReLU after the residual add.
def __init__(self, in_channels, out_channels, stride, name, if_first):
super(BasicBlock, self).__init__()
self.conv0 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
act='relu',
stride=stride,
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
self.conv1 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
act=None,
name=name + "_branch2b")
short = self.shortcut(
input,
num_filters,
stride,
self.short = ShortCut(
in_channels=in_channels,
out_channels=out_channels,
stride=stride,
if_first=if_first,
name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
self.out_channels = out_channels
def forward(self, x):
y = self.conv0(x)
y = self.conv1(y)
y = y + self.short(x)
return F.relu(y)
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
import math
def get_para_bias_attr(l2_decay, k, name):
    """Build the [weight, bias] ParamAttr pair for a layer.

    Both attrs share one L2Decay(l2_decay) regularizer and a
    Uniform(-stdv, stdv) initializer with stdv = 1 / sqrt(k).
    """
    stdv = 1.0 / math.sqrt(k * 1.0)
    shared = dict(
        regularizer=fluid.regularizer.L2Decay(l2_decay),
        initializer=fluid.initializer.Uniform(-stdv, stdv))
    return [
        fluid.ParamAttr(name=name + "_w_attr", **shared),
        fluid.ParamAttr(name=name + "_b_attr", **shared),
    ]
def conv_bn_layer(input,
                  num_filters,
                  filter_size,
                  stride=1,
                  groups=1,
                  act=None,
                  name=None):
    """Bias-free conv2d followed by batch norm; `act` is fused into the BN op."""
    same_pad = (filter_size - 1) // 2  # "same"-style padding for odd kernels
    conv_out = fluid.layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=same_pad,
        groups=groups,
        act=None,  # activation deferred to batch_norm below
        param_attr=ParamAttr(name=name + "_weights"),
        bias_attr=False,
        name=name + '.conv2d')
    bn_prefix = "bn_" + name
    return fluid.layers.batch_norm(
        input=conv_out,
        act=act,
        name=bn_prefix + '.output',
        param_attr=ParamAttr(name=bn_prefix + '_scale'),
        bias_attr=ParamAttr(bn_prefix + '_offset'),
        moving_mean_name=bn_prefix + '_mean',
        moving_variance_name=bn_prefix + '_variance')
def deconv_bn_layer(input,
                    num_filters,
                    filter_size=4,
                    stride=2,
                    act='relu',
                    name=None):
    """Bias-free conv2d_transpose (2x upsample by default) followed by
    batch norm; `act` is fused into the BN op."""
    deconv_out = fluid.layers.conv2d_transpose(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=1,
        act=None,  # activation deferred to batch_norm below
        param_attr=ParamAttr(name=name + "_weights"),
        bias_attr=False,
        name=name + '.deconv2d')
    bn_prefix = "bn_" + name
    return fluid.layers.batch_norm(
        input=deconv_out,
        act=act,
        name=bn_prefix + '.output',
        param_attr=ParamAttr(name=bn_prefix + '_scale'),
        bias_attr=ParamAttr(bn_prefix + '_offset'),
        moving_mean_name=bn_prefix + '_mean',
        moving_variance_name=bn_prefix + '_variance')
def create_tmp_var(program, name, dtype, shape, lod_level=0):
    """Create an intermediate variable inside `program`'s current block."""
    block = program.current_block()
    return block.create_var(
        name=name, dtype=dtype, shape=shape, lod_level=lod_level)
......@@ -11,3 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['build_head']


def build_head(config):
    """Instantiate a head module from a config dict.

    ``config['name']`` selects the head class; the remaining keys are
    forwarded to its constructor. Raises AssertionError for an
    unsupported name.
    """
    # det head
    from .det_db_head import DBHead
    # rec head
    from .rec_ctc_head import CTC

    # Explicit name -> class map instead of eval(): only whitelisted
    # classes can ever be constructed from config input, and the assert
    # message is a plain string (the original passed an Exception
    # *instance* as the message, which was never raised as an Exception).
    support_dict = {'DBHead': DBHead, 'CTC': CTC}
    module_name = config.pop('name')
    assert module_name in support_dict, \
        'head only support {}'.format(list(support_dict))
    module_class = support_dict[module_name](**config)
    return module_class
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr
import paddle.fluid as fluid
def get_bias_attr(k, name):
    """Bias ParamAttr initialized Uniform(-1/sqrt(k), 1/sqrt(k))."""
    bound = 1.0 / math.sqrt(k * 1.0)
    uniform_init = paddle.nn.initializer.Uniform(-bound, bound)
    return ParamAttr(initializer=uniform_init, name=name + "_b_attr")
# NOTE(review): diff residue — removed static-graph DBHead; its `binarize`
# method is cut off mid-call by the interleaved new `Head` class below.
class DBHead(object):
"""
Differentiable Binarization (DB) for text detection:
see https://arxiv.org/abs/1911.08947
args:
params(dict): super parameters for build DB network
"""
def __init__(self, params):
self.k = params['k']
self.inner_channels = params['inner_channels']
self.C, self.H, self.W = params['image_shape']
print(self.C, self.H, self.W)
# binarize below is truncated: only the opening conv2d call survives here.
def binarize(self, x):
conv1 = fluid.layers.conv2d(
input=x,
num_filters=self.inner_channels // 4,
filter_size=3,
# NOTE(review): diff residue — the new dygraph `Head.__init__` is
# interleaved with fragments of the removed fluid `binarize` method
# (the `conv_bn1/conv2/conv_bn2/conv3 = fluid....` assignments and the
# trailing `out/return` lines do not belong to this constructor).
class Head(nn.Layer):
# Dygraph head: conv+BN, 2x deconv+BN, final 2x deconv to 1 channel;
# `name_list` supplies explicit parameter names (presumably to match
# pretrained static-graph weight names — TODO confirm).
def __init__(self, in_channels, name_list):
super(Head, self).__init__()
self.conv1 = nn.Conv2d(
in_channels=in_channels,
out_channels=in_channels // 4,
kernel_size=3,
padding=1,
param_attr=fluid.initializer.MSRAInitializer(uniform=False),
weight_attr=ParamAttr(name=name_list[0] + '.w_0'),
bias_attr=False)
conv_bn1 = fluid.layers.batch_norm(
input=conv1,
param_attr=fluid.initializer.ConstantInitializer(value=1.0),
bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
act="relu")
conv2 = fluid.layers.conv2d_transpose(
input=conv_bn1,
num_filters=self.inner_channels // 4,
filter_size=2,
self.conv_bn1 = nn.BatchNorm(
num_channels=in_channels // 4,
param_attr=ParamAttr(
name=name_list[1] + '.w_0',
initializer=paddle.nn.initializer.Constant(value=1.0)),
bias_attr=ParamAttr(
name=name_list[1] + '.b_0',
initializer=paddle.nn.initializer.Constant(value=1e-4)),
moving_mean_name=name_list[1] + '.w_1',
moving_variance_name=name_list[1] + '.w_2',
act='relu')
self.conv2 = nn.ConvTranspose2d(
in_channels=in_channels // 4,
out_channels=in_channels // 4,
kernel_size=2,
stride=2,
param_attr=fluid.initializer.MSRAInitializer(uniform=False),
bias_attr=self._get_bias_attr(0.0004, conv_bn1.shape[1], "conv2"),
act=None)
conv_bn2 = fluid.layers.batch_norm(
input=conv2,
param_attr=fluid.initializer.ConstantInitializer(value=1.0),
bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
weight_attr=ParamAttr(
name=name_list[2] + '.w_0',
initializer=paddle.nn.initializer.MSRA(uniform=False)),
bias_attr=get_bias_attr(in_channels // 4, name_list[-1] + "conv2"))
self.conv_bn2 = nn.BatchNorm(
num_channels=in_channels // 4,
param_attr=ParamAttr(
name=name_list[3] + '.w_0',
initializer=paddle.nn.initializer.Constant(value=1.0)),
bias_attr=ParamAttr(
name=name_list[3] + '.b_0',
initializer=paddle.nn.initializer.Constant(value=1e-4)),
moving_mean_name=name_list[3] + '.w_1',
moving_variance_name=name_list[3] + '.w_2',
act="relu")
conv3 = fluid.layers.conv2d_transpose(
input=conv_bn2,
num_filters=1,
filter_size=2,
self.conv3 = nn.ConvTranspose2d(
in_channels=in_channels // 4,
out_channels=1,
kernel_size=2,
stride=2,
param_attr=fluid.initializer.MSRAInitializer(uniform=False),
bias_attr=self._get_bias_attr(0.0004, conv_bn2.shape[1], "conv3"),
act=None)
out = fluid.layers.sigmoid(conv3)
return out
weight_attr=ParamAttr(
name=name_list[4] + '.w_0',
initializer=paddle.nn.initializer.MSRA(uniform=False)),
bias_attr=get_bias_attr(in_channels // 4, name_list[-1] + "conv3"),
)
def thresh(self, x):
    """Threshold-map branch (fluid static-graph API): conv+BN,
    2x deconv+BN, then a 1-channel 2x deconv, finished with sigmoid."""
    y = fluid.layers.conv2d(
        input=x,
        num_filters=self.inner_channels // 4,
        filter_size=3,
        padding=1,
        param_attr=fluid.initializer.MSRAInitializer(uniform=False),
        bias_attr=False)
    y = fluid.layers.batch_norm(
        input=y,
        param_attr=fluid.initializer.ConstantInitializer(value=1.0),
        bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
        act="relu")
    # bias stdv keyed on the channel count of the preceding BN output
    y = fluid.layers.conv2d_transpose(
        input=y,
        num_filters=self.inner_channels // 4,
        filter_size=2,
        stride=2,
        param_attr=fluid.initializer.MSRAInitializer(uniform=False),
        bias_attr=self._get_bias_attr(0.0004, y.shape[1], "conv2"),
        act=None)
    y = fluid.layers.batch_norm(
        input=y,
        param_attr=fluid.initializer.ConstantInitializer(value=1.0),
        bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
        act="relu")
    y = fluid.layers.conv2d_transpose(
        input=y,
        num_filters=1,
        filter_size=2,
        stride=2,
        param_attr=fluid.initializer.MSRAInitializer(uniform=False),
        bias_attr=self._get_bias_attr(0.0004, y.shape[1], "conv3"),
        act=None)
    return fluid.layers.sigmoid(y)
def forward(self, x):
    """Run the head stack in order, then squash to (0, 1) with sigmoid."""
    for stage in (self.conv1, self.conv_bn1, self.conv2, self.conv_bn2,
                  self.conv3):
        x = stage(x)
    return F.sigmoid(x)
def _get_bias_attr(self, l2_decay, k, name, gradient_clip=None):
    """Bias ParamAttr with L2Decay(l2_decay) regularization and
    Uniform(-1/sqrt(k), 1/sqrt(k)) initialization.

    `gradient_clip` is accepted but unused (kept for interface
    compatibility).
    """
    bound = 1.0 / math.sqrt(k * 1.0)
    return fluid.ParamAttr(
        regularizer=fluid.regularizer.L2Decay(l2_decay),
        initializer=fluid.initializer.Uniform(-bound, bound),
        name=name + "_b_attr")
def step_function(self, x, y):
    """Differentiable binarization: 1 / (1 + exp(-k * (x - y)))."""
    scaled_gap = -self.k * (x - y)
    return fluid.layers.reciprocal(1 + fluid.layers.exp(scaled_gap))
# NOTE(review): diff residue — the new dygraph `DBHead(nn.Layer)` header is
# interleaved with the removed fluid `__call__` (the in5/in4/in3/in2 FPN
# lateral convs); `__call__` is not part of the new class.
class DBHead(nn.Layer):
"""
Differentiable Binarization (DB) for text detection:
see https://arxiv.org/abs/1911.08947
args:
params(dict): super parameters for build DB network
"""
def __call__(self, conv_features, mode="train"):
# old static-graph path: 1x1 lateral convs on the 4 backbone stages
c2, c3, c4, c5 = conv_features
param_attr = fluid.initializer.MSRAInitializer(uniform=False)
in5 = fluid.layers.conv2d(
input=c5,
num_filters=self.inner_channels,
filter_size=1,
param_attr=param_attr,
bias_attr=False)
in4 = fluid.layers.conv2d(
input=c4,
num_filters=self.inner_channels,
filter_size=1,
param_attr=param_attr,
bias_attr=False)
in3 = fluid.layers.conv2d(
input=c3,
num_filters=self.inner_channels,
filter_size=1,
param_attr=param_attr,
bias_attr=False)
in2 = fluid.layers.conv2d(
input=c2,
num_filters=self.inner_channels,
filter_size=1,
param_attr=param_attr,
bias_attr=False)
def __init__(self, in_channels, k=50, **kwargs):
    """DB head: one `Head` predicting shrink maps, one predicting
    threshold maps; `k` steers the step-function steepness.

    The explicit layer-name lists pin parameter names (presumably to
    stay compatible with released static-graph weights — TODO confirm).
    """
    super(DBHead, self).__init__()
    self.k = k
    self.binarize = Head(in_channels, [
        'conv2d_56', 'batch_norm_47', 'conv2d_transpose_0', 'batch_norm_48',
        'conv2d_transpose_1', 'binarize'
    ])
    self.thresh = Head(in_channels, [
        'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2', 'batch_norm_50',
        'conv2d_transpose_3', 'thresh'
    ])
# NOTE(review): diff residue — removed fluid FPN top-down fusion from the
# old DBHead.__call__ (upsample-by-2 then add the next lateral feature).
out4 = fluid.layers.elementwise_add(
x=fluid.layers.resize_nearest(
input=in5, scale=2), y=in4)  # 1/16
out3 = fluid.layers.elementwise_add(
x=fluid.layers.resize_nearest(
input=out4, scale=2), y=in3)  # 1/8
out2 = fluid.layers.elementwise_add(
x=fluid.layers.resize_nearest(
input=out3, scale=2), y=in2)  # 1/4
def step_function(self, x, y):
    """Differentiable binarization: 1 / (1 + exp(-k * (x - y)))."""
    logits = self.k * (x - y)
    return paddle.reciprocal(1 + paddle.exp(-logits))
# NOTE(review): diff residue — removed fluid code from the old
# DBHead.__call__: 3x3 smoothing convs on each fused level, resized back to
# the 1/4 scale before concatenation.
p5 = fluid.layers.conv2d(
input=in5,
num_filters=self.inner_channels // 4,
filter_size=3,
padding=1,
param_attr=param_attr,
bias_attr=False)
p5 = fluid.layers.resize_nearest(input=p5, scale=8)
p4 = fluid.layers.conv2d(
input=out4,
num_filters=self.inner_channels // 4,
filter_size=3,
padding=1,
param_attr=param_attr,
bias_attr=False)
p4 = fluid.layers.resize_nearest(input=p4, scale=4)
p3 = fluid.layers.conv2d(
input=out3,
num_filters=self.inner_channels // 4,
filter_size=3,
padding=1,
param_attr=param_attr,
bias_attr=False)
p3 = fluid.layers.resize_nearest(input=p3, scale=2)
p2 = fluid.layers.conv2d(
input=out2,
num_filters=self.inner_channels // 4,
filter_size=3,
padding=1,
param_attr=param_attr,
bias_attr=False)
# NOTE(review): diff residue — the new dygraph DBHead.forward is
# interleaved with the tail of the removed fluid __call__ (the `fuse`,
# `mode != "train"` and `predicts` lines belong to the old API).
def forward(self, x):
# inference needs only the shrink maps
shrink_maps = self.binarize(x)
if not self.training:
return shrink_maps
fuse = fluid.layers.concat(input=[p5, p4, p3, p2], axis=1)
shrink_maps = self.binarize(fuse)
if mode != "train":
return {"maps": shrink_maps}
threshold_maps = self.thresh(fuse)
threshold_maps = self.thresh(x)
# binary_maps is the differentiable step of shrink vs. threshold
binary_maps = self.step_function(shrink_maps, threshold_maps)
y = fluid.layers.concat(
input=[shrink_maps, threshold_maps, binary_maps], axis=1)
predicts = {}
predicts['maps'] = y
return predicts
y = paddle.concat([shrink_maps, threshold_maps, binary_maps], axis=1)
return y
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from ..common_functions import conv_bn_layer, deconv_bn_layer
from collections import OrderedDict
class EASTHead(object):
    """
    EAST: An Efficient and Accurate Scene Text Detector
    see arxiv: https://arxiv.org/abs/1704.03155
    args:
        params(dict): the super parameters for network build
    """

    def __init__(self, params):
        # "large" widens the fusion / head channel counts
        self.model_name = params['model_name']

    def unet_fusion(self, inputs):
        """U-Net style top-down fusion of backbone features.

        `inputs` is reversed so the deepest feature comes first; each step
        concatenates the upsampled previous output with the next feature.
        """
        feats = inputs[::-1]
        width = 128 if self.model_name == "large" else 64
        num_outputs = [width, width, width, width]
        g = [None, None, None, None]
        h = [None, None, None, None]
        for i in range(4):
            merged = feats[i] if i == 0 else fluid.layers.concat(
                [g[i - 1], feats[i]], axis=1)
            h[i] = conv_bn_layer(
                input=merged,
                num_filters=num_outputs[i],
                filter_size=3,
                stride=1,
                act='relu',
                name="unet_h_%d" % (i))
            if i <= 2:
                # upsample via transposed conv (can be replaced with unpool)
                g[i] = deconv_bn_layer(
                    input=h[i],
                    num_filters=num_outputs[i],
                    name="unet_g_%d" % (i))
            else:
                g[i] = conv_bn_layer(
                    input=h[i],
                    num_filters=num_outputs[i],
                    filter_size=3,
                    stride=1,
                    act='relu',
                    name="unet_g_%d" % (i))
        return g[3]

    def detector_header(self, f_common):
        """Predict the score map (sigmoid) and 8-channel geometry map."""
        if self.model_name == "large":
            num_outputs = [128, 64, 1, 8]
        else:
            num_outputs = [64, 32, 1, 8]
        f_det = f_common
        # two shared 3x3 relu stages feed both output branches
        for idx, stage_name in ((0, "det_head1"), (1, "det_head2")):
            f_det = conv_bn_layer(
                input=f_det,
                num_filters=num_outputs[idx],
                filter_size=3,
                stride=1,
                act='relu',
                name=stage_name)
        f_score = conv_bn_layer(
            input=f_det,
            num_filters=num_outputs[2],
            filter_size=1,
            stride=1,
            act=None,
            name="f_score")
        f_score = fluid.layers.sigmoid(f_score)
        f_geo = conv_bn_layer(
            input=f_det,
            num_filters=num_outputs[3],
            filter_size=1,
            stride=1,
            act=None,
            name="f_geo")
        # map sigmoid output from (0, 1) to (-800, 800)
        f_geo = (fluid.layers.sigmoid(f_geo) - 0.5) * 2 * 800
        return f_score, f_geo

    def __call__(self, inputs):
        """Fuse backbone features and return {'f_score', 'f_geo'}."""
        f_common = self.unet_fusion(inputs)
        f_score, f_geo = self.detector_header(f_common)
        predicts = OrderedDict()
        predicts['f_score'] = f_score
        predicts['f_geo'] = f_geo
        return predicts
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from ..common_functions import conv_bn_layer, deconv_bn_layer
from collections import OrderedDict
class SASTHead(object):
    """
    SAST:
    see arxiv: https://arxiv.org/abs/1908.05498
    args:
        params(dict): the super parameters for network build
    """

    def __init__(self, params):
        # `with_cab` toggles the cross-attention block applied in __call__
        self.model_name = params['model_name']
        self.with_cab = params['with_cab']

    def FPN_Up_Fusion(self, blocks):
        """
        blocks{}: contain block_2, block_3, block_4, block_5, block_6, block_7 with
        1/4, 1/8, 1/16, 1/32, 1/64, 1/128 resolution.
        """
        # deepest feature first; fused top-down with deconv upsampling
        f = [blocks['block_6'], blocks['block_5'], blocks['block_4'], blocks['block_3'], blocks['block_2']]
        num_outputs = [256, 256, 192, 192, 128]
        g = [None, None, None, None, None]
        h = [None, None, None, None, None]
        # 1x1 lateral convs normalize channel counts
        for i in range(5):
            h[i] = conv_bn_layer(input=f[i], num_filters=num_outputs[i],
                                 filter_size=1, stride=1, act=None, name='fpn_up_h'+str(i))
        for i in range(4):
            if i == 0:
                g[i] = deconv_bn_layer(input=h[i], num_filters=num_outputs[i + 1], act=None, name='fpn_up_g0')
                #print("g[{}] shape: {}".format(i, g[i].shape))
            else:
                # add previous level, relu, refine, then upsample to the next
                g[i] = fluid.layers.elementwise_add(x=g[i - 1], y=h[i])
                g[i] = fluid.layers.relu(g[i])
                #g[i] = conv_bn_layer(input=g[i], num_filters=num_outputs[i],
                #                     filter_size=1, stride=1, act='relu')
                g[i] = conv_bn_layer(input=g[i], num_filters=num_outputs[i],
                                     filter_size=3, stride=1, act='relu', name='fpn_up_g%d_1'%i)
                g[i] = deconv_bn_layer(input=g[i], num_filters=num_outputs[i + 1], act=None, name='fpn_up_g%d_2'%i)
                #print("g[{}] shape: {}".format(i, g[i].shape))
        # final fusion at the shallowest level
        g[4] = fluid.layers.elementwise_add(x=g[3], y=h[4])
        g[4] = fluid.layers.relu(g[4])
        g[4] = conv_bn_layer(input=g[4], num_filters=num_outputs[4],
                             filter_size=3, stride=1, act='relu', name='fpn_up_fusion_1')
        g[4] = conv_bn_layer(input=g[4], num_filters=num_outputs[4],
                             filter_size=1, stride=1, act=None, name='fpn_up_fusion_2')
        return g[4]

    def FPN_Down_Fusion(self, blocks):
        """
        blocks{}: contain block_2, block_3, block_4, block_5, block_6, block_7 with
        1/4, 1/8, 1/16, 1/32, 1/64, 1/128 resolution.
        """
        # shallow features fused bottom-up with stride-2 convs
        f = [blocks['block_0'], blocks['block_1'], blocks['block_2']]
        num_outputs = [32, 64, 128]
        g = [None, None, None]
        h = [None, None, None]
        for i in range(3):
            h[i] = conv_bn_layer(input=f[i], num_filters=num_outputs[i],
                                 filter_size=3, stride=1, act=None, name='fpn_down_h'+str(i))
        for i in range(2):
            if i == 0:
                g[i] = conv_bn_layer(input=h[i], num_filters=num_outputs[i+1], filter_size=3, stride=2, act=None, name='fpn_down_g0')
            else:
                g[i] = fluid.layers.elementwise_add(x=g[i - 1], y=h[i])
                g[i] = fluid.layers.relu(g[i])
                g[i] = conv_bn_layer(input=g[i], num_filters=num_outputs[i], filter_size=3, stride=1, act='relu', name='fpn_down_g%d_1'%i)
                g[i] = conv_bn_layer(input=g[i], num_filters=num_outputs[i+1], filter_size=3, stride=2, act=None, name='fpn_down_g%d_2'%i)
            # print("g[{}] shape: {}".format(i, g[i].shape))
        g[2] = fluid.layers.elementwise_add(x=g[1], y=h[2])
        g[2] = fluid.layers.relu(g[2])
        g[2] = conv_bn_layer(input=g[2], num_filters=num_outputs[2],
                             filter_size=3, stride=1, act='relu', name='fpn_down_fusion_1')
        g[2] = conv_bn_layer(input=g[2], num_filters=num_outputs[2],
                             filter_size=1, stride=1, act=None, name='fpn_down_fusion_2')
        return g[2]

    def SAST_Header1(self, f_common):
        """Detector header."""
        #f_score
        # 1-channel text score map, sigmoid-activated
        f_score = conv_bn_layer(input=f_common, num_filters=64, filter_size=1, stride=1, act='relu', name='f_score1')
        f_score = conv_bn_layer(input=f_score, num_filters=64, filter_size=3, stride=1, act='relu', name='f_score2')
        f_score = conv_bn_layer(input=f_score, num_filters=128, filter_size=1, stride=1, act='relu', name='f_score3')
        f_score = conv_bn_layer(input=f_score, num_filters=1, filter_size=3, stride=1, name='f_score4')
        f_score = fluid.layers.sigmoid(f_score)
        # print("f_score shape: {}".format(f_score.shape))
        #f_boder
        # 4-channel border offset map
        f_border = conv_bn_layer(input=f_common, num_filters=64, filter_size=1, stride=1, act='relu', name='f_border1')
        f_border = conv_bn_layer(input=f_border, num_filters=64, filter_size=3, stride=1, act='relu', name='f_border2')
        f_border = conv_bn_layer(input=f_border, num_filters=128, filter_size=1, stride=1, act='relu', name='f_border3')
        f_border = conv_bn_layer(input=f_border, num_filters=4, filter_size=3, stride=1, name='f_border4')
        # print("f_border shape: {}".format(f_border.shape))
        return f_score, f_border

    def SAST_Header2(self, f_common):
        """Detector header."""
        #f_tvo
        # 8-channel vertex-offset map
        f_tvo = conv_bn_layer(input=f_common, num_filters=64, filter_size=1, stride=1, act='relu', name='f_tvo1')
        f_tvo = conv_bn_layer(input=f_tvo, num_filters=64, filter_size=3, stride=1, act='relu', name='f_tvo2')
        f_tvo = conv_bn_layer(input=f_tvo, num_filters=128, filter_size=1, stride=1, act='relu', name='f_tvo3')
        f_tvo = conv_bn_layer(input=f_tvo, num_filters=8, filter_size=3, stride=1, name='f_tvo4')
        # print("f_tvo shape: {}".format(f_tvo.shape))
        #f_tco
        # 2-channel center-offset map
        f_tco = conv_bn_layer(input=f_common, num_filters=64, filter_size=1, stride=1, act='relu', name='f_tco1')
        f_tco = conv_bn_layer(input=f_tco, num_filters=64, filter_size=3, stride=1, act='relu', name='f_tco2')
        f_tco = conv_bn_layer(input=f_tco, num_filters=128, filter_size=1, stride=1, act='relu', name='f_tco3')
        f_tco = conv_bn_layer(input=f_tco, num_filters=2, filter_size=3, stride=1, name='f_tco4')
        # print("f_tco shape: {}".format(f_tco.shape))
        return f_tvo, f_tco

    def cross_attention(self, f_common):
        """
        """
        # Self-attention run separately along rows (horizontal) and columns
        # (vertical) with shared theta/phi/g projections, then concatenated.
        f_shape = fluid.layers.shape(f_common)
        f_theta = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, act='relu', name='f_theta')
        f_phi = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, act='relu', name='f_phi')
        f_g = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, act='relu', name='f_g')
        ### horizon
        fh_theta = f_theta
        fh_phi = f_phi
        fh_g = f_g
        #flatten
        # NCHW -> NHWC, then each row becomes one attention sequence
        fh_theta = fluid.layers.transpose(fh_theta, [0, 2, 3, 1])
        fh_theta = fluid.layers.reshape(fh_theta, [f_shape[0] * f_shape[2], f_shape[3], 128])
        fh_phi = fluid.layers.transpose(fh_phi, [0, 2, 3, 1])
        fh_phi = fluid.layers.reshape(fh_phi, [f_shape[0] * f_shape[2], f_shape[3], 128])
        fh_g = fluid.layers.transpose(fh_g, [0, 2, 3, 1])
        fh_g = fluid.layers.reshape(fh_g, [f_shape[0] * f_shape[2], f_shape[3], 128])
        #correlation
        fh_attn = fluid.layers.matmul(fh_theta, fluid.layers.transpose(fh_phi, [0, 2, 1]))
        #scale
        fh_attn = fh_attn / (128 ** 0.5)
        fh_attn = fluid.layers.softmax(fh_attn)
        #weighted sum
        fh_weight = fluid.layers.matmul(fh_attn, fh_g)
        fh_weight = fluid.layers.reshape(fh_weight, [f_shape[0], f_shape[2], f_shape[3], 128])
        # print("fh_weight: {}".format(fh_weight.shape))
        fh_weight = fluid.layers.transpose(fh_weight, [0, 3, 1, 2])
        fh_weight = conv_bn_layer(input=fh_weight, num_filters=128, filter_size=1, stride=1, name='fh_weight')
        #short cut
        fh_sc = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, name='fh_sc')
        f_h = fluid.layers.relu(fh_weight + fh_sc)
        ######
        #vertical
        # swap H and W so the same row-attention machinery runs over columns
        fv_theta = fluid.layers.transpose(f_theta, [0, 1, 3, 2])
        fv_phi = fluid.layers.transpose(f_phi, [0, 1, 3, 2])
        fv_g = fluid.layers.transpose(f_g, [0, 1, 3, 2])
        #flatten
        fv_theta = fluid.layers.transpose(fv_theta, [0, 2, 3, 1])
        fv_theta = fluid.layers.reshape(fv_theta, [f_shape[0] * f_shape[3], f_shape[2], 128])
        fv_phi = fluid.layers.transpose(fv_phi, [0, 2, 3, 1])
        fv_phi = fluid.layers.reshape(fv_phi, [f_shape[0] * f_shape[3], f_shape[2], 128])
        fv_g = fluid.layers.transpose(fv_g, [0, 2, 3, 1])
        fv_g = fluid.layers.reshape(fv_g, [f_shape[0] * f_shape[3], f_shape[2], 128])
        #correlation
        fv_attn = fluid.layers.matmul(fv_theta, fluid.layers.transpose(fv_phi, [0, 2, 1]))
        #scale
        fv_attn = fv_attn / (128 ** 0.5)
        fv_attn = fluid.layers.softmax(fv_attn)
        #weighted sum
        fv_weight = fluid.layers.matmul(fv_attn, fv_g)
        fv_weight = fluid.layers.reshape(fv_weight, [f_shape[0], f_shape[3], f_shape[2], 128])
        # print("fv_weight: {}".format(fv_weight.shape))
        fv_weight = fluid.layers.transpose(fv_weight, [0, 3, 2, 1])
        fv_weight = conv_bn_layer(input=fv_weight, num_filters=128, filter_size=1, stride=1, name='fv_weight')
        #short cut
        fv_sc = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, name='fv_sc')
        f_v = fluid.layers.relu(fv_weight + fv_sc)
        ######
        f_attn = fluid.layers.concat([f_h, f_v], axis=1)
        f_attn = conv_bn_layer(input=f_attn, num_filters=128, filter_size=1, stride=1, act='relu', name='f_attn')
        return f_attn

    def __call__(self, blocks, with_cab=False):
        # for k, v in blocks.items():
        #     print(k, v.shape)
        # NOTE(review): the `with_cab` parameter is shadowed by self.with_cab
        # below — the argument has no effect; verify intent.
        #down fpn
        f_down = self.FPN_Down_Fusion(blocks)
        # print("f_down shape: {}".format(f_down.shape))
        #up fpn
        f_up = self.FPN_Up_Fusion(blocks)
        # print("f_up shape: {}".format(f_up.shape))
        #fusion
        f_common = fluid.layers.elementwise_add(x=f_down, y=f_up)
        f_common = fluid.layers.relu(f_common)
        # print("f_common: {}".format(f_common.shape))
        if self.with_cab:
            # print('enhence f_common with CAB.')
            f_common = self.cross_attention(f_common)
        f_score, f_border= self.SAST_Header1(f_common)
        f_tvo, f_tco = self.SAST_Header2(f_common)
        predicts = OrderedDict()
        predicts['f_score'] = f_score
        predicts['f_border'] = f_border
        predicts['f_tvo'] = f_tvo
        predicts['f_tco'] = f_tco
        return predicts
\ No newline at end of file
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from .rec_seq_encoder import SequenceEncoder
import numpy as np
class AttentionPredict(object):
def __init__(self, params):
super(AttentionPredict, self).__init__()
self.char_num = params['char_num']
self.encoder = SequenceEncoder(params)
self.decoder_size = params['Attention']['decoder_size']
self.word_vector_dim = params['Attention']['word_vector_dim']
self.encoder_type = params['encoder_type']
self.max_length = params['max_text_length']
def simple_attention(self, encoder_vec, encoder_proj, decoder_state,
decoder_size):
decoder_state_proj = layers.fc(input=decoder_state,
size=decoder_size,
bias_attr=False,
name="decoder_state_proj_fc")
decoder_state_expand = layers.sequence_expand(
x=decoder_state_proj, y=encoder_proj)
concated = layers.elementwise_add(encoder_proj, decoder_state_expand)
concated = layers.tanh(x=concated)
attention_weights = layers.fc(input=concated,
size=1,
act=None,
bias_attr=False,
name="attention_weights_fc")
attention_weights = layers.sequence_softmax(input=attention_weights)
weigths_reshape = layers.reshape(x=attention_weights, shape=[-1])
scaled = layers.elementwise_mul(
x=encoder_vec, y=weigths_reshape, axis=0)
context = layers.sequence_pool(input=scaled, pool_type='sum')
return context
def gru_decoder_with_attention(self, target_embedding, encoder_vec,
encoder_proj, decoder_boot, decoder_size,
char_num):
rnn = layers.DynamicRNN()
with rnn.block():
current_word = rnn.step_input(target_embedding)
encoder_vec = rnn.static_input(encoder_vec)
encoder_proj = rnn.static_input(encoder_proj)
hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
context = self.simple_attention(encoder_vec, encoder_proj,
hidden_mem, decoder_size)
fc_1 = layers.fc(input=context,
size=decoder_size * 3,
bias_attr=False,
name="rnn_fc1")
fc_2 = layers.fc(input=current_word,
size=decoder_size * 3,
bias_attr=False,
name="rnn_fc2")
decoder_inputs = fc_1 + fc_2
h, _, _ = layers.gru_unit(
input=decoder_inputs, hidden=hidden_mem, size=decoder_size * 3)
rnn.update_memory(hidden_mem, h)
out = layers.fc(input=h,
size=char_num,
bias_attr=True,
act='softmax',
name="rnn_out_fc")
rnn.output(out)
return rnn()
def gru_attention_infer(self, decoder_boot, max_length, char_num,
word_vector_dim, encoded_vector, encoded_proj,
decoder_size):
init_state = decoder_boot
beam_size = 1
array_len = layers.fill_constant(
shape=[1], dtype='int64', value=max_length)
counter = layers.zeros(shape=[1], dtype='int64', force_cpu=True)
# fill the first element with init_state
state_array = layers.create_array('float32')
layers.array_write(init_state, array=state_array, i=counter)
# ids, scores as memory
ids_array = layers.create_array('int64')
scores_array = layers.create_array('float32')
rois_shape = layers.shape(init_state)
batch_size = layers.slice(
rois_shape, axes=[0], starts=[0], ends=[1]) + 1
lod_level = layers.range(
start=0, end=batch_size, step=1, dtype=batch_size.dtype)
init_ids = layers.fill_constant_batch_size_like(
input=init_state, shape=[-1, 1], value=0, dtype='int64')
init_ids = layers.lod_reset(init_ids, lod_level)
init_ids = layers.lod_append(init_ids, lod_level)
init_scores = layers.fill_constant_batch_size_like(
input=init_state, shape=[-1, 1], value=1, dtype='float32')
init_scores = layers.lod_reset(init_scores, init_ids)
layers.array_write(init_ids, array=ids_array, i=counter)
layers.array_write(init_scores, array=scores_array, i=counter)
full_ids = fluid.layers.fill_constant_batch_size_like(
input=init_state, shape=[-1, 1], dtype='int64', value=1)
full_scores = fluid.layers.fill_constant_batch_size_like(
input=init_state, shape=[-1, 1], dtype='float32', value=1)
cond = layers.less_than(x=counter, y=array_len)
while_op = layers.While(cond=cond)
with while_op.block():
pre_ids = layers.array_read(array=ids_array, i=counter)
pre_state = layers.array_read(array=state_array, i=counter)
pre_score = layers.array_read(array=scores_array, i=counter)
pre_ids_emb = layers.embedding(
input=pre_ids,
size=[char_num, word_vector_dim],
dtype='float32')
context = self.simple_attention(encoded_vector, encoded_proj,
pre_state, decoder_size)
# expand the recursive_sequence_lengths of pre_state
# to be the same with pre_score
pre_state_expanded = layers.sequence_expand(pre_state, pre_score)
context_expanded = layers.sequence_expand(context, pre_score)
fc_1 = layers.fc(input=context_expanded,
size=decoder_size * 3,
bias_attr=False,
name="rnn_fc1")
fc_2 = layers.fc(input=pre_ids_emb,
size=decoder_size * 3,
bias_attr=False,
name="rnn_fc2")
decoder_inputs = fc_1 + fc_2
current_state, _, _ = layers.gru_unit(
input=decoder_inputs,
hidden=pre_state_expanded,
size=decoder_size * 3)
current_state_with_lod = layers.lod_reset(
x=current_state, y=pre_score)
# use score to do beam search
current_score = layers.fc(input=current_state_with_lod,
size=char_num,
bias_attr=True,
act='softmax',
name="rnn_out_fc")
topk_scores, topk_indices = layers.topk(current_score, k=beam_size)
new_ids = fluid.layers.concat([full_ids, topk_indices], axis=1)
fluid.layers.assign(new_ids, full_ids)
new_scores = fluid.layers.concat([full_scores, topk_scores], axis=1)
fluid.layers.assign(new_scores, full_scores)
layers.increment(x=counter, value=1, in_place=True)
# update the memories
layers.array_write(current_state, array=state_array, i=counter)
layers.array_write(topk_indices, array=ids_array, i=counter)
layers.array_write(topk_scores, array=scores_array, i=counter)
# update the break condition:
# up to the max length or all candidates of
# source sentences have ended.
length_cond = layers.less_than(x=counter, y=array_len)
finish_cond = layers.logical_not(layers.is_empty(x=topk_indices))
layers.logical_and(x=length_cond, y=finish_cond, out=cond)
return full_ids, full_scores
    def __call__(self, inputs, labels=None, mode=None):
        """Build the attention-head graph (legacy static graph).

        Args:
            inputs: feature map fed to the sequence encoder.
            labels: dict with 'label_in'/'label_out' LoD tensors; required
                only when mode == "train".
            mode: "train" uses teacher forcing via gru_decoder_with_attention;
                any other value runs beam-search inference.

        Returns:
            dict with 'predict' (per-step class probabilities) and
            'decoded_out' (decoded character ids).
        """
        encoder_features = self.encoder(inputs)
        char_num = self.char_num
        word_vector_dim = self.word_vector_dim
        decoder_size = self.decoder_size
        if self.encoder_type == "reshape":
            # Reshape-only encoder returns a single sequence tensor.
            encoder_input = encoder_features
            encoded_vector = encoder_features
        else:
            # RNN encoder returns [forward, backward]; attend over the concat
            # but boot the decoder from the backward direction only.
            encoder_input = encoder_features[1]
            encoded_vector = layers.concat(encoder_features, axis=1)
        encoded_proj = layers.fc(input=encoded_vector,
                                 size=decoder_size,
                                 bias_attr=False,
                                 name="encoded_proj_fc")
        # First element of the (reversed) sequence initializes decoder state.
        backward_first = layers.sequence_pool(
            input=encoder_input, pool_type='first')
        decoder_boot = layers.fc(input=backward_first,
                                 size=decoder_size,
                                 bias_attr=False,
                                 act="relu",
                                 name='decoder_boot')
        if mode == "train":
            label_in = labels['label_in']
            label_out = labels['label_out']
            label_in = layers.cast(x=label_in, dtype='int64')
            trg_embedding = layers.embedding(
                input=label_in,
                size=[char_num, word_vector_dim],
                dtype='float32')
            predict = self.gru_decoder_with_attention(
                trg_embedding, encoded_vector, encoded_proj, decoder_boot,
                decoder_size, char_num)
            # Greedy decode (top-1) for monitoring during training.
            _, decoded_out = layers.topk(input=predict, k=1)
            decoded_out = layers.lod_reset(decoded_out, y=label_out)
            predicts = {'predict':predict, 'decoded_out':decoded_out}
        else:
            ids, predict = self.gru_attention_infer(
                decoder_boot, self.max_length, char_num, word_vector_dim,
                encoded_vector, encoded_proj, decoder_size)
            predicts = {'predict':predict, 'decoded_out':ids}
        return predicts
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
......@@ -19,34 +19,33 @@ from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from .rec_seq_encoder import SequenceEncoder
from ..common_functions import get_para_bias_attr
import numpy as np
class CTCPredict(object):
    """CTC prediction head (legacy static graph).

    Encodes the input feature map with a SequenceEncoder, projects to
    ``char_num + 1`` classes (the extra class is the CTC blank) with a single
    FC layer, and greedily decodes the best path.

    Args:
        params (dict): requires 'char_num' and 'encoder_type'; optional
            'fc_decay' — L2 decay of the FC layer (default 0.0004).
    """

    def __init__(self, params):
        super(CTCPredict, self).__init__()
        self.char_num = params['char_num']
        self.encoder = SequenceEncoder(params)
        self.encoder_type = params['encoder_type']
        self.fc_decay = params.get("fc_decay", 0.0004)

    def __call__(self, inputs, labels=None, mode=None):
        """Return {'predict': class probabilities, 'decoded_out': greedy path}."""
        encoder_features = self.encoder(inputs)
        # The RNN encoder returns a list of per-direction features; merge them.
        if self.encoder_type != "reshape":
            encoder_features = fluid.layers.concat(encoder_features, axis=1)
        name = "ctc_fc"
        para_attr, bias_attr = get_para_bias_attr(
            l2_decay=self.fc_decay, k=encoder_features.shape[1], name=name)
        predict = fluid.layers.fc(input=encoder_features,
                                  size=self.char_num + 1,
                                  param_attr=para_attr,
                                  bias_attr=bias_attr,
                                  name=name)
        decoded_out = fluid.layers.ctc_greedy_decoder(
            input=predict, blank=self.char_num)
        predicts = {'predict': predict, 'decoded_out': decoded_out}
        # BUGFIX: the result dict was built but never returned, so callers
        # always received None.
        return predicts
from paddle import ParamAttr, nn
def get_para_bias_attr(l2_decay, k, name):
    """Build the [weight_attr, bias_attr] pair for an FC layer.

    Both attrs share one L2 regularizer and one Uniform(-stdv, stdv)
    initializer with stdv = 1/sqrt(k) (fan-in scaling); only their
    parameter names differ.
    """
    bound = 1.0 / math.sqrt(k * 1.0)
    init = nn.initializer.Uniform(-bound, bound)
    reg = paddle.fluid.regularizer.L2Decay(l2_decay)

    def _attr(suffix):
        return ParamAttr(
            regularizer=reg, initializer=init, name=name + suffix)

    return [_attr("_w_attr"), _attr("_b_attr")]
class CTC(nn.Layer):
    """CTC head: one FC layer projecting sequence features to class logits.

    Args:
        in_channels (int): feature dimension of the encoder output.
        out_channels (int): number of classes (including the CTC blank).
        fc_decay (float): L2 decay applied to the FC parameters.
    """

    def __init__(self, in_channels, out_channels, fc_decay=1e-5, **kwargs):
        super(CTC, self).__init__()
        w_attr, b_attr = get_para_bias_attr(
            l2_decay=fc_decay, k=in_channels, name='ctc_fc')
        self.fc = nn.Linear(
            in_channels,
            out_channels,
            weight_attr=w_attr,
            bias_attr=b_attr,
            name='ctc_fc')
        self.out_channels = out_channels

    def forward(self, x, labels=None):
        # labels are unused here; the CTC loss consumes them elsewhere.
        return self.fc(x)
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle.fluid as fluid
import paddle.fluid.layers as layers
class EncoderWithReshape(object):
    """Turn a conv feature map into a sequence by slicing it column-wise
    (im2sequence with a full-height, width-1 window)."""

    def __init__(self, params):
        super(EncoderWithReshape, self).__init__()

    def __call__(self, inputs):
        full_height = inputs.shape[2]
        return layers.im2sequence(
            input=inputs,
            stride=[1, 1],
            filter_size=[full_height, 1],
            name="sliced_feature")
class EncoderWithRNN(object):
    """Bidirectional two-stage LSTM encoder over the sliced feature sequence.

    Builds a forward stack (no == 1) and a backward stack (no == 2); each
    stack is two fc + dynamic_lstm stages. Returns both direction outputs
    as a list.
    """

    def __init__(self, params):
        super(EncoderWithRNN, self).__init__()
        self.rnn_hidden_size = params['SeqRNN']['hidden_size']

    def __call__(self, inputs):
        hidden = self.rnn_hidden_size
        prefix = "lstm"
        outputs = []
        for no in (1, 2):
            reverse = no == 2  # second stack scans the sequence backwards
            feat = inputs
            for stage in (1, 2):
                fc_name = "%s_st%d_fc%d" % (prefix, stage, no)
                fc = layers.fc(input=feat,
                               size=hidden * 4,
                               param_attr=fluid.ParamAttr(name=fc_name + "_w"),
                               bias_attr=fluid.ParamAttr(name=fc_name + "_b"),
                               name=fc_name)
                out_name = "%s_st%d_out%d" % (prefix, stage, no)
                feat, _ = layers.dynamic_lstm(
                    input=fc,
                    size=hidden * 4,
                    is_reverse=reverse,
                    param_attr=fluid.ParamAttr(name=out_name + "_w"),
                    bias_attr=fluid.ParamAttr(name=out_name + "_b"),
                    use_peepholes=False)
            outputs.append(feat)
        return outputs
class SequenceEncoder(object):
    """Dispatch to the reshape-only or reshape+RNN sequence encoder,
    selected by params['encoder_type'] ('reshape' or 'rnn')."""

    def __init__(self, params):
        super(SequenceEncoder, self).__init__()
        self.encoder_type = params['encoder_type']
        self.encoder_reshape = EncoderWithReshape(params)
        if self.encoder_type == "rnn":
            self.encoder_rnn = EncoderWithRNN(params)

    def __call__(self, inputs):
        encoder_type = self.encoder_type
        if encoder_type == "reshape":
            return self.encoder_reshape(inputs)
        if encoder_type == "rnn":
            sliced = self.encoder_reshape(inputs)
            return self.encoder_rnn(sliced)
        assert False, "Unsupport encoder_type:%s" % encoder_type
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
import numpy as np
from .self_attention.model import wrap_encoder
from .self_attention.model import wrap_encoder_forFeature
# Bound used by GradientClipByValue after the PVAM transformer encoder.
gradient_clip = 10
class SRNPredict(object):
    """SRN (Semantic Reasoning Network) recognition head, legacy static graph.

    Pipeline: PVAM (parallel visual attention) -> GSRM (global semantic
    reasoning) -> VSFD (visual-semantic fusion decoder).
    """

    def __init__(self, params):
        super(SRNPredict, self).__init__()
        self.char_num = params['char_num']
        self.max_length = params['max_text_length']
        self.num_heads = params['num_heads']
        self.num_encoder_TUs = params['num_encoder_TUs']
        self.num_decoder_TUs = params['num_decoder_TUs']
        self.hidden_dims = params['hidden_dims']

    def pvam(self, inputs, others):
        """Parallel Visual Attention Module.

        Flattens the conv feature map to a sequence, encodes it with a
        transformer encoder, then attends once per output character position.
        Returns pvam_features with shape [b, max_length, c].
        """
        # inputs assumed NCHW — flatten spatial dims into a t = h*w sequence.
        b, c, h, w = inputs.shape
        conv_features = fluid.layers.reshape(x=inputs, shape=[-1, c, h * w])
        conv_features = fluid.layers.transpose(x=conv_features, perm=[0, 2, 1])
        #===== Transformer encoder =====
        b, t, c = conv_features.shape
        encoder_word_pos = others["encoder_word_pos"]
        gsrm_word_pos = others["gsrm_word_pos"]
        enc_inputs = [conv_features, encoder_word_pos, None]
        # src_vocab_size=-1: features are already dense, no embedding lookup.
        word_features = wrap_encoder_forFeature(
            src_vocab_size=-1,
            max_length=t,
            n_layer=self.num_encoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True,
            enc_inputs=enc_inputs, )
        # Global value clipping for all gradients (module-level constant).
        fluid.clip.set_gradient_clip(
            fluid.clip.GradientClipByValue(gradient_clip))
        #===== Parallel Visual Attention Module =====
        b, t, c = word_features.shape
        word_features = fluid.layers.fc(word_features, c, num_flatten_dims=2)
        # Broadcast features against every output position, and position
        # embeddings against every time step, to score all pairs at once.
        word_features_ = fluid.layers.reshape(word_features, [-1, 1, t, c])
        word_features_ = fluid.layers.expand(word_features_,
                                             [1, self.max_length, 1, 1])
        word_pos_feature = fluid.layers.embedding(gsrm_word_pos,
                                                  [self.max_length, c])
        word_pos_ = fluid.layers.reshape(word_pos_feature,
                                         [-1, self.max_length, 1, c])
        word_pos_ = fluid.layers.expand(word_pos_, [1, 1, t, 1])
        temp = fluid.layers.elementwise_add(
            word_features_, word_pos_, act='tanh')
        attention_weight = fluid.layers.fc(input=temp,
                                           size=1,
                                           num_flatten_dims=3,
                                           bias_attr=False)
        attention_weight = fluid.layers.reshape(
            x=attention_weight, shape=[-1, self.max_length, t])
        attention_weight = fluid.layers.softmax(input=attention_weight, axis=-1)
        pvam_features = fluid.layers.matmul(attention_weight,
                                            word_features) #[b, max_length, c]
        return pvam_features

    def gsrm(self, pvam_features, others):
        """Global Semantic Reasoning Module.

        Decodes provisional characters from pvam_features, then reasons over
        them with a forward and a backward transformer ("bi-transformers").
        Returns (gsrm_features, word_out, gsrm_out).
        """
        #===== GSRM Visual-to-semantic embedding block =====
        b, t, c = pvam_features.shape
        word_out = fluid.layers.fc(
            input=fluid.layers.reshape(pvam_features, [-1, c]),
            size=self.char_num,
            act="softmax")
        #word_out.stop_gradient = True
        # Hard argmax decoding; gradients do not flow through the ids.
        word_ids = fluid.layers.argmax(word_out, axis=1)
        word_ids.stop_gradient = True
        word_ids = fluid.layers.reshape(x=word_ids, shape=[-1, t, 1])
        #===== GSRM Semantic reasoning block =====
        """
        This module is achieved through bi-transformers,
        ngram_feature1 is the froward one, ngram_fetaure2 is the backward one
        """
        pad_idx = self.char_num
        gsrm_word_pos = others["gsrm_word_pos"]
        gsrm_slf_attn_bias1 = others["gsrm_slf_attn_bias1"]
        gsrm_slf_attn_bias2 = others["gsrm_slf_attn_bias2"]

        def prepare_bi(word_ids):
            """
            prepare bi for gsrm
            word1 for forward; word2 for backward
            """
            # Shift right by one (pad with pad_idx) so each position only
            # sees preceding characters in the forward pass.
            word1 = fluid.layers.cast(word_ids, "float32")
            word1 = fluid.layers.pad(word1, [0, 0, 1, 0, 0, 0],
                                     pad_value=1.0 * pad_idx)
            word1 = fluid.layers.cast(word1, "int64")
            word1 = word1[:, :-1, :]
            word2 = word_ids
            return word1, word2

        word1, word2 = prepare_bi(word_ids)
        word1.stop_gradient = True
        word2.stop_gradient = True
        enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1]
        enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2]
        gsrm_feature1 = wrap_encoder(
            src_vocab_size=self.char_num + 1,
            max_length=self.max_length,
            n_layer=self.num_decoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True,
            enc_inputs=enc_inputs_1, )
        gsrm_feature2 = wrap_encoder(
            src_vocab_size=self.char_num + 1,
            max_length=self.max_length,
            n_layer=self.num_decoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True,
            enc_inputs=enc_inputs_2, )
        # Shift the backward features left by one so both directions align
        # on the same target position before fusing.
        gsrm_feature2 = fluid.layers.pad(gsrm_feature2, [0, 0, 0, 1, 0, 0],
                                         pad_value=0.)
        gsrm_feature2 = gsrm_feature2[:, 1:, ]
        gsrm_features = gsrm_feature1 + gsrm_feature2
        b, t, c = gsrm_features.shape
        # Tie the output projection to the shared word embedding table.
        gsrm_out = fluid.layers.matmul(
            x=gsrm_features,
            y=fluid.default_main_program().global_block().var(
                "src_word_emb_table"),
            transpose_y=True)
        b, t, c = gsrm_out.shape
        gsrm_out = fluid.layers.softmax(input=fluid.layers.reshape(gsrm_out,
                                                                   [-1, c]))
        return gsrm_features, word_out, gsrm_out

    def vsfd(self, pvam_features, gsrm_features):
        """Visual-Semantic Fusion Decoder: learned sigmoid gate mixing the
        visual (PVAM) and semantic (GSRM) features, then a softmax FC."""
        #===== Visual-Semantic Fusion Decoder Module =====
        b, t, c1 = pvam_features.shape
        b, t, c2 = gsrm_features.shape
        combine_features_ = fluid.layers.concat(
            [pvam_features, gsrm_features], axis=2)
        img_comb_features_ = fluid.layers.reshape(
            x=combine_features_, shape=[-1, c1 + c2])
        img_comb_features_map = fluid.layers.fc(input=img_comb_features_,
                                                size=c1,
                                                act="sigmoid")
        img_comb_features_map = fluid.layers.reshape(
            x=img_comb_features_map, shape=[-1, t, c1])
        combine_features = img_comb_features_map * pvam_features + (
            1.0 - img_comb_features_map) * gsrm_features
        img_comb_features = fluid.layers.reshape(
            x=combine_features, shape=[-1, c1])
        fc_out = fluid.layers.fc(input=img_comb_features,
                                 size=self.char_num,
                                 act="softmax")
        return fc_out

    def __call__(self, inputs, others, mode=None):
        """Run PVAM -> GSRM -> VSFD; return all intermediate predictions
        (the training loss combines 'predict', 'word_out' and 'gsrm_out')."""
        pvam_features = self.pvam(inputs, others)
        gsrm_features, word_out, gsrm_out = self.gsrm(pvam_features, others)
        final_out = self.vsfd(pvam_features, gsrm_features)
        _, decoded_out = fluid.layers.topk(input=final_out, k=1)
        predicts = {
            'predict': final_out,
            'decoded_out': decoded_out,
            'word_out': word_out,
            'gsrm_out': gsrm_out
        }
        return predicts
from functools import partial
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
# Names of the feed fields the transformer encoder consumes:
# word ids, position ids, and the self-attention bias (mask).
encoder_data_input_fields = (
    "src_word",
    "src_pos",
    "src_slf_attn_bias", )
def wrap_layer_with_block(layer, block_idx):
    """Return a wrapper that builds ``layer``'s ops and vars inside the
    program block at ``block_idx`` rather than the current block.

    This makes it easy to register cache variables in an outer (parent)
    block while constructing ops inside a while loop.
    """

    class _BlockSwitch(object):
        """Context manager that temporarily changes the program's current
        block index and restores it on exit."""

        def __init__(self, target_idx=None, main_program=None):
            if main_program is None:
                main_program = fluid.default_main_program()
            self.main_program = main_program
            self.old_block_idx = self.main_program.current_block().idx
            self.new_block_idx = target_idx

        def __enter__(self):
            self.main_program.current_block_idx = self.new_block_idx

        def __exit__(self, exc_type, exc_val, exc_tb):
            # Always restore the previous block; propagate any exception.
            self.main_program.current_block_idx = self.old_block_idx
            return exc_type is None

    def layer_wrapper(*args, **kwargs):
        with _BlockSwitch(block_idx):
            return layer(*args, **kwargs)

    return layer_wrapper
def multi_head_attention(queries,
                         keys,
                         values,
                         attn_bias,
                         d_key,
                         d_value,
                         d_model,
                         n_head=1,
                         dropout_rate=0.,
                         cache=None,
                         gather_idx=None,
                         static_kv=False):
    """
    Multi-Head Attention. Note that attn_bias is added to the logits before
    the softmax activation to mask selected positions so that they are not
    considered in the attention weights.

    ``cache``/``gather_idx``/``static_kv`` support incremental decoding with
    beam search: cached K/V live in the parent (global) block and are
    gathered/updated per step.
    """
    # Self-attention when keys/values are omitted.
    keys = queries if keys is None else keys
    values = keys if values is None else values
    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs: quries, keys and values should all be 3-D tensors.")

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
        q = layers.fc(input=queries,
                      size=d_key * n_head,
                      bias_attr=False,
                      num_flatten_dims=2)
        # For encoder-decoder attention in inference, insert the ops and vars
        # into global block to use as cache among beam search.
        fc_layer = wrap_layer_with_block(
            layers.fc, fluid.default_main_program().current_block()
            .parent_idx) if cache is not None and static_kv else layers.fc
        k = fc_layer(
            input=keys,
            size=d_key * n_head,
            bias_attr=False,
            num_flatten_dims=2)
        v = fc_layer(
            input=values,
            size=d_value * n_head,
            bias_attr=False,
            num_flatten_dims=2)
        return q, k, v

    def __split_heads_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Reshape input tensors at the last dimension to split multi-heads
        and then transpose. Specifically, transform the input tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] to the output tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        reshaped_q = layers.reshape(
            x=queries, shape=[0, 0, n_head, d_key], inplace=True)
        # permuate the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        q = layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
        # For encoder-decoder attention in inference, insert the ops and vars
        # into global block to use as cache among beam search.
        reshape_layer = wrap_layer_with_block(
            layers.reshape,
            fluid.default_main_program().current_block()
            .parent_idx) if cache is not None and static_kv else layers.reshape
        transpose_layer = wrap_layer_with_block(
            layers.transpose,
            fluid.default_main_program().current_block().
            parent_idx) if cache is not None and static_kv else layers.transpose
        reshaped_k = reshape_layer(
            x=keys, shape=[0, 0, n_head, d_key], inplace=True)
        k = transpose_layer(x=reshaped_k, perm=[0, 2, 1, 3])
        reshaped_v = reshape_layer(
            x=values, shape=[0, 0, n_head, d_value], inplace=True)
        v = transpose_layer(x=reshaped_v, perm=[0, 2, 1, 3])
        if cache is not None:  # only for faster inference
            if static_kv:  # For encoder-decoder attention in inference
                cache_k, cache_v = cache["static_k"], cache["static_v"]
                # To init the static_k and static_v in cache.
                # Maybe we can use condition_op(if_else) to do these at the first
                # step in while loop to replace these, however it might be less
                # efficient.
                static_cache_init = wrap_layer_with_block(
                    layers.assign,
                    fluid.default_main_program().current_block().parent_idx)
                static_cache_init(k, cache_k)
                static_cache_init(v, cache_v)
            else:  # For decoder self-attention in inference
                cache_k, cache_v = cache["k"], cache["v"]
            # gather cell states corresponding to selected parent
            select_k = layers.gather(cache_k, index=gather_idx)
            select_v = layers.gather(cache_v, index=gather_idx)
            if not static_kv:
                # For self attention in inference, use cache and concat time steps.
                select_k = layers.concat([select_k, k], axis=2)
                select_v = layers.concat([select_v, v], axis=2)
            # update cell states(caches) cached in global block
            layers.assign(select_k, cache_k)
            layers.assign(select_v, cache_v)
            return q, select_k, select_v
        return q, k, v

    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of inpunt tensor x
        so that it becomes one dimension, which is reverse to __split_heads.
        """
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")
        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
            inplace=True)

    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        # print(q)
        # print(k)
        product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_key**-0.5)
        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(
                weights, dropout_prob=dropout_rate, seed=None, is_test=False)
        out = layers.matmul(weights, v)
        return out

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
    q, k, v = __split_heads_qkv(q, k, v, n_head, d_key, d_value)
    # NOTE(review): d_model is passed into the d_key slot here, so the logits
    # are scaled by d_model**-0.5 rather than d_key**-0.5. This matches the
    # upstream transformer reference code — confirm before "fixing".
    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model,
                                                  dropout_rate)
    out = __combine_heads(ctx_multiheads)
    # Project back to the model size.
    proj_out = layers.fc(input=out,
                         size=d_model,
                         bias_attr=False,
                         num_flatten_dims=2)
    return proj_out
def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate):
    """
    Position-wise feed-forward network: FC(relu) -> optional dropout -> FC,
    applied identically and independently at every sequence position.
    """
    inner = layers.fc(input=x,
                      size=d_inner_hid,
                      num_flatten_dims=2,
                      act="relu")
    if dropout_rate:
        inner = layers.dropout(
            inner, dropout_prob=dropout_rate, seed=None, is_test=False)
    return layers.fc(input=inner, size=d_hid, num_flatten_dims=2)
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.):
    """
    Apply residual add ('a'), layer normalization ('n') and/or dropout ('d')
    to ``out``, in the order given by ``process_cmd``. Used both before and
    after the attention and feed-forward sub-layers.

    Args:
        prev_out: residual input; ``None`` when no residual is wanted.
        out: tensor to process.
        process_cmd (str): any combination of the characters 'a', 'n', 'd'.
        dropout_rate (float): dropout probability; 0 disables dropout.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            # BUGFIX: compare against None explicitly instead of relying on
            # the truthiness of a graph Variable (which defaults to True and
            # would also skip the residual for falsy numeric inputs).
            out = out + prev_out if prev_out is not None else out
        elif cmd == "n":  # add layer normalization
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.initializer.Constant(1.),
                bias_attr=fluid.initializer.Constant(0.))
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(
                    out, dropout_prob=dropout_rate, seed=None, is_test=False)
    return out
# Pre-processing runs the cmd chain with no residual input; post-processing
# reuses the same function with the residual tensor supplied.
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
def prepare_encoder(
        src_word,  # [b,t,c]
        src_pos,
        src_vocab_size,
        src_emb_dim,
        src_max_len,
        dropout_rate=0.,
        bos_idx=0,
        word_emb_param_name=None,
        pos_enc_param_name=None):
    """Scale already-dense source features and add frozen position
    encodings; optionally apply dropout.

    Unlike prepare_decoder, ``src_word`` is used directly as the embedding
    (no vocabulary lookup), so src_vocab_size / bos_idx /
    word_emb_param_name are unused here.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    """
    scaled_emb = layers.scale(
        x=layers.cast(src_word, 'float32'), scale=src_emb_dim**0.5)
    pos_enc = layers.embedding(
        src_pos,
        size=[src_max_len, src_emb_dim],
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_name, trainable=False))
    pos_enc.stop_gradient = True  # the position table is never trained
    enc_input = scaled_emb + pos_enc
    if not dropout_rate:
        return enc_input
    return layers.dropout(
        enc_input, dropout_prob=dropout_rate, seed=None, is_test=False)
def prepare_decoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_emb_dim,
                    src_max_len,
                    dropout_rate=0.,
                    bos_idx=0,
                    word_emb_param_name=None,
                    pos_enc_param_name=None):
    """Look up word embeddings, scale them by sqrt(d_model), and add frozen
    position encodings; optionally apply dropout.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    """
    word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
        padding_idx=bos_idx,  # set embedding of bos to 0
        param_attr=fluid.ParamAttr(
            name=word_emb_param_name,
            initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
    word_emb = layers.scale(x=word_emb, scale=src_emb_dim**0.5)
    pos_enc = layers.embedding(
        src_pos,
        size=[src_max_len, src_emb_dim],
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_name, trainable=False))
    pos_enc.stop_gradient = True  # the position table is never trained
    dec_input = word_emb + pos_enc
    if not dropout_rate:
        return dec_input
    return layers.dropout(
        dec_input, dropout_prob=dropout_rate, seed=None, is_test=False)
def encoder_layer(enc_input,
                  attn_bias,
                  n_head,
                  d_key,
                  d_value,
                  d_model,
                  d_inner_hid,
                  prepostprocess_dropout,
                  attention_dropout,
                  relu_dropout,
                  preprocess_cmd="n",
                  postprocess_cmd="da"):
    """One transformer encoder layer: pre-processed multi-head self-attention
    followed by a position-wise feed-forward network, each wrapped with
    post-processing (residual / layer-norm / dropout)."""
    normed_input = pre_process_layer(enc_input, preprocess_cmd,
                                     prepostprocess_dropout)
    attn_output = multi_head_attention(normed_input, None, None, attn_bias,
                                       d_key, d_value, d_model, n_head,
                                       attention_dropout)
    attn_output = post_process_layer(enc_input, attn_output, postprocess_cmd,
                                     prepostprocess_dropout)
    normed_attn = pre_process_layer(attn_output, preprocess_cmd,
                                    prepostprocess_dropout)
    ffd_output = positionwise_feed_forward(normed_attn, d_inner_hid, d_model,
                                           relu_dropout)
    return post_process_layer(attn_output, ffd_output, postprocess_cmd,
                              prepostprocess_dropout)
def encoder(enc_input,
            attn_bias,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            preprocess_cmd="n",
            postprocess_cmd="da"):
    """
    Compose a deep encoder from ``n_layer`` identical layers built by
    encoder_layer, then apply a final pre_process_layer (normalization).

    BUGFIX: the previous version referenced an uninitialized ``enc_output``
    (NameError) when n_layer == 0; threading the output through enc_input
    handles that degenerate case cleanly.
    """
    for _ in range(n_layer):
        enc_input = encoder_layer(
            enc_input,
            attn_bias,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            preprocess_cmd,
            postprocess_cmd, )
    return pre_process_layer(enc_input, preprocess_cmd, prepostprocess_dropout)
def wrap_encoder_forFeature(src_vocab_size,
                            max_length,
                            n_layer,
                            n_head,
                            d_key,
                            d_value,
                            d_model,
                            d_inner_hid,
                            prepostprocess_dropout,
                            attention_dropout,
                            relu_dropout,
                            preprocess_cmd,
                            postprocess_cmd,
                            weight_sharing,
                            enc_inputs=None,
                            bos_idx=0):
    """Assemble input preparation plus the encoder stack for conv-feature
    inputs.

    ``enc_inputs`` unpacks to (conv_features, src_pos, src_slf_attn_bias);
    conv_features are used directly as dense embeddings (prepare_encoder —
    no vocabulary lookup).
    """
    conv_features, src_pos, src_slf_attn_bias = enc_inputs  #
    b, t, c = conv_features.shape
    prepared = prepare_encoder(
        conv_features,
        src_pos,
        src_vocab_size,
        d_model,
        max_length,
        prepostprocess_dropout,
        bos_idx=bos_idx,
        word_emb_param_name="src_word_emb_table")
    return encoder(
        prepared,
        src_slf_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd, )
def wrap_encoder(src_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd,
                 postprocess_cmd,
                 weight_sharing,
                 enc_inputs=None,
                 bos_idx=0):
    """Assemble word embedding plus the encoder stack for word-id inputs.

    ``enc_inputs`` unpacks to (src_word, src_pos, src_slf_attn_bias); word
    ids are embedded via prepare_decoder using the shared
    "src_word_emb_table" parameter.
    """
    src_word, src_pos, src_slf_attn_bias = enc_inputs  #
    prepared = prepare_decoder(
        src_word,
        src_pos,
        src_vocab_size,
        d_model,
        max_length,
        prepostprocess_dropout,
        bos_idx=bos_idx,
        word_emb_param_name="src_word_emb_table")
    return encoder(
        prepared,
        src_slf_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd, )
......@@ -11,3 +11,22 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
def build_loss(config):
    """Instantiate a loss module from a config dict.

    Args:
        config (dict): must contain 'name' (one of the supported class
            names); the remaining keys are passed to the loss constructor.
            The input dict is deep-copied, so the caller's config is not
            mutated by the pop.

    Returns:
        The constructed loss instance.
    """
    # det loss
    from .det_db_loss import DBLoss
    # rec loss
    from .rec_ctc_loss import CTCLoss
    support_dict = ['DBLoss', 'CTCLoss']

    config = copy.deepcopy(config)
    module_name = config.pop('name')
    # BUGFIX: the message was wrapped in Exception(...), which made the
    # AssertionError display an Exception repr; use a plain string.
    assert module_name in support_dict, 'loss only support {}'.format(
        support_dict)
    # eval only resolves names from the whitelist above, never user input.
    module_class = eval(module_name)(**config)
    return module_class
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
......@@ -18,99 +18,189 @@ from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
# NOTE(review): legacy static-graph BalanceLoss. The body below is truncated
# (the `if not balance_loss:` branch has no suite — the remainder was lost in
# a diff/merge); restore the missing tail or delete this function before use.
def BalanceLoss(pred,
                gt,
                mask,
                balance_loss=True,
                main_loss_type="DiceLoss",
                negative_ratio=3,
                return_origin=False,
                eps=1e-6):
    """
    The BalanceLoss for Differentiable Binarization text detection
    args:
        pred (variable): predicted feature maps.
        gt (variable): ground truth feature maps.
        mask (variable): masked maps.
        balance_loss (bool): whether balance loss or not, default is True
        main_loss_type (str): can only be one of ['CrossEntropy','DiceLoss',
            'Euclidean','BCELoss', 'MaskL1Loss'], default is 'DiceLoss'.
        negative_ratio (int|float): float, default is 3.
        return_origin (bool): whether return unbalanced loss or not, default is False.
        eps (float): default is 1e-6.
    return: (variable) balanced loss
    """
    positive = gt * mask
    negative = (1 - gt) * mask
    # Cap the number of negatives at negative_ratio * positives (OHEM).
    positive_count = fluid.layers.reduce_sum(positive)
    positive_count_int = fluid.layers.cast(positive_count, dtype=np.int32)
    negative_count = min(
        fluid.layers.reduce_sum(negative), positive_count * negative_ratio)
    negative_count_int = fluid.layers.cast(negative_count, dtype=np.int32)
    if main_loss_type == "CrossEntropy":
        loss = fluid.layers.cross_entropy(input=pred, label=gt, soft_label=True)
        loss = fluid.layers.reduce_mean(loss)
    elif main_loss_type == "Euclidean":
        loss = fluid.layers.square(pred - gt)
        loss = fluid.layers.reduce_mean(loss)
    elif main_loss_type == "DiceLoss":
        loss = DiceLoss(pred, gt, mask)
    elif main_loss_type == "BCELoss":
        loss = fluid.layers.sigmoid_cross_entropy_with_logits(pred, label=gt)
    elif main_loss_type == "MaskL1Loss":
        loss = MaskL1Loss(pred, gt, mask)
    else:
        loss_type = [
            'CrossEntropy', 'DiceLoss', 'Euclidean', 'BCELoss', 'MaskL1Loss'
        ]
        raise Exception("main_loss_type in BalanceLoss() can only be one of {}".
                        format(loss_type))
    if not balance_loss:  # NOTE(review): truncated here in the source
import paddle
from paddle import nn
import paddle.nn.functional as F
class BalanceLoss(nn.Layer):
    def __init__(self,
                 balance_loss=True,
                 main_loss_type='DiceLoss',
                 negative_ratio=3,
                 return_origin=False,
                 eps=1e-6,
                 **kwargs):
        """
        The BalanceLoss for Differentiable Binarization text detection
        args:
            balance_loss (bool): whether balance loss or not, default is True
            main_loss_type (str): can only be one of ['CrossEntropy','DiceLoss',
                'Euclidean','BCELoss', 'MaskL1Loss'], default is 'DiceLoss'.
            negative_ratio (int|float): float, default is 3.
            return_origin (bool): whether return unbalanced loss or not, default is False.
            eps (float): default is 1e-6.
        """
        super(BalanceLoss, self).__init__()
        self.balance_loss = balance_loss
        # BUGFIX: main_loss_type was assigned twice; keep one assignment.
        self.main_loss_type = main_loss_type
        self.negative_ratio = negative_ratio
        self.return_origin = return_origin
        self.eps = eps
        if self.main_loss_type == "CrossEntropy":
            self.loss = nn.CrossEntropyLoss()
        elif self.main_loss_type == "Euclidean":
            self.loss = nn.MSELoss()
        elif self.main_loss_type == "DiceLoss":
            self.loss = DiceLoss(self.eps)
        elif self.main_loss_type == "BCELoss":
            self.loss = BCELoss(reduction='none')
        elif self.main_loss_type == "MaskL1Loss":
            self.loss = MaskL1Loss(self.eps)
        else:
            loss_type = [
                'CrossEntropy', 'DiceLoss', 'Euclidean', 'BCELoss', 'MaskL1Loss'
            ]
            raise Exception(
                "main_loss_type in BalanceLoss() can only be one of {}".format(
                    loss_type))

    def forward(self, pred, gt, mask=None):
        """
        Balance the main loss with online hard example mining (OHEM):
        keep all positive-pixel losses but only the hardest
        ``negative_ratio * positive_count`` negative-pixel losses.
        args:
            pred (variable): predicted feature maps.
            gt (variable): ground truth feature maps.
            mask (variable): masked maps.
        return: (variable) balanced loss
        """
        positive = gt * mask
        negative = (1 - gt) * mask
        positive_count = int(positive.sum())
        negative_count = int(
            min(negative.sum(), positive_count * self.negative_ratio))
        loss = self.loss(pred, gt, mask=mask)
        if not self.balance_loss:
            return loss

        positive_loss = positive * loss
        negative_loss = negative * loss
        negative_loss = paddle.reshape(negative_loss, shape=[-1])
        if negative_count > 0:
            # Keep only the largest (hardest) negative losses.
            sort_loss = negative_loss.sort(descending=True)
            negative_loss = sort_loss[:negative_count]
            balance_loss = (positive_loss.sum() + negative_loss.sum()) / (
                positive_count + negative_count + self.eps)
        else:
            balance_loss = positive_loss.sum() / (positive_count + self.eps)
        if self.return_origin:
            return balance_loss, loss
        return balance_loss
class DiceLoss(nn.Layer):
    """Dice loss over a masked region: 1 - 2*|P∩G| / (|P| + |G| + eps)."""

    def __init__(self, eps=1e-6):
        super(DiceLoss, self).__init__()
        self.eps = eps

    def forward(self, pred, gt, mask, weights=None):
        """
        Compute the dice loss between pred and gt, restricted to mask.
        An optional per-pixel weights map is folded into the mask.
        """
        assert pred.shape == gt.shape
        assert pred.shape == mask.shape
        if weights is not None:
            assert weights.shape == mask.shape
            mask = weights * mask
        overlap = paddle.sum(pred * gt * mask)
        total = paddle.sum(pred * mask) + paddle.sum(gt * mask) + self.eps
        loss = 1 - 2.0 * overlap / total
        assert loss <= 1
        return loss
positive_loss = positive * loss
negative_loss = negative * loss
negative_loss = fluid.layers.reshape(negative_loss, shape=[-1])
negative_loss, _ = fluid.layers.topk(negative_loss, k=negative_count_int)
balance_loss = (fluid.layers.reduce_sum(positive_loss) +
fluid.layers.reduce_sum(negative_loss)) / (
positive_count + negative_count + eps)
if return_origin:
return balance_loss, loss
return balance_loss
def DiceLoss(pred, gt, mask, weights=None, eps=1e-6):
    """
    Dice loss between pred and gt inside the masked region.
    An optional per-pixel weights map is folded into the mask.
    """
    assert pred.shape == gt.shape
    assert pred.shape == mask.shape
    if weights is not None:
        assert weights.shape == mask.shape
        mask = weights * mask
    overlap = fluid.layers.reduce_sum(pred * gt * mask)
    total = fluid.layers.reduce_sum(pred * mask) + fluid.layers.reduce_sum(
        gt * mask) + eps
    loss = 1 - 2.0 * overlap / total
    assert loss <= 1
    return loss
def MaskL1Loss(pred, gt, mask, eps=1e-6):
    """
    Masked L1 loss: sum of |pred - gt| inside mask, normalized by the
    mask area (plus eps to avoid division by zero).
    """
    masked_abs_diff = fluid.layers.abs(pred - gt) * mask
    loss = fluid.layers.reduce_sum(masked_abs_diff) / (
        fluid.layers.reduce_sum(mask) + eps)
    loss = fluid.layers.reduce_mean(loss)
    return loss
class MaskL1Loss(nn.Layer):
    """Masked L1 loss (used for the threshold-map branch of DB)."""

    def __init__(self, eps=1e-6):
        super(MaskL1Loss, self).__init__()
        self.eps = eps

    def forward(self, pred, gt, mask):
        """Average |pred - gt| over the masked region."""
        abs_diff = paddle.abs(pred - gt)
        loss = (abs_diff * mask).sum() / (mask.sum() + self.eps)
        loss = paddle.mean(loss)
        return loss
class BCELoss(nn.Layer):
    # Thin wrapper around F.binary_cross_entropy with a fixed reduction mode.
    def __init__(self, reduction='mean'):
        super(BCELoss, self).__init__()
        self.reduction = reduction
    def forward(self, input, label, mask=None, weight=None, name=None):
        # NOTE(review): mask, weight and name are accepted for call-site
        # compatibility with the other losses but are currently ignored.
        loss = F.binary_cross_entropy(input, label, reduction=self.reduction)
        return loss
def ohem_single(score, gt_text, training_mask, ohem_ratio):
    """
    Online hard example mining for one sample.

    Keeps all positive pixels plus at most ohem_ratio * pos_num of the
    highest-scoring negatives; the selection is returned as a float32
    mask of shape (1, H, W). If there are no positives (or no negatives
    to select), the training mask itself is returned.
    """

    def _as_batched_mask(m):
        # (H, W) -> (1, H, W), float32
        return m.reshape(1, m.shape[0], m.shape[1]).astype('float32')

    # positives are gt pixels that are not suppressed by the training mask
    pos_num = int(np.sum(gt_text > 0.5)) - int(
        np.sum((gt_text > 0.5) & (training_mask <= 0.5)))
    if pos_num == 0:
        return _as_batched_mask(training_mask)

    neg_num = int(np.sum(gt_text <= 0.5))
    neg_num = int(min(pos_num * ohem_ratio, neg_num))
    if neg_num == 0:
        return _as_batched_mask(training_mask)

    # sort negative scores from high to low and take the cut-off score
    neg_score = score[gt_text <= 0.5]
    neg_score_sorted = np.sort(-neg_score)
    threshold = -neg_score_sorted[neg_num - 1]
    # select high-scoring negatives and all positives inside the mask
    selected_mask = ((score >= threshold) |
                     (gt_text > 0.5)) & (training_mask > 0.5)
    return _as_batched_mask(selected_mask)
def ohem_batch(scores, gt_texts, training_masks, ohem_ratio):
    """
    Apply ohem_single to every sample of the batch and stack the
    resulting (1, H, W) masks into one paddle tensor of shape (N, H, W).
    """
    scores = scores.numpy()
    gt_texts = gt_texts.numpy()
    training_masks = training_masks.numpy()

    selected_masks = [
        ohem_single(scores[i, :, :], gt_texts[i, :, :],
                    training_masks[i, :, :], ohem_ratio)
        for i in range(scores.shape[0])
    ]
    selected_masks = np.concatenate(selected_masks, 0)
    return paddle.to_variable(selected_masks)
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import nn
from .det_basic_loss import BalanceLoss, MaskL1Loss, DiceLoss
class DBLoss(nn.Layer):
    """
    Differentiable Binarization (DB) Loss Function.

    Combines three terms computed from the three predicted channels:
      - a balanced loss (OHEM) on the shrink map, weighted by alpha,
      - a masked L1 loss on the threshold map, weighted by beta,
      - a dice loss on the binary map.

    args:
        balance_loss (bool): whether to balance the shrink-map loss via OHEM.
        main_loss_type (str): loss used for the shrink map inside BalanceLoss.
        alpha (int|float): weight of the shrink-map loss, default 5.
        beta (int|float): weight of the threshold-map loss, default 10.
        ohem_ratio (int|float): negative/positive ratio for OHEM, default 3.
        eps (float): numerical-stability constant for the sub-losses.
    """

    def __init__(self,
                 balance_loss=True,
                 main_loss_type='DiceLoss',
                 alpha=5,
                 beta=10,
                 ohem_ratio=3,
                 eps=1e-6,
                 **kwargs):
        super(DBLoss, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.dice_loss = DiceLoss(eps=eps)
        self.l1_loss = MaskL1Loss(eps=eps)
        self.bce_loss = BalanceLoss(
            balance_loss=balance_loss,
            main_loss_type=main_loss_type,
            negative_ratio=ohem_ratio)

    def forward(self, predicts, labels):
        """
        Compute the combined DB loss.

        args:
            predicts: predicted maps, channel 0 = shrink map,
                channel 1 = threshold map, channel 2 = binary map.
            labels: sequence whose elements 1..4 are the ground-truth
                threshold map/mask and shrink map/mask, in that order.
        return: dict with 'loss' (total) and the three weighted terms.
        """
        label_threshold_map, label_threshold_mask, label_shrink_map, label_shrink_mask = labels[
            1:]
        shrink_maps = predicts[:, 0, :, :]
        threshold_maps = predicts[:, 1, :, :]
        binary_maps = predicts[:, 2, :, :]

        loss_shrink_maps = self.bce_loss(shrink_maps, label_shrink_map,
                                         label_shrink_mask)
        loss_threshold_maps = self.l1_loss(threshold_maps, label_threshold_map,
                                           label_threshold_mask)
        loss_binary_maps = self.dice_loss(binary_maps, label_shrink_map,
                                          label_shrink_mask)
        loss_shrink_maps = self.alpha * loss_shrink_maps
        loss_threshold_maps = self.beta * loss_threshold_maps
        loss_all = loss_shrink_maps + loss_threshold_maps \
                   + loss_binary_maps
        losses = {'loss': loss_all, \
                  "loss_shrink_maps": loss_shrink_maps, \
                  "loss_threshold_maps": loss_threshold_maps, \
                  "loss_binary_maps": loss_binary_maps}
        return losses
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
class EASTLoss(object):
    """
    EAST Loss function: dice loss on the score map plus a weighted
    smooth-L1-style loss on the 8-channel geometry map.
    """
    def __init__(self, params=None):
        # params is accepted for config-driven construction but unused here.
        super(EASTLoss, self).__init__()
    def __call__(self, predicts, labels):
        # predicts: dict with predicted 'f_score' and 'f_geo' maps;
        # labels: dict with ground-truth 'score', 'geo' and valid 'mask'.
        f_score = predicts['f_score']
        f_geo = predicts['f_geo']
        l_score = labels['score']
        l_geo = labels['geo']
        l_mask = labels['mask']
        # dice loss on the score map, restricted to the valid mask
        intersection = fluid.layers.reduce_sum(f_score * l_score * l_mask)
        union = fluid.layers.reduce_sum(f_score * l_mask)\
            + fluid.layers.reduce_sum(l_score * l_mask)
        dice_loss = 1 - 2 * intersection / (union + 1e-5)
        # smooth-L1-style loss on the geometry channels; the label has
        # channels + 1 channels, the last one being a per-pixel weight.
        channels = 8
        l_geo_split = fluid.layers.split(
            l_geo, num_or_sections=channels + 1, dim=1)
        f_geo_split = fluid.layers.split(f_geo, num_or_sections=channels, dim=1)
        smooth_l1 = 0
        for i in range(0, channels):
            geo_diff = l_geo_split[i] - f_geo_split[i]
            abs_geo_diff = fluid.layers.abs(geo_diff)
            # NOTE(review): the quadratic/linear switch compares |diff|
            # against l_score (a score map) rather than a 1.0 constant —
            # presumably intentional, confirm against the reference impl.
            smooth_l1_sign = fluid.layers.less_than(abs_geo_diff, l_score)
            smooth_l1_sign = fluid.layers.cast(smooth_l1_sign, dtype='float32')
            in_loss = abs_geo_diff * abs_geo_diff * smooth_l1_sign + \
                (abs_geo_diff - 0.5) * (1.0 - smooth_l1_sign)
            # weight by the extra label channel (split evenly over channels)
            # and restrict to text pixels
            out_loss = l_geo_split[-1] / channels * in_loss * l_score
            smooth_l1 += out_loss
        smooth_l1_loss = fluid.layers.reduce_mean(smooth_l1 * l_score)
        # down-weight the dice term relative to the geometry term
        dice_loss = dice_loss * 0.01
        total_loss = dice_loss + smooth_l1_loss
        losses = {'total_loss':total_loss, "dice_loss":dice_loss,\
                  "smooth_l1_loss":smooth_l1_loss}
        return losses
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
class SASTLoss(object):
    """
    SAST Loss function: dice loss on the text-center-line (TCL) score map
    plus weighted smooth-L1 losses on the border, TVO (vertex offset) and
    TCO (center offset) regression maps.
    """
    def __init__(self, params=None):
        # params is accepted for config-driven construction but unused here.
        super(SASTLoss, self).__init__()
    def __call__(self, predicts, labels):
        """
        tcl_pos: N x 128 x 3
        tcl_mask: N x 128 x 1
        tcl_label: N x X list or LoDTensor
        """
        f_score = predicts['f_score']
        f_border = predicts['f_border']
        f_tvo = predicts['f_tvo']
        f_tco = predicts['f_tco']
        l_score = labels['input_score']
        l_border = labels['input_border']
        l_mask = labels['input_mask']
        l_tvo = labels['input_tvo']
        l_tco = labels['input_tco']
        # score loss: dice loss on the TCL score map, inside the valid mask
        intersection = fluid.layers.reduce_sum(f_score * l_score * l_mask)
        union = fluid.layers.reduce_sum(f_score * l_mask) + fluid.layers.reduce_sum(l_score * l_mask)
        score_loss = 1.0 - 2 * intersection / (union + 1e-5)
        # border loss: smooth-L1 on the 4 border channels; the 5th label
        # channel is a per-pixel normalization weight
        l_border_split, l_border_norm = fluid.layers.split(l_border, num_or_sections=[4, 1], dim=1)
        f_border_split = f_border
        l_border_norm_split = fluid.layers.expand(x=l_border_norm, expand_times=[1, 4, 1, 1])
        l_border_score = fluid.layers.expand(x=l_score, expand_times=[1, 4, 1, 1])
        l_border_mask = fluid.layers.expand(x=l_mask, expand_times=[1, 4, 1, 1])
        border_diff = l_border_split - f_border_split
        abs_border_diff = fluid.layers.abs(border_diff)
        # sign selects the quadratic branch (|diff| < 1) of smooth-L1
        border_sign = abs_border_diff < 1.0
        border_sign = fluid.layers.cast(border_sign, dtype='float32')
        border_sign.stop_gradient = True
        border_in_loss = 0.5 * abs_border_diff * abs_border_diff * border_sign + \
                         (abs_border_diff - 0.5) * (1.0 - border_sign)
        border_out_loss = l_border_norm_split * border_in_loss
        border_loss = fluid.layers.reduce_sum(border_out_loss * l_border_score * l_border_mask) / \
                      (fluid.layers.reduce_sum(l_border_score * l_border_mask) + 1e-5)
        # tvo loss: same smooth-L1 scheme on the 8 vertex-offset channels,
        # with the 9th label channel as normalization weight
        l_tvo_split, l_tvo_norm = fluid.layers.split(l_tvo, num_or_sections=[8, 1], dim=1)
        f_tvo_split = f_tvo
        l_tvo_norm_split = fluid.layers.expand(x=l_tvo_norm, expand_times=[1, 8, 1, 1])
        l_tvo_score = fluid.layers.expand(x=l_score, expand_times=[1, 8, 1, 1])
        l_tvo_mask = fluid.layers.expand(x=l_mask, expand_times=[1, 8, 1, 1])
        #
        tvo_geo_diff = l_tvo_split - f_tvo_split
        abs_tvo_geo_diff = fluid.layers.abs(tvo_geo_diff)
        tvo_sign = abs_tvo_geo_diff < 1.0
        tvo_sign = fluid.layers.cast(tvo_sign, dtype='float32')
        tvo_sign.stop_gradient = True
        tvo_in_loss = 0.5 * abs_tvo_geo_diff * abs_tvo_geo_diff * tvo_sign + \
                      (abs_tvo_geo_diff - 0.5) * (1.0 - tvo_sign)
        tvo_out_loss = l_tvo_norm_split * tvo_in_loss
        tvo_loss = fluid.layers.reduce_sum(tvo_out_loss * l_tvo_score * l_tvo_mask) / \
                   (fluid.layers.reduce_sum(l_tvo_score * l_tvo_mask) + 1e-5)
        # tco loss: same smooth-L1 scheme on the 2 center-offset channels,
        # with the 3rd label channel as normalization weight
        l_tco_split, l_tco_norm = fluid.layers.split(l_tco, num_or_sections=[2, 1], dim=1)
        f_tco_split = f_tco
        l_tco_norm_split = fluid.layers.expand(x=l_tco_norm, expand_times=[1, 2, 1, 1])
        l_tco_score = fluid.layers.expand(x=l_score, expand_times=[1, 2, 1, 1])
        l_tco_mask = fluid.layers.expand(x=l_mask, expand_times=[1, 2, 1, 1])
        #
        tco_geo_diff = l_tco_split - f_tco_split
        abs_tco_geo_diff = fluid.layers.abs(tco_geo_diff)
        tco_sign = abs_tco_geo_diff < 1.0
        tco_sign = fluid.layers.cast(tco_sign, dtype='float32')
        tco_sign.stop_gradient = True
        tco_in_loss = 0.5 * abs_tco_geo_diff * abs_tco_geo_diff * tco_sign + \
                      (abs_tco_geo_diff - 0.5) * (1.0 - tco_sign)
        tco_out_loss = l_tco_norm_split * tco_in_loss
        tco_loss = fluid.layers.reduce_sum(tco_out_loss * l_tco_score * l_tco_mask) / \
                   (fluid.layers.reduce_sum(l_tco_score * l_tco_mask) + 1e-5)
        # total loss: fixed per-term weights
        tvo_lw, tco_lw = 1.5, 1.5
        score_lw, border_lw = 1.0, 1.0
        total_loss = score_loss * score_lw + border_loss * border_lw + \
                     tvo_loss * tvo_lw + tco_loss * tco_lw
        losses = {'total_loss':total_loss, "score_loss":score_loss,\
                  "border_loss":border_loss, 'tvo_loss':tvo_loss, 'tco_loss':tco_loss}
        return losses
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment