Commit f1506916 authored by sugon_cxj

first commit

parent 55c28ed5
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn, ParamAttr
import paddle.nn.functional as F
class ClsHead(nn.Layer):
"""
    Text direction (orientation) classification head.
    Args:
        in_channels (int): number of input feature channels
        class_dim (int): number of output classes
"""
def __init__(self, in_channels, class_dim, **kwargs):
super(ClsHead, self).__init__()
self.pool = nn.AdaptiveAvgPool2D(1)
stdv = 1.0 / math.sqrt(in_channels * 1.0)
self.fc = nn.Linear(
in_channels,
class_dim,
weight_attr=ParamAttr(
name="fc_0.w_0",
initializer=nn.initializer.Uniform(-stdv, stdv)),
bias_attr=ParamAttr(name="fc_0.b_0"), )
def forward(self, x, targets=None):
x = self.pool(x)
x = paddle.reshape(x, shape=[x.shape[0], x.shape[1]])
x = self.fc(x)
if not self.training:
x = F.softmax(x, axis=1)
return x
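# Usage sketch (hypothetical, not part of the original file): ClsHead pools the
# backbone feature map to a single vector, applies a linear classifier, and
# returns softmax probabilities only at inference time.
#   head = ClsHead(in_channels=200, class_dim=2)
#   feats = paddle.rand([8, 200, 6, 25])   # [N, in_channels, H, W]
#   logits = head(feats)                   # training: logits of shape [8, 2]
#                                          # eval: probabilities of shape [8, 2]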
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr
def get_bias_attr(k):
stdv = 1.0 / math.sqrt(k * 1.0)
initializer = paddle.nn.initializer.Uniform(-stdv, stdv)
bias_attr = ParamAttr(initializer=initializer)
return bias_attr
class Head(nn.Layer):
def __init__(self, in_channels, name_list, kernel_list=[3, 2, 2], **kwargs):
super(Head, self).__init__()
self.conv1 = nn.Conv2D(
in_channels=in_channels,
out_channels=in_channels // 4,
kernel_size=kernel_list[0],
padding=int(kernel_list[0] // 2),
weight_attr=ParamAttr(),
bias_attr=False)
self.conv_bn1 = nn.BatchNorm(
num_channels=in_channels // 4,
param_attr=ParamAttr(
initializer=paddle.nn.initializer.Constant(value=1.0)),
bias_attr=ParamAttr(
initializer=paddle.nn.initializer.Constant(value=1e-4)),
act='relu')
self.conv2 = nn.Conv2DTranspose(
in_channels=in_channels // 4,
out_channels=in_channels // 4,
kernel_size=kernel_list[1],
stride=2,
weight_attr=ParamAttr(
initializer=paddle.nn.initializer.KaimingUniform()),
bias_attr=get_bias_attr(in_channels // 4))
self.conv_bn2 = nn.BatchNorm(
num_channels=in_channels // 4,
param_attr=ParamAttr(
initializer=paddle.nn.initializer.Constant(value=1.0)),
bias_attr=ParamAttr(
initializer=paddle.nn.initializer.Constant(value=1e-4)),
act="relu")
self.conv3 = nn.Conv2DTranspose(
in_channels=in_channels // 4,
out_channels=1,
kernel_size=kernel_list[2],
stride=2,
weight_attr=ParamAttr(
initializer=paddle.nn.initializer.KaimingUniform()),
bias_attr=get_bias_attr(in_channels // 4), )
def forward(self, x):
x = self.conv1(x)
x = self.conv_bn1(x)
x = self.conv2(x)
x = self.conv_bn2(x)
x = self.conv3(x)
x = F.sigmoid(x)
return x
class DBHead(nn.Layer):
"""
Differentiable Binarization (DB) for text detection:
see https://arxiv.org/abs/1911.08947
    Args:
        in_channels (int): number of input feature channels
        k (int): amplification factor used in the differentiable binarization step function
"""
def __init__(self, in_channels, k=50, **kwargs):
super(DBHead, self).__init__()
self.k = k
binarize_name_list = [
'conv2d_56', 'batch_norm_47', 'conv2d_transpose_0', 'batch_norm_48',
'conv2d_transpose_1', 'binarize'
]
thresh_name_list = [
'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2', 'batch_norm_50',
'conv2d_transpose_3', 'thresh'
]
self.binarize = Head(in_channels, binarize_name_list, **kwargs)
self.thresh = Head(in_channels, thresh_name_list, **kwargs)
def step_function(self, x, y):
return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y)))
def forward(self, x, targets=None):
shrink_maps = self.binarize(x)
if not self.training:
return {'maps': shrink_maps}
threshold_maps = self.thresh(x)
binary_maps = self.step_function(shrink_maps, threshold_maps)
y = paddle.concat([shrink_maps, threshold_maps, binary_maps], axis=1)
return {'maps': y}
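# Note (added): step_function implements the differentiable binarization from
# the DB paper: 1 / (1 + exp(-k * (x - y))) == sigmoid(k * (x - y)), i.e. a
# soft, trainable threshold with amplification factor k (default 50).
#
# Usage sketch (hypothetical, assuming a neck output at 1/4 of the image
# resolution; each Head branch upsamples by 4x via two stride-2 deconvolutions):
#   head = DBHead(in_channels=256)
#   feats = paddle.rand([1, 256, 160, 160])
#   out = head(feats)
#   # training: out['maps'] -> [1, 3, 640, 640] (shrink, threshold, binary maps)
#   # eval:     out['maps'] -> [1, 1, 640, 640] (shrink map only)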
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
groups=1,
if_act=True,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self.if_act = if_act
self.act = act
self.conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups,
weight_attr=ParamAttr(name=name + '_weights'),
bias_attr=False)
self.bn = nn.BatchNorm(
num_channels=out_channels,
act=act,
param_attr=ParamAttr(name="bn_" + name + "_scale"),
bias_attr=ParamAttr(name="bn_" + name + "_offset"),
moving_mean_name="bn_" + name + "_mean",
moving_variance_name="bn_" + name + "_variance")
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
return x
class EASTHead(nn.Layer):
"""
"""
def __init__(self, in_channels, model_name, **kwargs):
super(EASTHead, self).__init__()
self.model_name = model_name
if self.model_name == "large":
num_outputs = [128, 64, 1, 8]
else:
num_outputs = [64, 32, 1, 8]
self.det_conv1 = ConvBNLayer(
in_channels=in_channels,
out_channels=num_outputs[0],
kernel_size=3,
stride=1,
padding=1,
if_act=True,
act='relu',
name="det_head1")
self.det_conv2 = ConvBNLayer(
in_channels=num_outputs[0],
out_channels=num_outputs[1],
kernel_size=3,
stride=1,
padding=1,
if_act=True,
act='relu',
name="det_head2")
self.score_conv = ConvBNLayer(
in_channels=num_outputs[1],
out_channels=num_outputs[2],
kernel_size=1,
stride=1,
padding=0,
if_act=False,
act=None,
name="f_score")
self.geo_conv = ConvBNLayer(
in_channels=num_outputs[1],
out_channels=num_outputs[3],
kernel_size=1,
stride=1,
padding=0,
if_act=False,
act=None,
name="f_geo")
def forward(self, x, targets=None):
f_det = self.det_conv1(x)
f_det = self.det_conv2(f_det)
f_score = self.score_conv(f_det)
f_score = F.sigmoid(f_score)
f_geo = self.geo_conv(f_det)
f_geo = (F.sigmoid(f_geo) - 0.5) * 2 * 800
pred = {'f_score': f_score, 'f_geo': f_geo}
return pred
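# Note (added): f_score is a per-pixel text probability in [0, 1]; f_geo holds
# 8 geometry channels rescaled from the sigmoid output into (-800, 800) pixels.
# For an input feature map of [N, in_channels, H, W] the output convs keep the
# spatial size, so f_score -> [N, 1, H, W] and f_geo -> [N, 8, H, W].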
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is adapted from:
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/dense_heads/fce_head.py
"""
from paddle import nn
from paddle import ParamAttr
import paddle.nn.functional as F
from paddle.nn.initializer import Normal
import paddle
from functools import partial
def multi_apply(func, *args, **kwargs):
pfunc = partial(func, **kwargs) if kwargs else func
map_results = map(pfunc, *args)
return tuple(map(list, zip(*map_results)))
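# Note (added): multi_apply maps `func` over the per-level inputs and
# transposes the per-level tuples into a tuple of lists, e.g. if func(x)
# returns (cls, reg), then multi_apply(func, [a, b]) returns
# ([cls_a, cls_b], [reg_a, reg_b]).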
class FCEHead(nn.Layer):
"""The class for implementing FCENet head.
FCENet(CVPR2021): Fourier Contour Embedding for Arbitrary-shaped Text
Detection.
[https://arxiv.org/abs/2104.10442]
Args:
in_channels (int): The number of input channels.
        fourier_degree (int): The maximum Fourier transform degree k.
"""
def __init__(self, in_channels, fourier_degree=5):
super().__init__()
assert isinstance(in_channels, int)
self.downsample_ratio = 1.0
self.in_channels = in_channels
self.fourier_degree = fourier_degree
self.out_channels_cls = 4
self.out_channels_reg = (2 * self.fourier_degree + 1) * 2
self.out_conv_cls = nn.Conv2D(
in_channels=self.in_channels,
out_channels=self.out_channels_cls,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(
name='cls_weights',
initializer=Normal(
mean=0., std=0.01)),
bias_attr=True)
self.out_conv_reg = nn.Conv2D(
in_channels=self.in_channels,
out_channels=self.out_channels_reg,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(
name='reg_weights',
initializer=Normal(
mean=0., std=0.01)),
bias_attr=True)
def forward(self, feats, targets=None):
cls_res, reg_res = multi_apply(self.forward_single, feats)
level_num = len(cls_res)
outs = {}
if not self.training:
for i in range(level_num):
tr_pred = F.softmax(cls_res[i][:, 0:2, :, :], axis=1)
tcl_pred = F.softmax(cls_res[i][:, 2:, :, :], axis=1)
outs['level_{}'.format(i)] = paddle.concat(
[tr_pred, tcl_pred, reg_res[i]], axis=1)
else:
preds = [[cls_res[i], reg_res[i]] for i in range(level_num)]
outs['levels'] = preds
return outs
def forward_single(self, x):
cls_predict = self.out_conv_cls(x)
reg_predict = self.out_conv_reg(x)
return cls_predict, reg_predict
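# Note (added): per pyramid level the two 3x3 convs predict
#   - out_channels_cls = 4 channels: 2 text-region (tr) logits and 2
#     text-center-line (tcl) logits, softmaxed pairwise at inference;
#   - out_channels_reg = (2 * fourier_degree + 1) * 2 channels: real and
#     imaginary parts of the 2k+1 Fourier contour coefficients
#     (22 channels for the default fourier_degree=5).
# Hypothetical usage sketch with three pyramid levels:
#   head = FCEHead(in_channels=256, fourier_degree=5)
#   feats = [paddle.rand([1, 256, s, s]) for s in (80, 40, 20)]
#   outs = head(feats)  # eval: outs['level_0'] .. outs['level_2'],
#                       # each of shape [1, 26, s, s] at that level's resolution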
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is adapted from:
https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py
"""
from paddle import nn
class PSEHead(nn.Layer):
def __init__(self, in_channels, hidden_dim=256, out_channels=7, **kwargs):
super(PSEHead, self).__init__()
self.conv1 = nn.Conv2D(
in_channels, hidden_dim, kernel_size=3, stride=1, padding=1)
self.bn1 = nn.BatchNorm2D(hidden_dim)
self.relu1 = nn.ReLU()
self.conv2 = nn.Conv2D(
hidden_dim, out_channels, kernel_size=1, stride=1, padding=0)
def forward(self, x, **kwargs):
out = self.conv1(x)
out = self.relu1(self.bn1(out))
out = self.conv2(out)
return {'maps': out}
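# Usage sketch (hypothetical, not part of the original file): PSEHead maps the
# fused FPN feature to `out_channels` progressively shrunk text-kernel maps,
# keeping the input spatial size.
#   head = PSEHead(in_channels=1024, hidden_dim=256, out_channels=7)
#   feats = paddle.rand([1, 1024, 160, 160])
#   out = head(feats)   # out['maps'] -> [1, 7, 160, 160]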
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
groups=1,
if_act=True,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self.if_act = if_act
self.act = act
self.conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=(kernel_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(name=name + '_weights'),
bias_attr=False)
self.bn = nn.BatchNorm(
num_channels=out_channels,
act=act,
param_attr=ParamAttr(name="bn_" + name + "_scale"),
bias_attr=ParamAttr(name="bn_" + name + "_offset"),
moving_mean_name="bn_" + name + "_mean",
moving_variance_name="bn_" + name + "_variance")
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
return x
class SAST_Header1(nn.Layer):
def __init__(self, in_channels, **kwargs):
super(SAST_Header1, self).__init__()
out_channels = [64, 64, 128]
self.score_conv = nn.Sequential(
ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_score1'),
ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_score2'),
ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_score3'),
ConvBNLayer(out_channels[2], 1, 3, 1, act=None, name='f_score4')
)
self.border_conv = nn.Sequential(
ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_border1'),
ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_border2'),
ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_border3'),
ConvBNLayer(out_channels[2], 4, 3, 1, act=None, name='f_border4')
)
def forward(self, x):
f_score = self.score_conv(x)
f_score = F.sigmoid(f_score)
f_border = self.border_conv(x)
return f_score, f_border
class SAST_Header2(nn.Layer):
def __init__(self, in_channels, **kwargs):
super(SAST_Header2, self).__init__()
out_channels = [64, 64, 128]
self.tvo_conv = nn.Sequential(
ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_tvo1'),
ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_tvo2'),
ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_tvo3'),
ConvBNLayer(out_channels[2], 8, 3, 1, act=None, name='f_tvo4')
)
self.tco_conv = nn.Sequential(
ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_tco1'),
ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_tco2'),
ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_tco3'),
ConvBNLayer(out_channels[2], 2, 3, 1, act=None, name='f_tco4')
)
def forward(self, x):
f_tvo = self.tvo_conv(x)
f_tco = self.tco_conv(x)
return f_tvo, f_tco
class SASTHead(nn.Layer):
"""
"""
def __init__(self, in_channels, **kwargs):
super(SASTHead, self).__init__()
self.head1 = SAST_Header1(in_channels)
self.head2 = SAST_Header2(in_channels)
def forward(self, x, targets=None):
f_score, f_border = self.head1(x)
f_tvo, f_tco = self.head2(x)
predicts = {}
predicts['f_score'] = f_score
predicts['f_border'] = f_border
predicts['f_tvo'] = f_tvo
predicts['f_tco'] = f_tco
return predicts
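# Note (added): hypothetical output shapes for an input feature map of
# [N, in_channels, H, W] (all branches keep the spatial size):
#   f_score  -> [N, 1, H, W]  sigmoid text score
#   f_border -> [N, 4, H, W]  border offset map
#   f_tvo    -> [N, 8, H, W]  text vertex offset map
#   f_tco    -> [N, 2, H, W]  text center offset map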
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
groups=1,
if_act=True,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self.if_act = if_act
self.act = act
self.conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups,
weight_attr=ParamAttr(name=name + '_weights'),
bias_attr=False)
self.bn = nn.BatchNorm(
num_channels=out_channels,
act=act,
param_attr=ParamAttr(name="bn_" + name + "_scale"),
bias_attr=ParamAttr(name="bn_" + name + "_offset"),
moving_mean_name="bn_" + name + "_mean",
moving_variance_name="bn_" + name + "_variance",
use_global_stats=False)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
return x
class PGHead(nn.Layer):
"""
"""
def __init__(self, in_channels, **kwargs):
super(PGHead, self).__init__()
self.conv_f_score1 = ConvBNLayer(
in_channels=in_channels,
out_channels=64,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_score{}".format(1))
self.conv_f_score2 = ConvBNLayer(
in_channels=64,
out_channels=64,
kernel_size=3,
stride=1,
padding=1,
act='relu',
name="conv_f_score{}".format(2))
self.conv_f_score3 = ConvBNLayer(
in_channels=64,
out_channels=128,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_score{}".format(3))
self.conv1 = nn.Conv2D(
in_channels=128,
out_channels=1,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(name="conv_f_score{}".format(4)),
bias_attr=False)
self.conv_f_boder1 = ConvBNLayer(
in_channels=in_channels,
out_channels=64,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_boder{}".format(1))
self.conv_f_boder2 = ConvBNLayer(
in_channels=64,
out_channels=64,
kernel_size=3,
stride=1,
padding=1,
act='relu',
name="conv_f_boder{}".format(2))
self.conv_f_boder3 = ConvBNLayer(
in_channels=64,
out_channels=128,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_boder{}".format(3))
self.conv2 = nn.Conv2D(
in_channels=128,
out_channels=4,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(name="conv_f_boder{}".format(4)),
bias_attr=False)
self.conv_f_char1 = ConvBNLayer(
in_channels=in_channels,
out_channels=128,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_char{}".format(1))
self.conv_f_char2 = ConvBNLayer(
in_channels=128,
out_channels=128,
kernel_size=3,
stride=1,
padding=1,
act='relu',
name="conv_f_char{}".format(2))
self.conv_f_char3 = ConvBNLayer(
in_channels=128,
out_channels=256,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_char{}".format(3))
self.conv_f_char4 = ConvBNLayer(
in_channels=256,
out_channels=256,
kernel_size=3,
stride=1,
padding=1,
act='relu',
name="conv_f_char{}".format(4))
self.conv_f_char5 = ConvBNLayer(
in_channels=256,
out_channels=256,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_char{}".format(5))
self.conv3 = nn.Conv2D(
in_channels=256,
out_channels=37,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(name="conv_f_char{}".format(6)),
bias_attr=False)
self.conv_f_direc1 = ConvBNLayer(
in_channels=in_channels,
out_channels=64,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_direc{}".format(1))
self.conv_f_direc2 = ConvBNLayer(
in_channels=64,
out_channels=64,
kernel_size=3,
stride=1,
padding=1,
act='relu',
name="conv_f_direc{}".format(2))
self.conv_f_direc3 = ConvBNLayer(
in_channels=64,
out_channels=128,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_direc{}".format(3))
self.conv4 = nn.Conv2D(
in_channels=128,
out_channels=2,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(name="conv_f_direc{}".format(4)),
bias_attr=False)
def forward(self, x, targets=None):
f_score = self.conv_f_score1(x)
f_score = self.conv_f_score2(f_score)
f_score = self.conv_f_score3(f_score)
f_score = self.conv1(f_score)
f_score = F.sigmoid(f_score)
# f_border
f_border = self.conv_f_boder1(x)
f_border = self.conv_f_boder2(f_border)
f_border = self.conv_f_boder3(f_border)
f_border = self.conv2(f_border)
f_char = self.conv_f_char1(x)
f_char = self.conv_f_char2(f_char)
f_char = self.conv_f_char3(f_char)
f_char = self.conv_f_char4(f_char)
f_char = self.conv_f_char5(f_char)
f_char = self.conv3(f_char)
f_direction = self.conv_f_direc1(x)
f_direction = self.conv_f_direc2(f_direction)
f_direction = self.conv_f_direc3(f_direction)
f_direction = self.conv4(f_direction)
predicts = {}
predicts['f_score'] = f_score
predicts['f_border'] = f_border
predicts['f_char'] = f_char
predicts['f_direction'] = f_direction
return predicts
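# Note (added): hypothetical output shapes for an input feature map of
# [N, in_channels, H, W] (all branches keep the spatial size):
#   f_score     -> [N, 1, H, W]   sigmoid text center-line score
#   f_border    -> [N, 4, H, W]   border offset map
#   f_char      -> [N, 37, H, W]  per-pixel character classification logits
#   f_direction -> [N, 2, H, W]   reading-direction offset map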
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Reference: https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/kie/heads/sdmgr_head.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr
class SDMGRHead(nn.Layer):
def __init__(self,
in_channels,
num_chars=92,
visual_dim=16,
fusion_dim=1024,
node_input=32,
node_embed=256,
edge_input=5,
edge_embed=256,
num_gnn=2,
num_classes=26,
bidirectional=False):
super().__init__()
self.fusion = Block([visual_dim, node_embed], node_embed, fusion_dim)
self.node_embed = nn.Embedding(num_chars, node_input, 0)
hidden = node_embed // 2 if bidirectional else node_embed
self.rnn = nn.LSTM(
input_size=node_input, hidden_size=hidden, num_layers=1)
self.edge_embed = nn.Linear(edge_input, edge_embed)
self.gnn_layers = nn.LayerList(
[GNNLayer(node_embed, edge_embed) for _ in range(num_gnn)])
self.node_cls = nn.Linear(node_embed, num_classes)
self.edge_cls = nn.Linear(edge_embed, 2)
def forward(self, input, targets):
relations, texts, x = input
node_nums, char_nums = [], []
for text in texts:
node_nums.append(text.shape[0])
char_nums.append(paddle.sum((text > -1).astype(int), axis=-1))
max_num = max([char_num.max() for char_num in char_nums])
all_nodes = paddle.concat([
paddle.concat(
[text, paddle.zeros(
(text.shape[0], max_num - text.shape[1]))], -1)
for text in texts
])
temp = paddle.clip(all_nodes, min=0).astype(int)
embed_nodes = self.node_embed(temp)
rnn_nodes, _ = self.rnn(embed_nodes)
b, h, w = rnn_nodes.shape
nodes = paddle.zeros([b, w])
all_nums = paddle.concat(char_nums)
valid = paddle.nonzero((all_nums > 0).astype(int))
temp_all_nums = (
paddle.gather(all_nums, valid) - 1).unsqueeze(-1).unsqueeze(-1)
temp_all_nums = paddle.expand(temp_all_nums, [
temp_all_nums.shape[0], temp_all_nums.shape[1], rnn_nodes.shape[-1]
])
temp_all_nodes = paddle.gather(rnn_nodes, valid)
N, C, A = temp_all_nodes.shape
one_hot = F.one_hot(
temp_all_nums[:, 0, :], num_classes=C).transpose([0, 2, 1])
one_hot = paddle.multiply(
temp_all_nodes, one_hot.astype("float32")).sum(axis=1, keepdim=True)
t = one_hot.expand([N, 1, A]).squeeze(1)
nodes = paddle.scatter(nodes, valid.squeeze(1), t)
if x is not None:
nodes = self.fusion([x, nodes])
all_edges = paddle.concat(
[rel.reshape([-1, rel.shape[-1]]) for rel in relations])
embed_edges = self.edge_embed(all_edges.astype('float32'))
embed_edges = F.normalize(embed_edges)
for gnn_layer in self.gnn_layers:
nodes, cat_nodes = gnn_layer(nodes, embed_edges, node_nums)
node_cls, edge_cls = self.node_cls(nodes), self.edge_cls(cat_nodes)
return node_cls, edge_cls
class GNNLayer(nn.Layer):
def __init__(self, node_dim=256, edge_dim=256):
super().__init__()
self.in_fc = nn.Linear(node_dim * 2 + edge_dim, node_dim)
self.coef_fc = nn.Linear(node_dim, 1)
self.out_fc = nn.Linear(node_dim, node_dim)
self.relu = nn.ReLU()
def forward(self, nodes, edges, nums):
start, cat_nodes = 0, []
for num in nums:
sample_nodes = nodes[start:start + num]
cat_nodes.append(
paddle.concat([
paddle.expand(sample_nodes.unsqueeze(1), [-1, num, -1]),
paddle.expand(sample_nodes.unsqueeze(0), [num, -1, -1])
], -1).reshape([num**2, -1]))
start += num
cat_nodes = paddle.concat([paddle.concat(cat_nodes), edges], -1)
cat_nodes = self.relu(self.in_fc(cat_nodes))
coefs = self.coef_fc(cat_nodes)
start, residuals = 0, []
for num in nums:
residual = F.softmax(
-paddle.eye(num).unsqueeze(-1) * 1e9 +
coefs[start:start + num**2].reshape([num, num, -1]), 1)
residuals.append((residual * cat_nodes[start:start + num**2]
.reshape([num, num, -1])).sum(1))
start += num**2
nodes += self.relu(self.out_fc(paddle.concat(residuals)))
return [nodes, cat_nodes]
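# Note (added): for a graph with `num` nodes, GNNLayer forms all num * num
# ordered node pairs, concatenates [node_i, node_j, edge_ij], scores every pair
# with coef_fc, masks the diagonal with -1e9 so a node does not attend to
# itself, and adds the softmax-weighted aggregation back to the nodes as a
# residual update.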
class Block(nn.Layer):
def __init__(self,
input_dims,
output_dim,
mm_dim=1600,
chunks=20,
rank=15,
shared=False,
dropout_input=0.,
dropout_pre_lin=0.,
dropout_output=0.,
pos_norm='before_cat'):
super().__init__()
self.rank = rank
self.dropout_input = dropout_input
self.dropout_pre_lin = dropout_pre_lin
self.dropout_output = dropout_output
assert (pos_norm in ['before_cat', 'after_cat'])
self.pos_norm = pos_norm
# Modules
self.linear0 = nn.Linear(input_dims[0], mm_dim)
self.linear1 = (self.linear0
if shared else nn.Linear(input_dims[1], mm_dim))
self.merge_linears0 = nn.LayerList()
self.merge_linears1 = nn.LayerList()
self.chunks = self.chunk_sizes(mm_dim, chunks)
for size in self.chunks:
ml0 = nn.Linear(size, size * rank)
self.merge_linears0.append(ml0)
ml1 = ml0 if shared else nn.Linear(size, size * rank)
self.merge_linears1.append(ml1)
self.linear_out = nn.Linear(mm_dim, output_dim)
def forward(self, x):
x0 = self.linear0(x[0])
x1 = self.linear1(x[1])
bs = x1.shape[0]
if self.dropout_input > 0:
x0 = F.dropout(x0, p=self.dropout_input, training=self.training)
x1 = F.dropout(x1, p=self.dropout_input, training=self.training)
x0_chunks = paddle.split(x0, self.chunks, -1)
x1_chunks = paddle.split(x1, self.chunks, -1)
zs = []
for x0_c, x1_c, m0, m1 in zip(x0_chunks, x1_chunks, self.merge_linears0,
self.merge_linears1):
m = m0(x0_c) * m1(x1_c) # bs x split_size*rank
m = m.reshape([bs, self.rank, -1])
z = paddle.sum(m, 1)
if self.pos_norm == 'before_cat':
z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))
z = F.normalize(z)
zs.append(z)
z = paddle.concat(zs, 1)
if self.pos_norm == 'after_cat':
z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))
z = F.normalize(z)
if self.dropout_pre_lin > 0:
z = F.dropout(z, p=self.dropout_pre_lin, training=self.training)
z = self.linear_out(z)
if self.dropout_output > 0:
z = F.dropout(z, p=self.dropout_output, training=self.training)
return z
def chunk_sizes(self, dim, chunks):
split_size = (dim + chunks - 1) // chunks
sizes_list = [split_size] * chunks
sizes_list[-1] = sizes_list[-1] - (sum(sizes_list) - dim)
return sizes_list
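# Note (added): Block is a chunked low-rank bilinear fusion of two inputs: both
# are projected to mm_dim, split into `chunks` segments, and each segment pair
# is fused by a rank-`rank` bilinear product (sum over the rank axis) before
# re-concatenation and the final linear_out projection.
# Worked example for chunk_sizes: chunk_sizes(1600, 20) returns [80] * 20,
# since split_size = (1600 + 19) // 20 = 80 and the sizes already sum to 1600.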
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle.nn import Linear
from paddle.nn.initializer import XavierUniform as xavier_uniform_
from paddle.nn.initializer import Constant as constant_
from paddle.nn.initializer import XavierNormal as xavier_normal_
zeros_ = constant_(value=0.)
ones_ = constant_(value=1.)
class MultiheadAttention(nn.Layer):
"""Allows the model to jointly attend to information
from different representation subspaces.
See reference: Attention Is All You Need
.. math::
\text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
\text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
Args:
embed_dim: total dimension of the model
num_heads: parallel attention layers, or heads
"""
def __init__(self,
embed_dim,
num_heads,
dropout=0.,
bias=True,
add_bias_kv=False,
add_zero_attn=False):
super(MultiheadAttention, self).__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim**-0.5
self.out_proj = Linear(embed_dim, embed_dim, bias_attr=bias)
self._reset_parameters()
self.conv1 = paddle.nn.Conv2D(
in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
self.conv2 = paddle.nn.Conv2D(
in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
self.conv3 = paddle.nn.Conv2D(
in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
def _reset_parameters(self):
xavier_uniform_(self.out_proj.weight)
def forward(self,
query,
key,
value,
key_padding_mask=None,
incremental_state=None,
attn_mask=None):
"""
Inputs of forward function
query: [target length, batch size, embed dim]
key: [sequence length, batch size, embed dim]
value: [sequence length, batch size, embed dim]
key_padding_mask: if True, mask padding based on batch size
            incremental_state: if provided, previous time steps are cached
need_weights: output attn_output_weights
static_kv: key and value are static
Outputs of forward function
attn_output: [target length, batch size, embed dim]
attn_output_weights: [batch size, target length, sequence length]
"""
q_shape = paddle.shape(query)
src_shape = paddle.shape(key)
q = self._in_proj_q(query)
k = self._in_proj_k(key)
v = self._in_proj_v(value)
q *= self.scaling
q = paddle.transpose(
paddle.reshape(
q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]),
[1, 2, 0, 3])
k = paddle.transpose(
paddle.reshape(
k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
[1, 2, 0, 3])
v = paddle.transpose(
paddle.reshape(
v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
[1, 2, 0, 3])
if key_padding_mask is not None:
assert key_padding_mask.shape[0] == q_shape[1]
assert key_padding_mask.shape[1] == src_shape[0]
attn_output_weights = paddle.matmul(q,
paddle.transpose(k, [0, 1, 3, 2]))
if attn_mask is not None:
attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0)
attn_output_weights += attn_mask
if key_padding_mask is not None:
attn_output_weights = paddle.reshape(
attn_output_weights,
[q_shape[1], self.num_heads, q_shape[0], src_shape[0]])
key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2)
key = paddle.cast(key, 'float32')
y = paddle.full(
shape=paddle.shape(key), dtype='float32', fill_value='-inf')
y = paddle.where(key == 0., key, y)
attn_output_weights += y
attn_output_weights = F.softmax(
attn_output_weights.astype('float32'),
axis=-1,
dtype=paddle.float32 if attn_output_weights.dtype == paddle.float16
else attn_output_weights.dtype)
attn_output_weights = F.dropout(
attn_output_weights, p=self.dropout, training=self.training)
attn_output = paddle.matmul(attn_output_weights, v)
attn_output = paddle.reshape(
paddle.transpose(attn_output, [2, 0, 1, 3]),
[q_shape[0], q_shape[1], self.embed_dim])
attn_output = self.out_proj(attn_output)
return attn_output
def _in_proj_q(self, query):
query = paddle.transpose(query, [1, 2, 0])
query = paddle.unsqueeze(query, axis=2)
res = self.conv1(query)
res = paddle.squeeze(res, axis=2)
res = paddle.transpose(res, [2, 0, 1])
return res
def _in_proj_k(self, key):
key = paddle.transpose(key, [1, 2, 0])
key = paddle.unsqueeze(key, axis=2)
res = self.conv2(key)
res = paddle.squeeze(res, axis=2)
res = paddle.transpose(res, [2, 0, 1])
return res
def _in_proj_v(self, value):
value = paddle.transpose(value, [1, 2, 0]) #(1, 2, 0)
value = paddle.unsqueeze(value, axis=2)
res = self.conv3(value)
res = paddle.squeeze(res, axis=2)
res = paddle.transpose(res, [2, 0, 1])
return res
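# Usage sketch (hypothetical, not part of the original file). The Q/K/V
# projections are 1x1 convolutions applied to a [N, E, 1, L] view of the input,
# so query/key/value follow the [length, batch, embed_dim] layout:
#   attn = MultiheadAttention(embed_dim=512, num_heads=8)
#   q = paddle.rand([26, 4, 512])    # [target_len, batch, embed_dim]
#   kv = paddle.rand([256, 4, 512])  # [source_len, batch, embed_dim]
#   out = attn(q, kv, kv)            # -> [26, 4, 512]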
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/attention_recognition_head.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import paddle
from paddle import nn
from paddle.nn import functional as F
class AsterHead(nn.Layer):
def __init__(self,
in_channels,
out_channels,
sDim,
attDim,
max_len_labels,
time_step=25,
beam_width=5,
**kwargs):
super(AsterHead, self).__init__()
self.num_classes = out_channels
self.in_planes = in_channels
self.sDim = sDim
self.attDim = attDim
self.max_len_labels = max_len_labels
self.decoder = AttentionRecognitionHead(in_channels, out_channels, sDim,
attDim, max_len_labels)
self.time_step = time_step
self.embeder = Embedding(self.time_step, in_channels)
self.beam_width = beam_width
self.eos = self.num_classes - 3
def forward(self, x, targets=None, embed=None):
return_dict = {}
embedding_vectors = self.embeder(x)
if self.training:
rec_targets, rec_lengths, _ = targets
rec_pred = self.decoder([x, rec_targets, rec_lengths],
embedding_vectors)
return_dict['rec_pred'] = rec_pred
return_dict['embedding_vectors'] = embedding_vectors
else:
rec_pred, rec_pred_scores = self.decoder.beam_search(
x, self.beam_width, self.eos, embedding_vectors)
return_dict['rec_pred'] = rec_pred
return_dict['rec_pred_scores'] = rec_pred_scores
return_dict['embedding_vectors'] = embedding_vectors
return return_dict
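# Note (added): during training the decoder is driven by teacher forcing with
# the ground-truth targets (rec_targets, rec_lengths); at inference it runs
# beam search of width `beam_width`, handling the <EOS> index
# (num_classes - 3). The `embed` vector from Embedding seeds the decoder's
# initial GRU state via DecoderUnit.get_initial_state.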
class Embedding(nn.Layer):
def __init__(self, in_timestep, in_planes, mid_dim=4096, embed_dim=300):
super(Embedding, self).__init__()
self.in_timestep = in_timestep
self.in_planes = in_planes
self.embed_dim = embed_dim
self.mid_dim = mid_dim
self.eEmbed = nn.Linear(
in_timestep * in_planes,
            self.embed_dim)  # embed the encoder output into a word-embedding-like vector
def forward(self, x):
x = paddle.reshape(x, [paddle.shape(x)[0], -1])
x = self.eEmbed(x)
return x
class AttentionRecognitionHead(nn.Layer):
"""
input: [b x 16 x 64 x in_planes]
output: probability sequence: [b x T x num_classes]
"""
def __init__(self, in_channels, out_channels, sDim, attDim, max_len_labels):
super(AttentionRecognitionHead, self).__init__()
        self.num_classes = out_channels  # number of output classes, including <EOS>
self.in_planes = in_channels
self.sDim = sDim
self.attDim = attDim
self.max_len_labels = max_len_labels
self.decoder = DecoderUnit(
sDim=sDim, xDim=in_channels, yDim=self.num_classes, attDim=attDim)
def forward(self, x, embed):
x, targets, lengths = x
batch_size = paddle.shape(x)[0]
# Decoder
state = self.decoder.get_initial_state(embed)
outputs = []
for i in range(max(lengths)):
if i == 0:
y_prev = paddle.full(
shape=[batch_size], fill_value=self.num_classes)
else:
y_prev = targets[:, i - 1]
output, state = self.decoder(x, state, y_prev)
outputs.append(output)
outputs = paddle.concat([_.unsqueeze(1) for _ in outputs], 1)
return outputs
# inference stage.
def sample(self, x):
x, _, _ = x
        batch_size = x.shape[0]
# Decoder
state = paddle.zeros([1, batch_size, self.sDim])
predicted_ids, predicted_scores = [], []
for i in range(self.max_len_labels):
if i == 0:
y_prev = paddle.full(
shape=[batch_size], fill_value=self.num_classes)
else:
y_prev = predicted
output, state = self.decoder(x, state, y_prev)
output = F.softmax(output, axis=1)
            score = paddle.max(output, axis=1)
            predicted = paddle.argmax(output, axis=1)
predicted_ids.append(predicted.unsqueeze(1))
predicted_scores.append(score.unsqueeze(1))
        predicted_ids = paddle.concat(predicted_ids, 1)
        predicted_scores = paddle.concat(predicted_scores, 1)
# return predicted_ids.squeeze(), predicted_scores.squeeze()
return predicted_ids, predicted_scores
def beam_search(self, x, beam_width, eos, embed):
def _inflate(tensor, times, dim):
repeat_dims = [1] * tensor.dim()
repeat_dims[dim] = times
output = paddle.tile(tensor, repeat_dims)
return output
# https://github.com/IBM/pytorch-seq2seq/blob/fede87655ddce6c94b38886089e05321dc9802af/seq2seq/models/TopKDecoder.py
batch_size, l, d = x.shape
x = paddle.tile(
paddle.transpose(
x.unsqueeze(1), perm=[1, 0, 2, 3]), [beam_width, 1, 1, 1])
inflated_encoder_feats = paddle.reshape(
paddle.transpose(
x, perm=[1, 0, 2, 3]), [-1, l, d])
# Initialize the decoder
state = self.decoder.get_initial_state(embed, tile_times=beam_width)
pos_index = paddle.reshape(
paddle.arange(batch_size) * beam_width, shape=[-1, 1])
# Initialize the scores
sequence_scores = paddle.full(
shape=[batch_size * beam_width, 1], fill_value=-float('Inf'))
index = [i * beam_width for i in range(0, batch_size)]
sequence_scores[index] = 0.0
# Initialize the input vector
y_prev = paddle.full(
shape=[batch_size * beam_width], fill_value=self.num_classes)
# Store decisions for backtracking
stored_scores = list()
stored_predecessors = list()
stored_emitted_symbols = list()
for i in range(self.max_len_labels):
output, state = self.decoder(inflated_encoder_feats, state, y_prev)
state = paddle.unsqueeze(state, axis=0)
log_softmax_output = paddle.nn.functional.log_softmax(
output, axis=1)
sequence_scores = _inflate(sequence_scores, self.num_classes, 1)
sequence_scores += log_softmax_output
scores, candidates = paddle.topk(
paddle.reshape(sequence_scores, [batch_size, -1]),
beam_width,
axis=1)
# Reshape input = (bk, 1) and sequence_scores = (bk, 1)
y_prev = paddle.reshape(
candidates % self.num_classes, shape=[batch_size * beam_width])
sequence_scores = paddle.reshape(
scores, shape=[batch_size * beam_width, 1])
# Update fields for next timestep
pos_index = paddle.expand_as(pos_index, candidates)
predecessors = paddle.cast(
candidates / self.num_classes + pos_index, dtype='int64')
predecessors = paddle.reshape(
predecessors, shape=[batch_size * beam_width, 1])
state = paddle.index_select(
state, index=predecessors.squeeze(), axis=1)
            # Update sequence scores and erase scores for the <eos> symbol so that they aren't expanded
stored_scores.append(sequence_scores.clone())
y_prev = paddle.reshape(y_prev, shape=[-1, 1])
eos_prev = paddle.full_like(y_prev, fill_value=eos)
mask = eos_prev == y_prev
mask = paddle.nonzero(mask)
if mask.dim() > 0:
sequence_scores = sequence_scores.numpy()
mask = mask.numpy()
sequence_scores[mask] = -float('inf')
sequence_scores = paddle.to_tensor(sequence_scores)
# Cache results for backtracking
stored_predecessors.append(predecessors)
y_prev = paddle.squeeze(y_prev)
stored_emitted_symbols.append(y_prev)
# Do backtracking to return the optimal values
        #====== backtrack ======#
# Initialize return variables given different types
p = list()
l = [[self.max_len_labels] * beam_width for _ in range(batch_size)
] # Placeholder for lengths of top-k sequences
        # the last-step outputs of the beams are not sorted,
        # so they are sorted here
sorted_score, sorted_idx = paddle.topk(
paddle.reshape(
stored_scores[-1], shape=[batch_size, beam_width]),
beam_width)
# initialize the sequence scores with the sorted last step beam scores
s = sorted_score.clone()
batch_eos_found = [0] * batch_size # the number of EOS found
# in the backward loop below for each batch
t = self.max_len_labels - 1
# initialize the back pointer with the sorted order of the last step beams.
# add pos_index for indexing variable with b*k as the first dimension.
t_predecessors = paddle.reshape(
sorted_idx + pos_index.expand_as(sorted_idx),
shape=[batch_size * beam_width])
while t >= 0:
# Re-order the variables with the back pointer
current_symbol = paddle.index_select(
stored_emitted_symbols[t], index=t_predecessors, axis=0)
t_predecessors = paddle.index_select(
stored_predecessors[t].squeeze(), index=t_predecessors, axis=0)
eos_indices = stored_emitted_symbols[t] == eos
eos_indices = paddle.nonzero(eos_indices)
if eos_indices.dim() > 0:
for i in range(eos_indices.shape[0] - 1, -1, -1):
# Indices of the EOS symbol for both variables
# with b*k as the first dimension, and b, k for
# the first two dimensions
idx = eos_indices[i]
b_idx = int(idx[0] / beam_width)
# The indices of the replacing position
# according to the replacement strategy noted above
res_k_idx = beam_width - (batch_eos_found[b_idx] %
beam_width) - 1
batch_eos_found[b_idx] += 1
res_idx = b_idx * beam_width + res_k_idx
# Replace the old information in return variables
# with the new ended sequence information
t_predecessors[res_idx] = stored_predecessors[t][idx[0]]
current_symbol[res_idx] = stored_emitted_symbols[t][idx[0]]
s[b_idx, res_k_idx] = stored_scores[t][idx[0], 0]
l[b_idx][res_k_idx] = t + 1
# record the back tracked results
p.append(current_symbol)
t -= 1
# Sort and re-order again as the added ended sequences may change
# the order (very unlikely)
s, re_sorted_idx = s.topk(beam_width)
for b_idx in range(batch_size):
l[b_idx] = [
l[b_idx][k_idx.item()] for k_idx in re_sorted_idx[b_idx, :]
]
re_sorted_idx = paddle.reshape(
re_sorted_idx + pos_index.expand_as(re_sorted_idx),
[batch_size * beam_width])
# Reverse the sequences and re-order at the same time
# It is reversed because the backtracking happens in reverse time order
p = [
paddle.reshape(
paddle.index_select(step, re_sorted_idx, 0),
shape=[batch_size, beam_width, -1]) for step in reversed(p)
]
p = paddle.concat(p, -1)[:, 0, :]
return p, paddle.ones_like(p)
class AttentionUnit(nn.Layer):
def __init__(self, sDim, xDim, attDim):
super(AttentionUnit, self).__init__()
self.sDim = sDim
self.xDim = xDim
self.attDim = attDim
self.sEmbed = nn.Linear(sDim, attDim)
self.xEmbed = nn.Linear(xDim, attDim)
self.wEmbed = nn.Linear(attDim, 1)
def forward(self, x, sPrev):
batch_size, T, _ = x.shape # [b x T x xDim]
x = paddle.reshape(x, [-1, self.xDim]) # [(b x T) x xDim]
xProj = self.xEmbed(x) # [(b x T) x attDim]
xProj = paddle.reshape(xProj, [batch_size, T, -1]) # [b x T x attDim]
sPrev = sPrev.squeeze(0)
sProj = self.sEmbed(sPrev) # [b x attDim]
sProj = paddle.unsqueeze(sProj, 1) # [b x 1 x attDim]
sProj = paddle.expand(sProj,
[batch_size, T, self.attDim]) # [b x T x attDim]
sumTanh = paddle.tanh(sProj + xProj)
sumTanh = paddle.reshape(sumTanh, [-1, self.attDim])
vProj = self.wEmbed(sumTanh) # [(b x T) x 1]
vProj = paddle.reshape(vProj, [batch_size, T])
alpha = F.softmax(
vProj, axis=1) # attention weights for each sample in the minibatch
return alpha
class DecoderUnit(nn.Layer):
def __init__(self, sDim, xDim, yDim, attDim):
super(DecoderUnit, self).__init__()
self.sDim = sDim
self.xDim = xDim
self.yDim = yDim
self.attDim = attDim
self.emdDim = attDim
self.attention_unit = AttentionUnit(sDim, xDim, attDim)
self.tgt_embedding = nn.Embedding(
yDim + 1, self.emdDim, weight_attr=nn.initializer.Normal(
std=0.01)) # the last is used for <BOS>
self.gru = nn.GRUCell(input_size=xDim + self.emdDim, hidden_size=sDim)
self.fc = nn.Linear(
sDim,
yDim,
weight_attr=nn.initializer.Normal(std=0.01),
bias_attr=nn.initializer.Constant(value=0))
self.embed_fc = nn.Linear(300, self.sDim)
def get_initial_state(self, embed, tile_times=1):
assert embed.shape[1] == 300
state = self.embed_fc(embed) # N * sDim
if tile_times != 1:
state = state.unsqueeze(1)
trans_state = paddle.transpose(state, perm=[1, 0, 2])
state = paddle.tile(trans_state, repeat_times=[tile_times, 1, 1])
trans_state = paddle.transpose(state, perm=[1, 0, 2])
state = paddle.reshape(trans_state, shape=[-1, self.sDim])
state = state.unsqueeze(0) # 1 * N * sDim
return state
def forward(self, x, sPrev, yPrev):
        # x: feature sequence from the image encoder.
batch_size, T, _ = x.shape
alpha = self.attention_unit(x, sPrev)
context = paddle.squeeze(paddle.matmul(alpha.unsqueeze(1), x), axis=1)
yPrev = paddle.cast(yPrev, dtype="int64")
yProj = self.tgt_embedding(yPrev)
concat_context = paddle.concat([yProj, context], 1)
concat_context = paddle.squeeze(concat_context, 1)
sPrev = paddle.squeeze(sPrev, 0)
output, state = self.gru(concat_context, sPrev)
output = paddle.squeeze(output, axis=1)
output = self.fc(output)
return output, state