Commit f1506916 authored by sugon_cxj

first commit

parent 55c28ed5
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn, ParamAttr
import paddle.nn.functional as F
class ClsHead(nn.Layer):
"""
    Text direction (orientation) classification head.
    Args:
        in_channels (int): number of input feature channels
        class_dim (int): number of output classes
"""
def __init__(self, in_channels, class_dim, **kwargs):
super(ClsHead, self).__init__()
self.pool = nn.AdaptiveAvgPool2D(1)
stdv = 1.0 / math.sqrt(in_channels * 1.0)
self.fc = nn.Linear(
in_channels,
class_dim,
weight_attr=ParamAttr(
name="fc_0.w_0",
initializer=nn.initializer.Uniform(-stdv, stdv)),
bias_attr=ParamAttr(name="fc_0.b_0"), )
def forward(self, x, targets=None):
x = self.pool(x)
x = paddle.reshape(x, shape=[x.shape[0], x.shape[1]])
x = self.fc(x)
if not self.training:
x = F.softmax(x, axis=1)
return x
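# Usage sketch (hypothetical, not part of the original file): ClsHead pools the
# backbone feature map to a single vector, applies a linear classifier, and
# returns softmax probabilities only at inference time.
#   head = ClsHead(in_channels=200, class_dim=2)
#   feats = paddle.rand([8, 200, 6, 25])   # [N, in_channels, H, W]
#   logits = head(feats)                   # training: logits of shape [8, 2]
#                                          # eval: probabilities of shape [8, 2]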
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr
def get_bias_attr(k):
stdv = 1.0 / math.sqrt(k * 1.0)
initializer = paddle.nn.initializer.Uniform(-stdv, stdv)
bias_attr = ParamAttr(initializer=initializer)
return bias_attr
class Head(nn.Layer):
def __init__(self, in_channels, name_list, kernel_list=[3, 2, 2], **kwargs):
super(Head, self).__init__()
self.conv1 = nn.Conv2D(
in_channels=in_channels,
out_channels=in_channels // 4,
kernel_size=kernel_list[0],
padding=int(kernel_list[0] // 2),
weight_attr=ParamAttr(),
bias_attr=False)
self.conv_bn1 = nn.BatchNorm(
num_channels=in_channels // 4,
param_attr=ParamAttr(
initializer=paddle.nn.initializer.Constant(value=1.0)),
bias_attr=ParamAttr(
initializer=paddle.nn.initializer.Constant(value=1e-4)),
act='relu')
self.conv2 = nn.Conv2DTranspose(
in_channels=in_channels // 4,
out_channels=in_channels // 4,
kernel_size=kernel_list[1],
stride=2,
weight_attr=ParamAttr(
initializer=paddle.nn.initializer.KaimingUniform()),
bias_attr=get_bias_attr(in_channels // 4))
self.conv_bn2 = nn.BatchNorm(
num_channels=in_channels // 4,
param_attr=ParamAttr(
initializer=paddle.nn.initializer.Constant(value=1.0)),
bias_attr=ParamAttr(
initializer=paddle.nn.initializer.Constant(value=1e-4)),
act="relu")
self.conv3 = nn.Conv2DTranspose(
in_channels=in_channels // 4,
out_channels=1,
kernel_size=kernel_list[2],
stride=2,
weight_attr=ParamAttr(
initializer=paddle.nn.initializer.KaimingUniform()),
bias_attr=get_bias_attr(in_channels // 4), )
def forward(self, x):
x = self.conv1(x)
x = self.conv_bn1(x)
x = self.conv2(x)
x = self.conv_bn2(x)
x = self.conv3(x)
x = F.sigmoid(x)
return x
class DBHead(nn.Layer):
"""
Differentiable Binarization (DB) for text detection:
see https://arxiv.org/abs/1911.08947
    Args:
        in_channels (int): number of input feature channels
        k (int): amplification factor used in the differentiable binarization step function
"""
def __init__(self, in_channels, k=50, **kwargs):
super(DBHead, self).__init__()
self.k = k
binarize_name_list = [
'conv2d_56', 'batch_norm_47', 'conv2d_transpose_0', 'batch_norm_48',
'conv2d_transpose_1', 'binarize'
]
thresh_name_list = [
'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2', 'batch_norm_50',
'conv2d_transpose_3', 'thresh'
]
self.binarize = Head(in_channels, binarize_name_list, **kwargs)
self.thresh = Head(in_channels, thresh_name_list, **kwargs)
def step_function(self, x, y):
return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y)))
def forward(self, x, targets=None):
shrink_maps = self.binarize(x)
if not self.training:
return {'maps': shrink_maps}
threshold_maps = self.thresh(x)
binary_maps = self.step_function(shrink_maps, threshold_maps)
y = paddle.concat([shrink_maps, threshold_maps, binary_maps], axis=1)
return {'maps': y}
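# Note (added): step_function implements the differentiable binarization from
# the DB paper: 1 / (1 + exp(-k * (x - y))) == sigmoid(k * (x - y)), i.e. a
# soft, trainable threshold with amplification factor k (default 50).
#
# Usage sketch (hypothetical, assuming a neck output at 1/4 of the image
# resolution; each Head branch upsamples by 4x via two stride-2 deconvolutions):
#   head = DBHead(in_channels=256)
#   feats = paddle.rand([1, 256, 160, 160])
#   out = head(feats)
#   # training: out['maps'] -> [1, 3, 640, 640] (shrink, threshold, binary maps)
#   # eval:     out['maps'] -> [1, 1, 640, 640] (shrink map only)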
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
groups=1,
if_act=True,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self.if_act = if_act
self.act = act
self.conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups,
weight_attr=ParamAttr(name=name + '_weights'),
bias_attr=False)
self.bn = nn.BatchNorm(
num_channels=out_channels,
act=act,
param_attr=ParamAttr(name="bn_" + name + "_scale"),
bias_attr=ParamAttr(name="bn_" + name + "_offset"),
moving_mean_name="bn_" + name + "_mean",
moving_variance_name="bn_" + name + "_variance")
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
return x
class EASTHead(nn.Layer):
"""
"""
def __init__(self, in_channels, model_name, **kwargs):
super(EASTHead, self).__init__()
self.model_name = model_name
if self.model_name == "large":
num_outputs = [128, 64, 1, 8]
else:
num_outputs = [64, 32, 1, 8]
self.det_conv1 = ConvBNLayer(
in_channels=in_channels,
out_channels=num_outputs[0],
kernel_size=3,
stride=1,
padding=1,
if_act=True,
act='relu',
name="det_head1")
self.det_conv2 = ConvBNLayer(
in_channels=num_outputs[0],
out_channels=num_outputs[1],
kernel_size=3,
stride=1,
padding=1,
if_act=True,
act='relu',
name="det_head2")
self.score_conv = ConvBNLayer(
in_channels=num_outputs[1],
out_channels=num_outputs[2],
kernel_size=1,
stride=1,
padding=0,
if_act=False,
act=None,
name="f_score")
self.geo_conv = ConvBNLayer(
in_channels=num_outputs[1],
out_channels=num_outputs[3],
kernel_size=1,
stride=1,
padding=0,
if_act=False,
act=None,
name="f_geo")
def forward(self, x, targets=None):
f_det = self.det_conv1(x)
f_det = self.det_conv2(f_det)
f_score = self.score_conv(f_det)
f_score = F.sigmoid(f_score)
f_geo = self.geo_conv(f_det)
f_geo = (F.sigmoid(f_geo) - 0.5) * 2 * 800
pred = {'f_score': f_score, 'f_geo': f_geo}
return pred
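# Note (added): f_score is a per-pixel text probability in [0, 1]; f_geo holds
# 8 geometry channels rescaled from the sigmoid output into (-800, 800) pixels.
# For an input feature map of [N, in_channels, H, W] the output convs keep the
# spatial size, so f_score -> [N, 1, H, W] and f_geo -> [N, 8, H, W].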
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is adapted from:
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/dense_heads/fce_head.py
"""
from paddle import nn
from paddle import ParamAttr
import paddle.nn.functional as F
from paddle.nn.initializer import Normal
import paddle
from functools import partial
def multi_apply(func, *args, **kwargs):
pfunc = partial(func, **kwargs) if kwargs else func
map_results = map(pfunc, *args)
return tuple(map(list, zip(*map_results)))
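# Note (added): multi_apply maps `func` over the per-level inputs and
# transposes the per-level tuples into a tuple of lists, e.g. if func(x)
# returns (cls, reg), then multi_apply(func, [a, b]) returns
# ([cls_a, cls_b], [reg_a, reg_b]).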
class FCEHead(nn.Layer):
"""The class for implementing FCENet head.
FCENet(CVPR2021): Fourier Contour Embedding for Arbitrary-shaped Text
Detection.
[https://arxiv.org/abs/2104.10442]
Args:
in_channels (int): The number of input channels.
        fourier_degree (int): The maximum Fourier transform degree k.
"""
def __init__(self, in_channels, fourier_degree=5):
super().__init__()
assert isinstance(in_channels, int)
self.downsample_ratio = 1.0
self.in_channels = in_channels
self.fourier_degree = fourier_degree
self.out_channels_cls = 4
self.out_channels_reg = (2 * self.fourier_degree + 1) * 2
self.out_conv_cls = nn.Conv2D(
in_channels=self.in_channels,
out_channels=self.out_channels_cls,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(
name='cls_weights',
initializer=Normal(
mean=0., std=0.01)),
bias_attr=True)
self.out_conv_reg = nn.Conv2D(
in_channels=self.in_channels,
out_channels=self.out_channels_reg,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(
name='reg_weights',
initializer=Normal(
mean=0., std=0.01)),
bias_attr=True)
def forward(self, feats, targets=None):
cls_res, reg_res = multi_apply(self.forward_single, feats)
level_num = len(cls_res)
outs = {}
if not self.training:
for i in range(level_num):
tr_pred = F.softmax(cls_res[i][:, 0:2, :, :], axis=1)
tcl_pred = F.softmax(cls_res[i][:, 2:, :, :], axis=1)
outs['level_{}'.format(i)] = paddle.concat(
[tr_pred, tcl_pred, reg_res[i]], axis=1)
else:
preds = [[cls_res[i], reg_res[i]] for i in range(level_num)]
outs['levels'] = preds
return outs
def forward_single(self, x):
cls_predict = self.out_conv_cls(x)
reg_predict = self.out_conv_reg(x)
return cls_predict, reg_predict
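# Note (added): per pyramid level the two 3x3 convs predict
#   - out_channels_cls = 4 channels: 2 text-region (tr) logits and 2
#     text-center-line (tcl) logits, softmaxed pairwise at inference;
#   - out_channels_reg = (2 * fourier_degree + 1) * 2 channels: real and
#     imaginary parts of the 2k+1 Fourier contour coefficients
#     (22 channels for the default fourier_degree=5).
# Hypothetical usage sketch with three pyramid levels:
#   head = FCEHead(in_channels=256, fourier_degree=5)
#   feats = [paddle.rand([1, 256, s, s]) for s in (80, 40, 20)]
#   outs = head(feats)  # eval: outs['level_0'] .. outs['level_2'],
#                       # each of shape [1, 26, s, s] at that level's resolution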
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is adapted from:
https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py
"""
from paddle import nn
class PSEHead(nn.Layer):
def __init__(self, in_channels, hidden_dim=256, out_channels=7, **kwargs):
super(PSEHead, self).__init__()
self.conv1 = nn.Conv2D(
in_channels, hidden_dim, kernel_size=3, stride=1, padding=1)
self.bn1 = nn.BatchNorm2D(hidden_dim)
self.relu1 = nn.ReLU()
self.conv2 = nn.Conv2D(
hidden_dim, out_channels, kernel_size=1, stride=1, padding=0)
def forward(self, x, **kwargs):
out = self.conv1(x)
out = self.relu1(self.bn1(out))
out = self.conv2(out)
return {'maps': out}
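# Usage sketch (hypothetical, not part of the original file): PSEHead maps the
# fused FPN feature to `out_channels` progressively shrunk text-kernel maps,
# keeping the input spatial size.
#   head = PSEHead(in_channels=1024, hidden_dim=256, out_channels=7)
#   feats = paddle.rand([1, 1024, 160, 160])
#   out = head(feats)   # out['maps'] -> [1, 7, 160, 160]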
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
groups=1,
if_act=True,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self.if_act = if_act
self.act = act
self.conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=(kernel_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(name=name + '_weights'),
bias_attr=False)
self.bn = nn.BatchNorm(
num_channels=out_channels,
act=act,
param_attr=ParamAttr(name="bn_" + name + "_scale"),
bias_attr=ParamAttr(name="bn_" + name + "_offset"),
moving_mean_name="bn_" + name + "_mean",
moving_variance_name="bn_" + name + "_variance")
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
return x
class SAST_Header1(nn.Layer):
def __init__(self, in_channels, **kwargs):
super(SAST_Header1, self).__init__()
out_channels = [64, 64, 128]
self.score_conv = nn.Sequential(
ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_score1'),
ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_score2'),
ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_score3'),
ConvBNLayer(out_channels[2], 1, 3, 1, act=None, name='f_score4')
)
self.border_conv = nn.Sequential(
ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_border1'),
ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_border2'),
ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_border3'),
ConvBNLayer(out_channels[2], 4, 3, 1, act=None, name='f_border4')
)
def forward(self, x):
f_score = self.score_conv(x)
f_score = F.sigmoid(f_score)
f_border = self.border_conv(x)
return f_score, f_border
class SAST_Header2(nn.Layer):
def __init__(self, in_channels, **kwargs):
super(SAST_Header2, self).__init__()
out_channels = [64, 64, 128]
self.tvo_conv = nn.Sequential(
ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_tvo1'),
ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_tvo2'),
ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_tvo3'),
ConvBNLayer(out_channels[2], 8, 3, 1, act=None, name='f_tvo4')
)
self.tco_conv = nn.Sequential(
ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_tco1'),
ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_tco2'),
ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_tco3'),
ConvBNLayer(out_channels[2], 2, 3, 1, act=None, name='f_tco4')
)
def forward(self, x):
f_tvo = self.tvo_conv(x)
f_tco = self.tco_conv(x)
return f_tvo, f_tco
class SASTHead(nn.Layer):
"""
"""
def __init__(self, in_channels, **kwargs):
super(SASTHead, self).__init__()
self.head1 = SAST_Header1(in_channels)
self.head2 = SAST_Header2(in_channels)
def forward(self, x, targets=None):
f_score, f_border = self.head1(x)
f_tvo, f_tco = self.head2(x)
predicts = {}
predicts['f_score'] = f_score
predicts['f_border'] = f_border
predicts['f_tvo'] = f_tvo
predicts['f_tco'] = f_tco
return predicts
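# Note (added): hypothetical output shapes for an input feature map of
# [N, in_channels, H, W] (all branches keep the spatial size):
#   f_score  -> [N, 1, H, W]  sigmoid text score
#   f_border -> [N, 4, H, W]  border offset map
#   f_tvo    -> [N, 8, H, W]  text vertex offset map
#   f_tco    -> [N, 2, H, W]  text center offset map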
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
groups=1,
if_act=True,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self.if_act = if_act
self.act = act
self.conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups,
weight_attr=ParamAttr(name=name + '_weights'),
bias_attr=False)
self.bn = nn.BatchNorm(
num_channels=out_channels,
act=act,
param_attr=ParamAttr(name="bn_" + name + "_scale"),
bias_attr=ParamAttr(name="bn_" + name + "_offset"),
moving_mean_name="bn_" + name + "_mean",
moving_variance_name="bn_" + name + "_variance",
use_global_stats=False)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
return x
class PGHead(nn.Layer):
"""
"""
def __init__(self, in_channels, **kwargs):
super(PGHead, self).__init__()
self.conv_f_score1 = ConvBNLayer(
in_channels=in_channels,
out_channels=64,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_score{}".format(1))
self.conv_f_score2 = ConvBNLayer(
in_channels=64,
out_channels=64,
kernel_size=3,
stride=1,
padding=1,
act='relu',
name="conv_f_score{}".format(2))
self.conv_f_score3 = ConvBNLayer(
in_channels=64,
out_channels=128,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_score{}".format(3))
self.conv1 = nn.Conv2D(
in_channels=128,
out_channels=1,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(name="conv_f_score{}".format(4)),
bias_attr=False)
self.conv_f_boder1 = ConvBNLayer(
in_channels=in_channels,
out_channels=64,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_boder{}".format(1))
self.conv_f_boder2 = ConvBNLayer(
in_channels=64,
out_channels=64,
kernel_size=3,
stride=1,
padding=1,
act='relu',
name="conv_f_boder{}".format(2))
self.conv_f_boder3 = ConvBNLayer(
in_channels=64,
out_channels=128,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_boder{}".format(3))
self.conv2 = nn.Conv2D(
in_channels=128,
out_channels=4,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(name="conv_f_boder{}".format(4)),
bias_attr=False)
self.conv_f_char1 = ConvBNLayer(
in_channels=in_channels,
out_channels=128,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_char{}".format(1))
self.conv_f_char2 = ConvBNLayer(
in_channels=128,
out_channels=128,
kernel_size=3,
stride=1,
padding=1,
act='relu',
name="conv_f_char{}".format(2))
self.conv_f_char3 = ConvBNLayer(
in_channels=128,
out_channels=256,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_char{}".format(3))
self.conv_f_char4 = ConvBNLayer(
in_channels=256,
out_channels=256,
kernel_size=3,
stride=1,
padding=1,
act='relu',
name="conv_f_char{}".format(4))
self.conv_f_char5 = ConvBNLayer(
in_channels=256,
out_channels=256,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_char{}".format(5))
self.conv3 = nn.Conv2D(
in_channels=256,
out_channels=37,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(name="conv_f_char{}".format(6)),
bias_attr=False)
self.conv_f_direc1 = ConvBNLayer(
in_channels=in_channels,
out_channels=64,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_direc{}".format(1))
self.conv_f_direc2 = ConvBNLayer(
in_channels=64,
out_channels=64,
kernel_size=3,
stride=1,
padding=1,
act='relu',
name="conv_f_direc{}".format(2))
self.conv_f_direc3 = ConvBNLayer(
in_channels=64,
out_channels=128,
kernel_size=1,
stride=1,
padding=0,
act='relu',
name="conv_f_direc{}".format(3))
self.conv4 = nn.Conv2D(
in_channels=128,
out_channels=2,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(name="conv_f_direc{}".format(4)),
bias_attr=False)
def forward(self, x, targets=None):
f_score = self.conv_f_score1(x)
f_score = self.conv_f_score2(f_score)
f_score = self.conv_f_score3(f_score)
f_score = self.conv1(f_score)
f_score = F.sigmoid(f_score)
# f_border
f_border = self.conv_f_boder1(x)
f_border = self.conv_f_boder2(f_border)
f_border = self.conv_f_boder3(f_border)
f_border = self.conv2(f_border)
f_char = self.conv_f_char1(x)
f_char = self.conv_f_char2(f_char)
f_char = self.conv_f_char3(f_char)
f_char = self.conv_f_char4(f_char)
f_char = self.conv_f_char5(f_char)
f_char = self.conv3(f_char)
f_direction = self.conv_f_direc1(x)
f_direction = self.conv_f_direc2(f_direction)
f_direction = self.conv_f_direc3(f_direction)
f_direction = self.conv4(f_direction)
predicts = {}
predicts['f_score'] = f_score
predicts['f_border'] = f_border
predicts['f_char'] = f_char
predicts['f_direction'] = f_direction
return predicts
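# Note (added): hypothetical output shapes for an input feature map of
# [N, in_channels, H, W] (all branches keep the spatial size):
#   f_score     -> [N, 1, H, W]   sigmoid text center-line score
#   f_border    -> [N, 4, H, W]   border offset map
#   f_char      -> [N, 37, H, W]  per-pixel character classification logits
#   f_direction -> [N, 2, H, W]   reading-direction offset map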
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Reference: https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/kie/heads/sdmgr_head.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr
class SDMGRHead(nn.Layer):
def __init__(self,
in_channels,
num_chars=92,
visual_dim=16,
fusion_dim=1024,
node_input=32,
node_embed=256,
edge_input=5,
edge_embed=256,
num_gnn=2,
num_classes=26,
bidirectional=False):
super().__init__()
self.fusion = Block([visual_dim, node_embed], node_embed, fusion_dim)
self.node_embed = nn.Embedding(num_chars, node_input, 0)
hidden = node_embed // 2 if bidirectional else node_embed
self.rnn = nn.LSTM(
input_size=node_input, hidden_size=hidden, num_layers=1)
self.edge_embed = nn.Linear(edge_input, edge_embed)
self.gnn_layers = nn.LayerList(
[GNNLayer(node_embed, edge_embed) for _ in range(num_gnn)])
self.node_cls = nn.Linear(node_embed, num_classes)
self.edge_cls = nn.Linear(edge_embed, 2)
def forward(self, input, targets):
relations, texts, x = input
node_nums, char_nums = [], []
for text in texts:
node_nums.append(text.shape[0])
char_nums.append(paddle.sum((text > -1).astype(int), axis=-1))
max_num = max([char_num.max() for char_num in char_nums])
all_nodes = paddle.concat([
paddle.concat(
[text, paddle.zeros(
(text.shape[0], max_num - text.shape[1]))], -1)
for text in texts
])
temp = paddle.clip(all_nodes, min=0).astype(int)
embed_nodes = self.node_embed(temp)
rnn_nodes, _ = self.rnn(embed_nodes)
b, h, w = rnn_nodes.shape
nodes = paddle.zeros([b, w])
all_nums = paddle.concat(char_nums)
valid = paddle.nonzero((all_nums > 0).astype(int))
temp_all_nums = (
paddle.gather(all_nums, valid) - 1).unsqueeze(-1).unsqueeze(-1)
temp_all_nums = paddle.expand(temp_all_nums, [
temp_all_nums.shape[0], temp_all_nums.shape[1], rnn_nodes.shape[-1]
])
temp_all_nodes = paddle.gather(rnn_nodes, valid)
N, C, A = temp_all_nodes.shape
one_hot = F.one_hot(
temp_all_nums[:, 0, :], num_classes=C).transpose([0, 2, 1])
one_hot = paddle.multiply(
temp_all_nodes, one_hot.astype("float32")).sum(axis=1, keepdim=True)
t = one_hot.expand([N, 1, A]).squeeze(1)
nodes = paddle.scatter(nodes, valid.squeeze(1), t)
if x is not None:
nodes = self.fusion([x, nodes])
all_edges = paddle.concat(
[rel.reshape([-1, rel.shape[-1]]) for rel in relations])
embed_edges = self.edge_embed(all_edges.astype('float32'))
embed_edges = F.normalize(embed_edges)
for gnn_layer in self.gnn_layers:
nodes, cat_nodes = gnn_layer(nodes, embed_edges, node_nums)
node_cls, edge_cls = self.node_cls(nodes), self.edge_cls(cat_nodes)
return node_cls, edge_cls
class GNNLayer(nn.Layer):
def __init__(self, node_dim=256, edge_dim=256):
super().__init__()
self.in_fc = nn.Linear(node_dim * 2 + edge_dim, node_dim)
self.coef_fc = nn.Linear(node_dim, 1)
self.out_fc = nn.Linear(node_dim, node_dim)
self.relu = nn.ReLU()
def forward(self, nodes, edges, nums):
start, cat_nodes = 0, []
for num in nums:
sample_nodes = nodes[start:start + num]
cat_nodes.append(
paddle.concat([
paddle.expand(sample_nodes.unsqueeze(1), [-1, num, -1]),
paddle.expand(sample_nodes.unsqueeze(0), [num, -1, -1])
], -1).reshape([num**2, -1]))
start += num
cat_nodes = paddle.concat([paddle.concat(cat_nodes), edges], -1)
cat_nodes = self.relu(self.in_fc(cat_nodes))
coefs = self.coef_fc(cat_nodes)
start, residuals = 0, []
for num in nums:
residual = F.softmax(
-paddle.eye(num).unsqueeze(-1) * 1e9 +
coefs[start:start + num**2].reshape([num, num, -1]), 1)
residuals.append((residual * cat_nodes[start:start + num**2]
.reshape([num, num, -1])).sum(1))
start += num**2
nodes += self.relu(self.out_fc(paddle.concat(residuals)))
return [nodes, cat_nodes]
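# Note (added): for a graph with `num` nodes, GNNLayer forms all num * num
# ordered node pairs, concatenates [node_i, node_j, edge_ij], scores every pair
# with coef_fc, masks the diagonal with -1e9 so a node does not attend to
# itself, and adds the softmax-weighted aggregation back to the nodes as a
# residual update.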
class Block(nn.Layer):
def __init__(self,
input_dims,
output_dim,
mm_dim=1600,
chunks=20,
rank=15,
shared=False,
dropout_input=0.,
dropout_pre_lin=0.,
dropout_output=0.,
pos_norm='before_cat'):
super().__init__()
self.rank = rank
self.dropout_input = dropout_input
self.dropout_pre_lin = dropout_pre_lin
self.dropout_output = dropout_output
assert (pos_norm in ['before_cat', 'after_cat'])
self.pos_norm = pos_norm
# Modules
self.linear0 = nn.Linear(input_dims[0], mm_dim)
self.linear1 = (self.linear0
if shared else nn.Linear(input_dims[1], mm_dim))
self.merge_linears0 = nn.LayerList()
self.merge_linears1 = nn.LayerList()
self.chunks = self.chunk_sizes(mm_dim, chunks)
for size in self.chunks:
ml0 = nn.Linear(size, size * rank)
self.merge_linears0.append(ml0)
ml1 = ml0 if shared else nn.Linear(size, size * rank)
self.merge_linears1.append(ml1)
self.linear_out = nn.Linear(mm_dim, output_dim)
def forward(self, x):
x0 = self.linear0(x[0])
x1 = self.linear1(x[1])
bs = x1.shape[0]
if self.dropout_input > 0:
x0 = F.dropout(x0, p=self.dropout_input, training=self.training)
x1 = F.dropout(x1, p=self.dropout_input, training=self.training)
x0_chunks = paddle.split(x0, self.chunks, -1)
x1_chunks = paddle.split(x1, self.chunks, -1)
zs = []
for x0_c, x1_c, m0, m1 in zip(x0_chunks, x1_chunks, self.merge_linears0,
self.merge_linears1):
m = m0(x0_c) * m1(x1_c) # bs x split_size*rank
m = m.reshape([bs, self.rank, -1])
z = paddle.sum(m, 1)
if self.pos_norm == 'before_cat':
z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))
z = F.normalize(z)
zs.append(z)
z = paddle.concat(zs, 1)
if self.pos_norm == 'after_cat':
z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))
z = F.normalize(z)
if self.dropout_pre_lin > 0:
z = F.dropout(z, p=self.dropout_pre_lin, training=self.training)
z = self.linear_out(z)
if self.dropout_output > 0:
z = F.dropout(z, p=self.dropout_output, training=self.training)
return z
def chunk_sizes(self, dim, chunks):
split_size = (dim + chunks - 1) // chunks
sizes_list = [split_size] * chunks
sizes_list[-1] = sizes_list[-1] - (sum(sizes_list) - dim)
return sizes_list
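# Note (added): Block is a chunked low-rank bilinear fusion of two inputs: both
# are projected to mm_dim, split into `chunks` segments, and each segment pair
# is fused by a rank-`rank` bilinear product (sum over the rank axis) before
# re-concatenation and the final linear_out projection.
# Worked example for chunk_sizes: chunk_sizes(1600, 20) returns [80] * 20,
# since split_size = (1600 + 19) // 20 = 80 and the sizes already sum to 1600.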
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle.nn import Linear
from paddle.nn.initializer import XavierUniform as xavier_uniform_
from paddle.nn.initializer import Constant as constant_
from paddle.nn.initializer import XavierNormal as xavier_normal_
zeros_ = constant_(value=0.)
ones_ = constant_(value=1.)
class MultiheadAttention(nn.Layer):
"""Allows the model to jointly attend to information
from different representation subspaces.
See reference: Attention Is All You Need
.. math::
\text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
\text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
Args:
embed_dim: total dimension of the model
num_heads: parallel attention layers, or heads
"""
def __init__(self,
embed_dim,
num_heads,
dropout=0.,
bias=True,
add_bias_kv=False,
add_zero_attn=False):
super(MultiheadAttention, self).__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim**-0.5
self.out_proj = Linear(embed_dim, embed_dim, bias_attr=bias)
self._reset_parameters()
self.conv1 = paddle.nn.Conv2D(
in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
self.conv2 = paddle.nn.Conv2D(
in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
self.conv3 = paddle.nn.Conv2D(
in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
def _reset_parameters(self):
xavier_uniform_(self.out_proj.weight)
def forward(self,
query,
key,
value,
key_padding_mask=None,
incremental_state=None,
attn_mask=None):
"""
Inputs of forward function
query: [target length, batch size, embed dim]
key: [sequence length, batch size, embed dim]
value: [sequence length, batch size, embed dim]
key_padding_mask: if True, mask padding based on batch size
            incremental_state: if provided, previous time steps are cached
need_weights: output attn_output_weights
static_kv: key and value are static
Outputs of forward function
attn_output: [target length, batch size, embed dim]
attn_output_weights: [batch size, target length, sequence length]
"""
q_shape = paddle.shape(query)
src_shape = paddle.shape(key)
q = self._in_proj_q(query)
k = self._in_proj_k(key)
v = self._in_proj_v(value)
q *= self.scaling
q = paddle.transpose(
paddle.reshape(
q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]),
[1, 2, 0, 3])
k = paddle.transpose(
paddle.reshape(
k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
[1, 2, 0, 3])
v = paddle.transpose(
paddle.reshape(
v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
[1, 2, 0, 3])
if key_padding_mask is not None:
assert key_padding_mask.shape[0] == q_shape[1]
assert key_padding_mask.shape[1] == src_shape[0]
attn_output_weights = paddle.matmul(q,
paddle.transpose(k, [0, 1, 3, 2]))
if attn_mask is not None:
attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0)
attn_output_weights += attn_mask
if key_padding_mask is not None:
attn_output_weights = paddle.reshape(
attn_output_weights,
[q_shape[1], self.num_heads, q_shape[0], src_shape[0]])
key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2)
key = paddle.cast(key, 'float32')
y = paddle.full(
shape=paddle.shape(key), dtype='float32', fill_value='-inf')
y = paddle.where(key == 0., key, y)
attn_output_weights += y
attn_output_weights = F.softmax(
attn_output_weights.astype('float32'),
axis=-1,
dtype=paddle.float32 if attn_output_weights.dtype == paddle.float16
else attn_output_weights.dtype)
attn_output_weights = F.dropout(
attn_output_weights, p=self.dropout, training=self.training)
attn_output = paddle.matmul(attn_output_weights, v)
attn_output = paddle.reshape(
paddle.transpose(attn_output, [2, 0, 1, 3]),
[q_shape[0], q_shape[1], self.embed_dim])
attn_output = self.out_proj(attn_output)
return attn_output
def _in_proj_q(self, query):
query = paddle.transpose(query, [1, 2, 0])
query = paddle.unsqueeze(query, axis=2)
res = self.conv1(query)
res = paddle.squeeze(res, axis=2)
res = paddle.transpose(res, [2, 0, 1])
return res
def _in_proj_k(self, key):
key = paddle.transpose(key, [1, 2, 0])
key = paddle.unsqueeze(key, axis=2)
res = self.conv2(key)
res = paddle.squeeze(res, axis=2)
res = paddle.transpose(res, [2, 0, 1])
return res
def _in_proj_v(self, value):
value = paddle.transpose(value, [1, 2, 0]) #(1, 2, 0)
value = paddle.unsqueeze(value, axis=2)
res = self.conv3(value)
res = paddle.squeeze(res, axis=2)
res = paddle.transpose(res, [2, 0, 1])
return res
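# Usage sketch (hypothetical, not part of the original file). The Q/K/V
# projections are 1x1 convolutions applied to a [N, E, 1, L] view of the input,
# so query/key/value follow the [length, batch, embed_dim] layout:
#   attn = MultiheadAttention(embed_dim=512, num_heads=8)
#   q = paddle.rand([26, 4, 512])    # [target_len, batch, embed_dim]
#   kv = paddle.rand([256, 4, 512])  # [source_len, batch, embed_dim]
#   out = attn(q, kv, kv)            # -> [26, 4, 512]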
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/attention_recognition_head.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import paddle
from paddle import nn
from paddle.nn import functional as F
class AsterHead(nn.Layer):
def __init__(self,
in_channels,
out_channels,
sDim,
attDim,
max_len_labels,
time_step=25,
beam_width=5,
**kwargs):
super(AsterHead, self).__init__()
self.num_classes = out_channels
self.in_planes = in_channels
self.sDim = sDim
self.attDim = attDim
self.max_len_labels = max_len_labels
self.decoder = AttentionRecognitionHead(in_channels, out_channels, sDim,
attDim, max_len_labels)
self.time_step = time_step
self.embeder = Embedding(self.time_step, in_channels)
self.beam_width = beam_width
self.eos = self.num_classes - 3
def forward(self, x, targets=None, embed=None):
return_dict = {}
embedding_vectors = self.embeder(x)
if self.training:
rec_targets, rec_lengths, _ = targets
rec_pred = self.decoder([x, rec_targets, rec_lengths],
embedding_vectors)
return_dict['rec_pred'] = rec_pred
return_dict['embedding_vectors'] = embedding_vectors
else:
rec_pred, rec_pred_scores = self.decoder.beam_search(
x, self.beam_width, self.eos, embedding_vectors)
return_dict['rec_pred'] = rec_pred
return_dict['rec_pred_scores'] = rec_pred_scores
return_dict['embedding_vectors'] = embedding_vectors
return return_dict
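# Note (added): during training the decoder is driven by teacher forcing with
# the ground-truth targets (rec_targets, rec_lengths); at inference it runs
# beam search of width `beam_width`, handling the <EOS> index
# (num_classes - 3). The `embed` vector from Embedding seeds the decoder's
# initial GRU state via DecoderUnit.get_initial_state.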
class Embedding(nn.Layer):
def __init__(self, in_timestep, in_planes, mid_dim=4096, embed_dim=300):
super(Embedding, self).__init__()
self.in_timestep = in_timestep
self.in_planes = in_planes
self.embed_dim = embed_dim
self.mid_dim = mid_dim
self.eEmbed = nn.Linear(
in_timestep * in_planes,
            self.embed_dim)  # embed the encoder output into a word-embedding-like vector
def forward(self, x):
x = paddle.reshape(x, [paddle.shape(x)[0], -1])
x = self.eEmbed(x)
return x
class AttentionRecognitionHead(nn.Layer):
"""
input: [b x 16 x 64 x in_planes]
output: probability sequence: [b x T x num_classes]
"""
def __init__(self, in_channels, out_channels, sDim, attDim, max_len_labels):
super(AttentionRecognitionHead, self).__init__()
        self.num_classes = out_channels  # number of output classes, including <EOS>
self.in_planes = in_channels
self.sDim = sDim
self.attDim = attDim
self.max_len_labels = max_len_labels
self.decoder = DecoderUnit(
sDim=sDim, xDim=in_channels, yDim=self.num_classes, attDim=attDim)
def forward(self, x, embed):
x, targets, lengths = x
batch_size = paddle.shape(x)[0]
# Decoder
state = self.decoder.get_initial_state(embed)
outputs = []
for i in range(max(lengths)):
if i == 0:
y_prev = paddle.full(
shape=[batch_size], fill_value=self.num_classes)
else:
y_prev = targets[:, i - 1]
output, state = self.decoder(x, state, y_prev)
outputs.append(output)
outputs = paddle.concat([_.unsqueeze(1) for _ in outputs], 1)
return outputs
# inference stage.
def sample(self, x):
x, _, _ = x
        batch_size = x.shape[0]
# Decoder
state = paddle.zeros([1, batch_size, self.sDim])
predicted_ids, predicted_scores = [], []
for i in range(self.max_len_labels):
if i == 0:
y_prev = paddle.full(
shape=[batch_size], fill_value=self.num_classes)
else:
y_prev = predicted
output, state = self.decoder(x, state, y_prev)
output = F.softmax(output, axis=1)
            score = paddle.max(output, axis=1)
            predicted = paddle.argmax(output, axis=1)
predicted_ids.append(predicted.unsqueeze(1))
predicted_scores.append(score.unsqueeze(1))
        predicted_ids = paddle.concat(predicted_ids, 1)
        predicted_scores = paddle.concat(predicted_scores, 1)
# return predicted_ids.squeeze(), predicted_scores.squeeze()
return predicted_ids, predicted_scores
def beam_search(self, x, beam_width, eos, embed):
def _inflate(tensor, times, dim):
repeat_dims = [1] * tensor.dim()
repeat_dims[dim] = times
output = paddle.tile(tensor, repeat_dims)
return output
# https://github.com/IBM/pytorch-seq2seq/blob/fede87655ddce6c94b38886089e05321dc9802af/seq2seq/models/TopKDecoder.py
batch_size, l, d = x.shape
x = paddle.tile(
paddle.transpose(
x.unsqueeze(1), perm=[1, 0, 2, 3]), [beam_width, 1, 1, 1])
inflated_encoder_feats = paddle.reshape(
paddle.transpose(
x, perm=[1, 0, 2, 3]), [-1, l, d])
# Initialize the decoder
state = self.decoder.get_initial_state(embed, tile_times=beam_width)
pos_index = paddle.reshape(
paddle.arange(batch_size) * beam_width, shape=[-1, 1])
# Initialize the scores
sequence_scores = paddle.full(
shape=[batch_size * beam_width, 1], fill_value=-float('Inf'))
index = [i * beam_width for i in range(0, batch_size)]
sequence_scores[index] = 0.0
# Initialize the input vector
y_prev = paddle.full(
shape=[batch_size * beam_width], fill_value=self.num_classes)
# Store decisions for backtracking
stored_scores = list()
stored_predecessors = list()
stored_emitted_symbols = list()
for i in range(self.max_len_labels):
output, state = self.decoder(inflated_encoder_feats, state, y_prev)
state = paddle.unsqueeze(state, axis=0)
log_softmax_output = paddle.nn.functional.log_softmax(
output, axis=1)
sequence_scores = _inflate(sequence_scores, self.num_classes, 1)
sequence_scores += log_softmax_output
scores, candidates = paddle.topk(
paddle.reshape(sequence_scores, [batch_size, -1]),
beam_width,
axis=1)
# Reshape input = (bk, 1) and sequence_scores = (bk, 1)
y_prev = paddle.reshape(
candidates % self.num_classes, shape=[batch_size * beam_width])
sequence_scores = paddle.reshape(
scores, shape=[batch_size * beam_width, 1])
# Update fields for next timestep
pos_index = paddle.expand_as(pos_index, candidates)
predecessors = paddle.cast(
candidates / self.num_classes + pos_index, dtype='int64')
predecessors = paddle.reshape(
predecessors, shape=[batch_size * beam_width, 1])
state = paddle.index_select(
state, index=predecessors.squeeze(), axis=1)
            # Update sequence scores and erase scores for the <eos> symbol so that they aren't expanded
stored_scores.append(sequence_scores.clone())
y_prev = paddle.reshape(y_prev, shape=[-1, 1])
eos_prev = paddle.full_like(y_prev, fill_value=eos)
mask = eos_prev == y_prev
mask = paddle.nonzero(mask)
if mask.dim() > 0:
sequence_scores = sequence_scores.numpy()
mask = mask.numpy()
sequence_scores[mask] = -float('inf')
sequence_scores = paddle.to_tensor(sequence_scores)
# Cache results for backtracking
stored_predecessors.append(predecessors)
y_prev = paddle.squeeze(y_prev)
stored_emitted_symbols.append(y_prev)
# Do backtracking to return the optimal values
        #====== backtrack ======#
# Initialize return variables given different types
p = list()
l = [[self.max_len_labels] * beam_width for _ in range(batch_size)
] # Placeholder for lengths of top-k sequences
        # the last-step outputs of the beams are not sorted,
        # so they are sorted here
sorted_score, sorted_idx = paddle.topk(
paddle.reshape(
stored_scores[-1], shape=[batch_size, beam_width]),
beam_width)
# initialize the sequence scores with the sorted last step beam scores
s = sorted_score.clone()
batch_eos_found = [0] * batch_size # the number of EOS found
# in the backward loop below for each batch
t = self.max_len_labels - 1
# initialize the back pointer with the sorted order of the last step beams.
# add pos_index for indexing variable with b*k as the first dimension.
t_predecessors = paddle.reshape(
sorted_idx + pos_index.expand_as(sorted_idx),
shape=[batch_size * beam_width])
while t >= 0:
# Re-order the variables with the back pointer
current_symbol = paddle.index_select(
stored_emitted_symbols[t], index=t_predecessors, axis=0)
t_predecessors = paddle.index_select(
stored_predecessors[t].squeeze(), index=t_predecessors, axis=0)
eos_indices = stored_emitted_symbols[t] == eos
eos_indices = paddle.nonzero(eos_indices)
if eos_indices.dim() > 0:
for i in range(eos_indices.shape[0] - 1, -1, -1):
# Indices of the EOS symbol for both variables
# with b*k as the first dimension, and b, k for
# the first two dimensions
idx = eos_indices[i]
b_idx = int(idx[0] / beam_width)
# The indices of the replacing position
# according to the replacement strategy noted above
res_k_idx = beam_width - (batch_eos_found[b_idx] %
beam_width) - 1
batch_eos_found[b_idx] += 1
res_idx = b_idx * beam_width + res_k_idx
# Replace the old information in return variables
# with the new ended sequence information
t_predecessors[res_idx] = stored_predecessors[t][idx[0]]
current_symbol[res_idx] = stored_emitted_symbols[t][idx[0]]
s[b_idx, res_k_idx] = stored_scores[t][idx[0], 0]
l[b_idx][res_k_idx] = t + 1
# record the back tracked results
p.append(current_symbol)
t -= 1
# Sort and re-order again as the added ended sequences may change
# the order (very unlikely)
s, re_sorted_idx = s.topk(beam_width)
for b_idx in range(batch_size):
l[b_idx] = [
l[b_idx][k_idx.item()] for k_idx in re_sorted_idx[b_idx, :]
]
re_sorted_idx = paddle.reshape(
re_sorted_idx + pos_index.expand_as(re_sorted_idx),
[batch_size * beam_width])
# Reverse the sequences and re-order at the same time
# It is reversed because the backtracking happens in reverse time order
p = [
paddle.reshape(
paddle.index_select(step, re_sorted_idx, 0),
shape=[batch_size, beam_width, -1]) for step in reversed(p)
]
p = paddle.concat(p, -1)[:, 0, :]
return p, paddle.ones_like(p)
class AttentionUnit(nn.Layer):
def __init__(self, sDim, xDim, attDim):
super(AttentionUnit, self).__init__()
self.sDim = sDim
self.xDim = xDim
self.attDim = attDim
self.sEmbed = nn.Linear(sDim, attDim)
self.xEmbed = nn.Linear(xDim, attDim)
self.wEmbed = nn.Linear(attDim, 1)
def forward(self, x, sPrev):
batch_size, T, _ = x.shape # [b x T x xDim]
x = paddle.reshape(x, [-1, self.xDim]) # [(b x T) x xDim]
xProj = self.xEmbed(x) # [(b x T) x attDim]
xProj = paddle.reshape(xProj, [batch_size, T, -1]) # [b x T x attDim]
sPrev = sPrev.squeeze(0)
sProj = self.sEmbed(sPrev) # [b x attDim]
sProj = paddle.unsqueeze(sProj, 1) # [b x 1 x attDim]
sProj = paddle.expand(sProj,
[batch_size, T, self.attDim]) # [b x T x attDim]
sumTanh = paddle.tanh(sProj + xProj)
sumTanh = paddle.reshape(sumTanh, [-1, self.attDim])
vProj = self.wEmbed(sumTanh) # [(b x T) x 1]
vProj = paddle.reshape(vProj, [batch_size, T])
alpha = F.softmax(
vProj, axis=1) # attention weights for each sample in the minibatch
return alpha
class DecoderUnit(nn.Layer):
def __init__(self, sDim, xDim, yDim, attDim):
super(DecoderUnit, self).__init__()
self.sDim = sDim
self.xDim = xDim
self.yDim = yDim
self.attDim = attDim
self.emdDim = attDim
self.attention_unit = AttentionUnit(sDim, xDim, attDim)
self.tgt_embedding = nn.Embedding(
yDim + 1, self.emdDim, weight_attr=nn.initializer.Normal(
std=0.01)) # the last is used for <BOS>
self.gru = nn.GRUCell(input_size=xDim + self.emdDim, hidden_size=sDim)
self.fc = nn.Linear(
sDim,
yDim,
weight_attr=nn.initializer.Normal(std=0.01),
bias_attr=nn.initializer.Constant(value=0))
self.embed_fc = nn.Linear(300, self.sDim)
def get_initial_state(self, embed, tile_times=1):
assert embed.shape[1] == 300
state = self.embed_fc(embed) # N * sDim
if tile_times != 1:
state = state.unsqueeze(1)
trans_state = paddle.transpose(state, perm=[1, 0, 2])
state = paddle.tile(trans_state, repeat_times=[tile_times, 1, 1])
trans_state = paddle.transpose(state, perm=[1, 0, 2])
state = paddle.reshape(trans_state, shape=[-1, self.sDim])
state = state.unsqueeze(0) # 1 * N * sDim
return state
def forward(self, x, sPrev, yPrev):
        # x: feature sequence from the image encoder.
batch_size, T, _ = x.shape
alpha = self.attention_unit(x, sPrev)
context = paddle.squeeze(paddle.matmul(alpha.unsqueeze(1), x), axis=1)
yPrev = paddle.cast(yPrev, dtype="int64")
yProj = self.tgt_embedding(yPrev)
concat_context = paddle.concat([yProj, context], 1)
concat_context = paddle.squeeze(concat_context, 1)
sPrev = paddle.squeeze(sPrev, 0)
output, state = self.gru(concat_context, sPrev)
output = paddle.squeeze(output, axis=1)
output = self.fc(output)
return output, state