"tests/git@developer.sourcefind.cn:xdb4_94051/vllm.git" did not exist on "5f09cbdb63a5821d365b72318bcd92b8ffd03fb4"
Commit c4303163 authored by LDOUBLEV's avatar LDOUBLEV
Browse files

Merge branch 'dygraph' of https://github.com/PaddlePaddle/PaddleOCR into trt_cpp

parents de37eedf f02dcc06
@@ -58,15 +58,15 @@ class MobileNetV3(nn.Layer):
                [5, 72, 40, True, 'relu', 2],
                [5, 120, 40, True, 'relu', 1],
                [5, 120, 40, True, 'relu', 1],
-               [3, 240, 80, False, 'hard_swish', 2],
-               [3, 200, 80, False, 'hard_swish', 1],
-               [3, 184, 80, False, 'hard_swish', 1],
-               [3, 184, 80, False, 'hard_swish', 1],
-               [3, 480, 112, True, 'hard_swish', 1],
-               [3, 672, 112, True, 'hard_swish', 1],
-               [5, 672, 160, True, 'hard_swish', 2],
-               [5, 960, 160, True, 'hard_swish', 1],
-               [5, 960, 160, True, 'hard_swish', 1],
+               [3, 240, 80, False, 'hardswish', 2],
+               [3, 200, 80, False, 'hardswish', 1],
+               [3, 184, 80, False, 'hardswish', 1],
+               [3, 184, 80, False, 'hardswish', 1],
+               [3, 480, 112, True, 'hardswish', 1],
+               [3, 672, 112, True, 'hardswish', 1],
+               [5, 672, 160, True, 'hardswish', 2],
+               [5, 960, 160, True, 'hardswish', 1],
+               [5, 960, 160, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 960
        elif model_name == "small":
@@ -75,14 +75,14 @@ class MobileNetV3(nn.Layer):
                [3, 16, 16, True, 'relu', 2],
                [3, 72, 24, False, 'relu', 2],
                [3, 88, 24, False, 'relu', 1],
-               [5, 96, 40, True, 'hard_swish', 2],
-               [5, 240, 40, True, 'hard_swish', 1],
-               [5, 240, 40, True, 'hard_swish', 1],
-               [5, 120, 48, True, 'hard_swish', 1],
-               [5, 144, 48, True, 'hard_swish', 1],
-               [5, 288, 96, True, 'hard_swish', 2],
-               [5, 576, 96, True, 'hard_swish', 1],
-               [5, 576, 96, True, 'hard_swish', 1],
+               [5, 96, 40, True, 'hardswish', 2],
+               [5, 240, 40, True, 'hardswish', 1],
+               [5, 240, 40, True, 'hardswish', 1],
+               [5, 120, 48, True, 'hardswish', 1],
+               [5, 144, 48, True, 'hardswish', 1],
+               [5, 288, 96, True, 'hardswish', 2],
+               [5, 576, 96, True, 'hardswish', 1],
+               [5, 576, 96, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 576
        else:
@@ -102,7 +102,7 @@ class MobileNetV3(nn.Layer):
            padding=1,
            groups=1,
            if_act=True,
-           act='hard_swish',
+           act='hardswish',
            name='conv1')
        self.stages = []
@@ -138,7 +138,7 @@ class MobileNetV3(nn.Layer):
                padding=0,
                groups=1,
                if_act=True,
-               act='hard_swish',
+               act='hardswish',
                name='conv_last'))
        self.stages.append(nn.Sequential(*block_list))
        self.out_channels.append(make_divisible(scale * cls_ch_squeeze))
@@ -192,10 +192,11 @@ class ConvBNLayer(nn.Layer):
        if self.if_act:
            if self.act == "relu":
                x = F.relu(x)
-           elif self.act == "hard_swish":
-               x = F.activation.hard_swish(x)
+           elif self.act == "hardswish":
+               x = F.hardswish(x)
            else:
-               print("The activation function is selected incorrectly.")
+               print("The activation function({}) is selected incorrectly.".
+                     format(self.act))
                exit()
        return x
@@ -282,5 +283,5 @@ class SEModule(nn.Layer):
        outputs = self.conv1(outputs)
        outputs = F.relu(outputs)
        outputs = self.conv2(outputs)
-       outputs = F.activation.hard_sigmoid(outputs)
+       outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5)
        return inputs * outputs
@@ -51,15 +51,15 @@ class MobileNetV3(nn.Layer):
                [5, 72, 40, True, 'relu', (large_stride[2], 1)],
                [5, 120, 40, True, 'relu', 1],
                [5, 120, 40, True, 'relu', 1],
-               [3, 240, 80, False, 'hard_swish', 1],
-               [3, 200, 80, False, 'hard_swish', 1],
-               [3, 184, 80, False, 'hard_swish', 1],
-               [3, 184, 80, False, 'hard_swish', 1],
-               [3, 480, 112, True, 'hard_swish', 1],
-               [3, 672, 112, True, 'hard_swish', 1],
-               [5, 672, 160, True, 'hard_swish', (large_stride[3], 1)],
-               [5, 960, 160, True, 'hard_swish', 1],
-               [5, 960, 160, True, 'hard_swish', 1],
+               [3, 240, 80, False, 'hardswish', 1],
+               [3, 200, 80, False, 'hardswish', 1],
+               [3, 184, 80, False, 'hardswish', 1],
+               [3, 184, 80, False, 'hardswish', 1],
+               [3, 480, 112, True, 'hardswish', 1],
+               [3, 672, 112, True, 'hardswish', 1],
+               [5, 672, 160, True, 'hardswish', (large_stride[3], 1)],
+               [5, 960, 160, True, 'hardswish', 1],
+               [5, 960, 160, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 960
        elif model_name == "small":
@@ -68,14 +68,14 @@ class MobileNetV3(nn.Layer):
                [3, 16, 16, True, 'relu', (small_stride[0], 1)],
                [3, 72, 24, False, 'relu', (small_stride[1], 1)],
                [3, 88, 24, False, 'relu', 1],
-               [5, 96, 40, True, 'hard_swish', (small_stride[2], 1)],
-               [5, 240, 40, True, 'hard_swish', 1],
-               [5, 240, 40, True, 'hard_swish', 1],
-               [5, 120, 48, True, 'hard_swish', 1],
-               [5, 144, 48, True, 'hard_swish', 1],
-               [5, 288, 96, True, 'hard_swish', (small_stride[3], 1)],
-               [5, 576, 96, True, 'hard_swish', 1],
-               [5, 576, 96, True, 'hard_swish', 1],
+               [5, 96, 40, True, 'hardswish', (small_stride[2], 1)],
+               [5, 240, 40, True, 'hardswish', 1],
+               [5, 240, 40, True, 'hardswish', 1],
+               [5, 120, 48, True, 'hardswish', 1],
+               [5, 144, 48, True, 'hardswish', 1],
+               [5, 288, 96, True, 'hardswish', (small_stride[3], 1)],
+               [5, 576, 96, True, 'hardswish', 1],
+               [5, 576, 96, True, 'hardswish', 1],
            ]
            cls_ch_squeeze = 576
        else:
@@ -96,7 +96,7 @@ class MobileNetV3(nn.Layer):
            padding=1,
            groups=1,
            if_act=True,
-           act='hard_swish',
+           act='hardswish',
            name='conv1')
        i = 0
        block_list = []
@@ -124,7 +124,7 @@ class MobileNetV3(nn.Layer):
            padding=0,
            groups=1,
            if_act=True,
-           act='hard_swish',
+           act='hardswish',
            name='conv_last')
        self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
...
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import nn, ParamAttr
from paddle.nn import functional as F
import paddle.fluid as fluid
import paddle
import numpy as np
__all__ = ["ResNetFPN"]
class ResNetFPN(nn.Layer):
def __init__(self, in_channels=1, layers=50, **kwargs):
super(ResNetFPN, self).__init__()
supported_layers = {
18: {
'depth': [2, 2, 2, 2],
'block_class': BasicBlock
},
34: {
'depth': [3, 4, 6, 3],
'block_class': BasicBlock
},
50: {
'depth': [3, 4, 6, 3],
'block_class': BottleneckBlock
},
101: {
'depth': [3, 4, 23, 3],
'block_class': BottleneckBlock
},
152: {
'depth': [3, 8, 36, 3],
'block_class': BottleneckBlock
}
}
stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)]
num_filters = [64, 128, 256, 512]
self.depth = supported_layers[layers]['depth']
self.F = []
self.conv = ConvBNLayer(
in_channels=in_channels,
out_channels=64,
kernel_size=7,
stride=2,
act="relu",
name="conv1")
self.block_list = []
in_ch = 64
if layers >= 50:
for block in range(len(self.depth)):
for i in range(self.depth[block]):
if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
block_list = self.add_sublayer(
"bottleneckBlock_{}_{}".format(block, i),
BottleneckBlock(
in_channels=in_ch,
out_channels=num_filters[block],
stride=stride_list[block] if i == 0 else 1,
name=conv_name))
in_ch = num_filters[block] * 4
self.block_list.append(block_list)
self.F.append(block_list)
else:
for block in range(len(self.depth)):
for i in range(self.depth[block]):
conv_name = "res" + str(block + 2) + chr(97 + i)
if i == 0 and block != 0:
stride = (2, 1)
else:
stride = (1, 1)
basic_block = self.add_sublayer(
conv_name,
BasicBlock(
in_channels=in_ch,
out_channels=num_filters[block],
stride=stride_list[block] if i == 0 else 1,
is_first=block == i == 0,
name=conv_name))
in_ch = basic_block.out_channels
self.block_list.append(basic_block)
out_ch_list = [in_ch // 4, in_ch // 2, in_ch]
self.base_block = []
self.conv_trans = []
self.bn_block = []
for i in [-2, -3]:
in_channels = out_ch_list[i + 1] + out_ch_list[i]
self.base_block.append(
self.add_sublayer(
"F_{}_base_block_0".format(i),
nn.Conv2D(
in_channels=in_channels,
out_channels=out_ch_list[i],
kernel_size=1,
weight_attr=ParamAttr(trainable=True),
bias_attr=ParamAttr(trainable=True))))
self.base_block.append(
self.add_sublayer(
"F_{}_base_block_1".format(i),
nn.Conv2D(
in_channels=out_ch_list[i],
out_channels=out_ch_list[i],
kernel_size=3,
padding=1,
weight_attr=ParamAttr(trainable=True),
bias_attr=ParamAttr(trainable=True))))
self.base_block.append(
self.add_sublayer(
"F_{}_base_block_2".format(i),
nn.BatchNorm(
num_channels=out_ch_list[i],
act="relu",
param_attr=ParamAttr(trainable=True),
bias_attr=ParamAttr(trainable=True))))
self.base_block.append(
self.add_sublayer(
"F_{}_base_block_3".format(i),
nn.Conv2D(
in_channels=out_ch_list[i],
out_channels=512,
kernel_size=1,
bias_attr=ParamAttr(trainable=True),
weight_attr=ParamAttr(trainable=True))))
self.out_channels = 512
def __call__(self, x):
x = self.conv(x)
fpn_list = []
F = []
for i in range(len(self.depth)):
fpn_list.append(np.sum(self.depth[:i + 1]))
for i, block in enumerate(self.block_list):
x = block(x)
for number in fpn_list:
if i + 1 == number:
F.append(x)
base = F[-1]
j = 0
for i, block in enumerate(self.base_block):
if i % 3 == 0 and i < 6:
j = j + 1
b, c, w, h = F[-j - 1].shape
if [w, h] == list(base.shape[2:]):
base = base
else:
base = self.conv_trans[j - 1](base)
base = self.bn_block[j - 1](base)
base = paddle.concat([base, F[-j - 1]], axis=1)
base = block(base)
return base
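# Top-down FPN merge (descriptive note, not in the original file): every time
# i % 3 == 0 in the loop above, the running `base` is concatenated with the
# next-shallower stage output F[-j - 1] along the channel axis, and the
# following base_block convolutions squeeze the concatenation back down; the
# final 1x1 conv leaves self.out_channels = 512.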
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
groups=1,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self.conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=2 if stride == (1, 1) else kernel_size,
dilation=2 if stride == (1, 1) else 1,
stride=stride,
padding=(kernel_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(name=name + '.conv2d.output.1.w_0'),
bias_attr=False, )
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
self.bn = nn.BatchNorm(
num_channels=out_channels,
act=act,
param_attr=ParamAttr(name=name + '.output.1.w_0'),
bias_attr=ParamAttr(name=name + '.output.1.b_0'),
moving_mean_name=bn_name + "_mean",
moving_variance_name=bn_name + "_variance")
def __call__(self, x):
x = self.conv(x)
x = self.bn(x)
return x
class ShortCut(nn.Layer):
def __init__(self, in_channels, out_channels, stride, name, is_first=False):
super(ShortCut, self).__init__()
self.use_conv = True
if in_channels != out_channels or stride != 1 or is_first == True:
if stride == (1, 1):
self.conv = ConvBNLayer(
in_channels, out_channels, 1, 1, name=name)
else: # stride==(2,2)
self.conv = ConvBNLayer(
in_channels, out_channels, 1, stride, name=name)
else:
self.use_conv = False
def forward(self, x):
if self.use_conv:
x = self.conv(x)
return x
class BottleneckBlock(nn.Layer):
def __init__(self, in_channels, out_channels, stride, name):
super(BottleneckBlock, self).__init__()
self.conv0 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
act='relu',
name=name + "_branch2a")
self.conv1 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
self.conv2 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels * 4,
kernel_size=1,
act=None,
name=name + "_branch2c")
self.short = ShortCut(
in_channels=in_channels,
out_channels=out_channels * 4,
stride=stride,
is_first=False,
name=name + "_branch1")
self.out_channels = out_channels * 4
def forward(self, x):
y = self.conv0(x)
y = self.conv1(y)
y = self.conv2(y)
y = y + self.short(x)
y = F.relu(y)
return y
class BasicBlock(nn.Layer):
def __init__(self, in_channels, out_channels, stride, name, is_first):
super(BasicBlock, self).__init__()
self.conv0 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
act='relu',
stride=stride,
name=name + "_branch2a")
self.conv1 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
act=None,
name=name + "_branch2b")
self.short = ShortCut(
in_channels=in_channels,
out_channels=out_channels,
stride=stride,
is_first=is_first,
name=name + "_branch1")
self.out_channels = out_channels
def forward(self, x):
y = self.conv0(x)
y = self.conv1(y)
y = y + self.short(x)
return F.relu(y)
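# A minimal usage sketch with assumed input sizes (illustrative only, not
# part of the original file):
#
#   backbone = ResNetFPN(in_channels=1, layers=50)
#   x = paddle.randn([2, 1, 64, 256])   # [batch, channels, height, width]
#   feat = backbone(x)                  # fused FPN feature map, 512 channels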
@@ -23,10 +23,15 @@ def build_head(config):
    # rec head
    from .rec_ctc_head import CTCHead
+   from .rec_att_head import AttentionHead
+   from .rec_srn_head import SRNHead

    # cls head
    from .cls_head import ClsHead
-   support_dict = ['DBHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead']
+   support_dict = [
+       'DBHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead', 'AttentionHead',
+       'SRNHead'
+   ]

    module_name = config.pop('name')
    assert module_name in support_dict, Exception('head only support {}'.format(
...
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np
class AttentionHead(nn.Layer):
def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
super(AttentionHead, self).__init__()
self.input_size = in_channels
self.hidden_size = hidden_size
self.num_classes = out_channels
self.attention_cell = AttentionGRUCell(
in_channels, hidden_size, out_channels, use_gru=False)
self.generator = nn.Linear(hidden_size, out_channels)
def _char_to_onehot(self, input_char, onehot_dim):
        input_one_hot = F.one_hot(input_char, onehot_dim)
        return input_one_hot
def forward(self, inputs, targets=None, batch_max_length=25):
batch_size = inputs.shape[0]
num_steps = batch_max_length
hidden = paddle.zeros((batch_size, self.hidden_size))
output_hiddens = []
if targets is not None:
for i in range(num_steps):
char_onehots = self._char_to_onehot(
targets[:, i], onehot_dim=self.num_classes)
(outputs, hidden), alpha = self.attention_cell(hidden, inputs,
char_onehots)
output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
output = paddle.concat(output_hiddens, axis=1)
probs = self.generator(output)
else:
targets = paddle.zeros(shape=[batch_size], dtype="int32")
probs = None
for i in range(num_steps):
char_onehots = self._char_to_onehot(
targets, onehot_dim=self.num_classes)
(outputs, hidden), alpha = self.attention_cell(hidden, inputs,
char_onehots)
probs_step = self.generator(outputs)
if probs is None:
probs = paddle.unsqueeze(probs_step, axis=1)
else:
probs = paddle.concat(
[probs, paddle.unsqueeze(
probs_step, axis=1)], axis=1)
next_input = probs_step.argmax(axis=1)
targets = next_input
return probs
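# Rough usage sketch with assumed sizes (illustrative only, not part of the
# original file):
#
#   head = AttentionHead(in_channels=288, out_channels=38, hidden_size=48)
#   feats = paddle.randn([4, 25, 288])           # [batch, steps, channels]
#   labels = paddle.zeros([4, 25], dtype='int64')
#   train_probs = head(feats, targets=labels)    # teacher-forced logits
#   infer_probs = head(feats)                    # greedy decoding, [4, 25, 38]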
class AttentionGRUCell(nn.Layer):
def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
super(AttentionGRUCell, self).__init__()
self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False)
self.h2h = nn.Linear(hidden_size, hidden_size)
self.score = nn.Linear(hidden_size, 1, bias_attr=False)
self.rnn = nn.GRUCell(
input_size=input_size + num_embeddings, hidden_size=hidden_size)
self.hidden_size = hidden_size
def forward(self, prev_hidden, batch_H, char_onehots):
batch_H_proj = self.i2h(batch_H)
prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden), axis=1)
res = paddle.add(batch_H_proj, prev_hidden_proj)
res = paddle.tanh(res)
e = self.score(res)
alpha = F.softmax(e, axis=1)
alpha = paddle.transpose(alpha, [0, 2, 1])
context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1)
concat_context = paddle.concat([context, char_onehots], 1)
cur_hidden = self.rnn(concat_context, prev_hidden)
return cur_hidden, alpha
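# The cell above is additive (Bahdanau-style) attention: scores
# e = score(tanh(i2h(batch_H) + h2h(prev_hidden))) are normalized with
# softmax over the time axis to give alpha, the context vector is
# alpha^T x batch_H, and the context concatenated with the character
# one-hot drives the recurrent cell.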
class AttentionLSTM(nn.Layer):
def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
super(AttentionLSTM, self).__init__()
self.input_size = in_channels
self.hidden_size = hidden_size
self.num_classes = out_channels
self.attention_cell = AttentionLSTMCell(
in_channels, hidden_size, out_channels, use_gru=False)
self.generator = nn.Linear(hidden_size, out_channels)
def _char_to_onehot(self, input_char, onehot_dim):
        input_one_hot = F.one_hot(input_char, onehot_dim)
        return input_one_hot
def forward(self, inputs, targets=None, batch_max_length=25):
batch_size = inputs.shape[0]
num_steps = batch_max_length
hidden = (paddle.zeros((batch_size, self.hidden_size)), paddle.zeros(
(batch_size, self.hidden_size)))
output_hiddens = []
if targets is not None:
for i in range(num_steps):
                # one-hot vectors for the i-th char
char_onehots = self._char_to_onehot(
targets[:, i], onehot_dim=self.num_classes)
hidden, alpha = self.attention_cell(hidden, inputs,
char_onehots)
hidden = (hidden[1][0], hidden[1][1])
output_hiddens.append(paddle.unsqueeze(hidden[0], axis=1))
output = paddle.concat(output_hiddens, axis=1)
probs = self.generator(output)
else:
targets = paddle.zeros(shape=[batch_size], dtype="int32")
probs = None
for i in range(num_steps):
char_onehots = self._char_to_onehot(
targets, onehot_dim=self.num_classes)
hidden, alpha = self.attention_cell(hidden, inputs,
char_onehots)
probs_step = self.generator(hidden[0])
hidden = (hidden[1][0], hidden[1][1])
if probs is None:
probs = paddle.unsqueeze(probs_step, axis=1)
else:
probs = paddle.concat(
[probs, paddle.unsqueeze(
probs_step, axis=1)], axis=1)
next_input = probs_step.argmax(axis=1)
targets = next_input
return probs
class AttentionLSTMCell(nn.Layer):
def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
super(AttentionLSTMCell, self).__init__()
self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False)
self.h2h = nn.Linear(hidden_size, hidden_size)
self.score = nn.Linear(hidden_size, 1, bias_attr=False)
if not use_gru:
self.rnn = nn.LSTMCell(
input_size=input_size + num_embeddings, hidden_size=hidden_size)
else:
self.rnn = nn.GRUCell(
input_size=input_size + num_embeddings, hidden_size=hidden_size)
self.hidden_size = hidden_size
def forward(self, prev_hidden, batch_H, char_onehots):
batch_H_proj = self.i2h(batch_H)
prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden[0]), axis=1)
res = paddle.add(batch_H_proj, prev_hidden_proj)
res = paddle.tanh(res)
e = self.score(res)
alpha = F.softmax(e, axis=1)
alpha = paddle.transpose(alpha, [0, 2, 1])
context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1)
concat_context = paddle.concat([context, char_onehots], 1)
cur_hidden = self.rnn(concat_context, prev_hidden)
return cur_hidden, alpha
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn, ParamAttr
from paddle.nn import functional as F
import paddle.fluid as fluid
import numpy as np
from .self_attention import WrapEncoderForFeature
from .self_attention import WrapEncoder
from paddle.static import Program
from ppocr.modeling.backbones.rec_resnet_fpn import ResNetFPN
import paddle.fluid.framework as framework
from collections import OrderedDict
gradient_clip = 10
class PVAM(nn.Layer):
def __init__(self, in_channels, char_num, max_text_length, num_heads,
num_encoder_tus, hidden_dims):
super(PVAM, self).__init__()
self.char_num = char_num
self.max_length = max_text_length
self.num_heads = num_heads
self.num_encoder_TUs = num_encoder_tus
self.hidden_dims = hidden_dims
# Transformer encoder
t = 256
c = 512
self.wrap_encoder_for_feature = WrapEncoderForFeature(
src_vocab_size=1,
max_length=t,
n_layer=self.num_encoder_TUs,
n_head=self.num_heads,
d_key=int(self.hidden_dims / self.num_heads),
d_value=int(self.hidden_dims / self.num_heads),
d_model=self.hidden_dims,
d_inner_hid=self.hidden_dims,
prepostprocess_dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
preprocess_cmd="n",
postprocess_cmd="da",
weight_sharing=True)
# PVAM
self.flatten0 = paddle.nn.Flatten(start_axis=0, stop_axis=1)
self.fc0 = paddle.nn.Linear(
in_features=in_channels,
out_features=in_channels, )
self.emb = paddle.nn.Embedding(
num_embeddings=self.max_length, embedding_dim=in_channels)
self.flatten1 = paddle.nn.Flatten(start_axis=0, stop_axis=2)
self.fc1 = paddle.nn.Linear(
in_features=in_channels, out_features=1, bias_attr=False)
def forward(self, inputs, encoder_word_pos, gsrm_word_pos):
b, c, h, w = inputs.shape
conv_features = paddle.reshape(inputs, shape=[-1, c, h * w])
conv_features = paddle.transpose(conv_features, perm=[0, 2, 1])
# transformer encoder
b, t, c = conv_features.shape
enc_inputs = [conv_features, encoder_word_pos, None]
word_features = self.wrap_encoder_for_feature(enc_inputs)
# pvam
b, t, c = word_features.shape
word_features = self.fc0(word_features)
word_features_ = paddle.reshape(word_features, [-1, 1, t, c])
word_features_ = paddle.tile(word_features_, [1, self.max_length, 1, 1])
word_pos_feature = self.emb(gsrm_word_pos)
word_pos_feature_ = paddle.reshape(word_pos_feature,
[-1, self.max_length, 1, c])
word_pos_feature_ = paddle.tile(word_pos_feature_, [1, 1, t, 1])
y = word_pos_feature_ + word_features_
y = F.tanh(y)
attention_weight = self.fc1(y)
attention_weight = paddle.reshape(
attention_weight, shape=[-1, self.max_length, t])
attention_weight = F.softmax(attention_weight, axis=-1)
pvam_features = paddle.matmul(attention_weight,
word_features) #[b, max_length, c]
return pvam_features
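# Shape walk-through of PVAM.forward above (values assumed from the t=256,
# c=512 defaults in __init__; illustrative only): word_features_ is tiled to
# [b, max_length, t, c] and added to the tiled position embedding, fc1 plus
# softmax produce one attention row per target position, and the final
# matmul yields pvam_features of shape [b, max_length, c] -- one aligned
# visual feature per predicted character.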
class GSRM(nn.Layer):
def __init__(self, in_channels, char_num, max_text_length, num_heads,
num_encoder_tus, num_decoder_tus, hidden_dims):
super(GSRM, self).__init__()
self.char_num = char_num
self.max_length = max_text_length
self.num_heads = num_heads
self.num_encoder_TUs = num_encoder_tus
self.num_decoder_TUs = num_decoder_tus
self.hidden_dims = hidden_dims
self.fc0 = paddle.nn.Linear(
in_features=in_channels, out_features=self.char_num)
self.wrap_encoder0 = WrapEncoder(
src_vocab_size=self.char_num + 1,
max_length=self.max_length,
n_layer=self.num_decoder_TUs,
n_head=self.num_heads,
d_key=int(self.hidden_dims / self.num_heads),
d_value=int(self.hidden_dims / self.num_heads),
d_model=self.hidden_dims,
d_inner_hid=self.hidden_dims,
prepostprocess_dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
preprocess_cmd="n",
postprocess_cmd="da",
weight_sharing=True)
self.wrap_encoder1 = WrapEncoder(
src_vocab_size=self.char_num + 1,
max_length=self.max_length,
n_layer=self.num_decoder_TUs,
n_head=self.num_heads,
d_key=int(self.hidden_dims / self.num_heads),
d_value=int(self.hidden_dims / self.num_heads),
d_model=self.hidden_dims,
d_inner_hid=self.hidden_dims,
prepostprocess_dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
preprocess_cmd="n",
postprocess_cmd="da",
weight_sharing=True)
self.mul = lambda x: paddle.matmul(x=x,
y=self.wrap_encoder0.prepare_decoder.emb0.weight,
transpose_y=True)
def forward(self, inputs, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2):
# ===== GSRM Visual-to-semantic embedding block =====
b, t, c = inputs.shape
pvam_features = paddle.reshape(inputs, [-1, c])
word_out = self.fc0(pvam_features)
word_ids = paddle.argmax(F.softmax(word_out), axis=1)
word_ids = paddle.reshape(x=word_ids, shape=[-1, t, 1])
#===== GSRM Semantic reasoning block =====
"""
        This module is implemented with bi-directional transformers:
        ngram_feature1 is the forward one, ngram_feature2 is the backward one
"""
pad_idx = self.char_num
word1 = paddle.cast(word_ids, "float32")
word1 = F.pad(word1, [1, 0], value=1.0 * pad_idx, data_format="NLC")
word1 = paddle.cast(word1, "int64")
word1 = word1[:, :-1, :]
word2 = word_ids
enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1]
enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2]
gsrm_feature1 = self.wrap_encoder0(enc_inputs_1)
gsrm_feature2 = self.wrap_encoder1(enc_inputs_2)
gsrm_feature2 = F.pad(gsrm_feature2, [0, 1],
value=0.,
data_format="NLC")
gsrm_feature2 = gsrm_feature2[:, 1:, ]
gsrm_features = gsrm_feature1 + gsrm_feature2
gsrm_out = self.mul(gsrm_features)
b, t, c = gsrm_out.shape
gsrm_out = paddle.reshape(gsrm_out, [-1, c])
return gsrm_features, word_out, gsrm_out
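# Note on the two shifts above: word1 pads the argmax stream on the left
# with pad_idx and drops the last step, so wrap_encoder0 sees only left
# (forward) context, while gsrm_feature2 is padded on the right and shifted
# left so wrap_encoder1 contributes right (backward) context; their sum is
# the bidirectional semantic feature consumed by VSFD.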
class VSFD(nn.Layer):
def __init__(self, in_channels=512, pvam_ch=512, char_num=38):
super(VSFD, self).__init__()
self.char_num = char_num
self.fc0 = paddle.nn.Linear(
in_features=in_channels * 2, out_features=pvam_ch)
self.fc1 = paddle.nn.Linear(
in_features=pvam_ch, out_features=self.char_num)
def forward(self, pvam_feature, gsrm_feature):
b, t, c1 = pvam_feature.shape
b, t, c2 = gsrm_feature.shape
combine_feature_ = paddle.concat([pvam_feature, gsrm_feature], axis=2)
img_comb_feature_ = paddle.reshape(
combine_feature_, shape=[-1, c1 + c2])
img_comb_feature_map = self.fc0(img_comb_feature_)
img_comb_feature_map = F.sigmoid(img_comb_feature_map)
img_comb_feature_map = paddle.reshape(
img_comb_feature_map, shape=[-1, t, c1])
combine_feature = img_comb_feature_map * pvam_feature + (
1.0 - img_comb_feature_map) * gsrm_feature
img_comb_feature = paddle.reshape(combine_feature, shape=[-1, c1])
out = self.fc1(img_comb_feature)
return out
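# VSFD is a learned gate: with g = sigmoid(fc0([pvam; gsrm])), the fused
# feature is g * pvam_feature + (1 - g) * gsrm_feature, and fc1 maps the
# fusion to per-character logits over char_num classes.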
class SRNHead(nn.Layer):
def __init__(self, in_channels, out_channels, max_text_length, num_heads,
num_encoder_TUs, num_decoder_TUs, hidden_dims, **kwargs):
super(SRNHead, self).__init__()
self.char_num = out_channels
self.max_length = max_text_length
self.num_heads = num_heads
self.num_encoder_TUs = num_encoder_TUs
self.num_decoder_TUs = num_decoder_TUs
self.hidden_dims = hidden_dims
self.pvam = PVAM(
in_channels=in_channels,
char_num=self.char_num,
max_text_length=self.max_length,
num_heads=self.num_heads,
num_encoder_tus=self.num_encoder_TUs,
hidden_dims=self.hidden_dims)
self.gsrm = GSRM(
in_channels=in_channels,
char_num=self.char_num,
max_text_length=self.max_length,
num_heads=self.num_heads,
num_encoder_tus=self.num_encoder_TUs,
num_decoder_tus=self.num_decoder_TUs,
hidden_dims=self.hidden_dims)
self.vsfd = VSFD(in_channels=in_channels, char_num=self.char_num)
self.gsrm.wrap_encoder1.prepare_decoder.emb0 = self.gsrm.wrap_encoder0.prepare_decoder.emb0
def forward(self, inputs, others):
encoder_word_pos = others[0]
gsrm_word_pos = others[1]
gsrm_slf_attn_bias1 = others[2]
gsrm_slf_attn_bias2 = others[3]
pvam_feature = self.pvam(inputs, encoder_word_pos, gsrm_word_pos)
gsrm_feature, word_out, gsrm_out = self.gsrm(
pvam_feature, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2)
final_out = self.vsfd(pvam_feature, gsrm_feature)
if not self.training:
final_out = F.softmax(final_out, axis=1)
_, decoded_out = paddle.topk(final_out, k=1)
predicts = OrderedDict([
('predict', final_out),
('pvam_feature', pvam_feature),
('decoded_out', decoded_out),
('word_out', word_out),
('gsrm_out', gsrm_out),
])
return predicts
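# A hedged end-to-end sketch (parameter values assumed from typical SRN
# configs, not part of the original file):
#
#   head = SRNHead(in_channels=512, out_channels=38, max_text_length=25,
#                  num_heads=8, num_encoder_TUs=2, num_decoder_TUs=4,
#                  hidden_dims=512)
#   feats = paddle.randn([1, 512, 8, 32])  # backbone map, t = 8 * 32 = 256
#   preds = head(feats, others)            # others = the four word-position
#                                          # and attention-bias tensors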
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn, ParamAttr
from paddle.nn import functional as F
import paddle.fluid as fluid
import numpy as np
gradient_clip = 10
class WrapEncoderForFeature(nn.Layer):
def __init__(self,
src_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
bos_idx=0):
super(WrapEncoderForFeature, self).__init__()
self.prepare_encoder = PrepareEncoder(
src_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
bos_idx=bos_idx,
word_emb_param_name="src_word_emb_table")
self.encoder = Encoder(n_layer, n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd,
postprocess_cmd)
def forward(self, enc_inputs):
conv_features, src_pos, src_slf_attn_bias = enc_inputs
enc_input = self.prepare_encoder(conv_features, src_pos)
enc_output = self.encoder(enc_input, src_slf_attn_bias)
return enc_output
class WrapEncoder(nn.Layer):
"""
embedder + encoder
"""
def __init__(self,
src_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
bos_idx=0):
super(WrapEncoder, self).__init__()
self.prepare_decoder = PrepareDecoder(
src_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
bos_idx=bos_idx)
self.encoder = Encoder(n_layer, n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd,
postprocess_cmd)
def forward(self, enc_inputs):
src_word, src_pos, src_slf_attn_bias = enc_inputs
enc_input = self.prepare_decoder(src_word, src_pos)
enc_output = self.encoder(enc_input, src_slf_attn_bias)
return enc_output
class Encoder(nn.Layer):
"""
encoder
"""
def __init__(self,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(Encoder, self).__init__()
self.encoder_layers = list()
for i in range(n_layer):
self.encoder_layers.append(
self.add_sublayer(
"layer_%d" % i,
EncoderLayer(n_head, d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd,
postprocess_cmd)))
self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self, enc_input, attn_bias):
for encoder_layer in self.encoder_layers:
enc_output = encoder_layer(enc_input, attn_bias)
enc_input = enc_output
enc_output = self.processer(enc_output)
return enc_output
class EncoderLayer(nn.Layer):
"""
EncoderLayer
"""
def __init__(self,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
attention_dropout)
self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self, enc_input, attn_bias):
attn_output = self.self_attn(
self.preprocesser1(enc_input), None, None, attn_bias)
attn_output = self.postprocesser1(attn_output, enc_input)
ffn_output = self.ffn(self.preprocesser2(attn_output))
ffn_output = self.postprocesser2(ffn_output, attn_output)
return ffn_output
class MultiHeadAttention(nn.Layer):
"""
Multi-Head Attention
"""
def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.):
super(MultiHeadAttention, self).__init__()
self.n_head = n_head
self.d_key = d_key
self.d_value = d_value
self.d_model = d_model
self.dropout_rate = dropout_rate
self.q_fc = paddle.nn.Linear(
in_features=d_model, out_features=d_key * n_head, bias_attr=False)
self.k_fc = paddle.nn.Linear(
in_features=d_model, out_features=d_key * n_head, bias_attr=False)
self.v_fc = paddle.nn.Linear(
in_features=d_model, out_features=d_value * n_head, bias_attr=False)
self.proj_fc = paddle.nn.Linear(
in_features=d_value * n_head, out_features=d_model, bias_attr=False)
def _prepare_qkv(self, queries, keys, values, cache=None):
if keys is None: # self-attention
keys, values = queries, queries
static_kv = False
else: # cross-attention
static_kv = True
q = self.q_fc(queries)
q = paddle.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
q = paddle.transpose(x=q, perm=[0, 2, 1, 3])
if cache is not None and static_kv and "static_k" in cache:
            # encoder-decoder attention at inference time, k/v already cached
k = cache["static_k"]
v = cache["static_v"]
else:
k = self.k_fc(keys)
v = self.v_fc(values)
k = paddle.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
k = paddle.transpose(x=k, perm=[0, 2, 1, 3])
v = paddle.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
v = paddle.transpose(x=v, perm=[0, 2, 1, 3])
if cache is not None:
if static_kv and not "static_k" in cache:
                # encoder-decoder attention at inference time, k/v not yet cached
cache["static_k"], cache["static_v"] = k, v
elif not static_kv:
# for decoder self-attention in inference
cache_k, cache_v = cache["k"], cache["v"]
k = paddle.concat([cache_k, k], axis=2)
v = paddle.concat([cache_v, v], axis=2)
cache["k"], cache["v"] = k, v
return q, k, v
def forward(self, queries, keys, values, attn_bias, cache=None):
        # compute q, k, v
keys = queries if keys is None else keys
values = keys if values is None else values
q, k, v = self._prepare_qkv(queries, keys, values, cache)
# scale dot product attention
product = paddle.matmul(x=q, y=k, transpose_y=True)
product = product * self.d_model**-0.5
if attn_bias is not None:
product += attn_bias
weights = F.softmax(product)
if self.dropout_rate:
weights = F.dropout(
weights, p=self.dropout_rate, mode="downscale_in_infer")
out = paddle.matmul(weights, v)
# combine heads
out = paddle.transpose(out, perm=[0, 2, 1, 3])
out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
# project to output
out = self.proj_fc(out)
return out
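# Scaled dot-product attention with head splitting: q, k and v are reshaped
# to [batch, n_head, seq_len, d_key] (the 0s in the reshape keep the input
# batch and sequence dimensions), logits are scaled by d_model**-0.5,
# attn_bias injects the -1e9 masks before softmax, and the heads are
# concatenated back before the final proj_fc projection.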
class PrePostProcessLayer(nn.Layer):
"""
PrePostProcessLayer
"""
def __init__(self, process_cmd, d_model, dropout_rate):
super(PrePostProcessLayer, self).__init__()
self.process_cmd = process_cmd
self.functors = []
for cmd in self.process_cmd:
if cmd == "a": # add residual connection
self.functors.append(lambda x, y: x + y if y is not None else x)
elif cmd == "n": # add layer normalization
self.functors.append(
self.add_sublayer(
"layer_norm_%d" % len(
self.sublayers(include_sublayers=False)),
paddle.nn.LayerNorm(
normalized_shape=d_model,
weight_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))))
elif cmd == "d": # add dropout
self.functors.append(lambda x: F.dropout(
x, p=dropout_rate, mode="downscale_in_infer")
if dropout_rate else x)
def forward(self, x, residual=None):
for i, cmd in enumerate(self.process_cmd):
if cmd == "a":
x = self.functors[i](x, residual)
else:
x = self.functors[i](x)
return x
class PrepareEncoder(nn.Layer):
def __init__(self,
src_vocab_size,
src_emb_dim,
src_max_len,
dropout_rate=0,
bos_idx=0,
word_emb_param_name=None,
pos_enc_param_name=None):
super(PrepareEncoder, self).__init__()
self.src_emb_dim = src_emb_dim
self.src_max_len = src_max_len
self.emb = paddle.nn.Embedding(
num_embeddings=self.src_max_len,
embedding_dim=self.src_emb_dim,
sparse=True)
self.dropout_rate = dropout_rate
def forward(self, src_word, src_pos):
src_word_emb = src_word
src_word_emb = fluid.layers.cast(src_word_emb, 'float32')
src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5)
src_pos = paddle.squeeze(src_pos, axis=-1)
src_pos_enc = self.emb(src_pos)
src_pos_enc.stop_gradient = True
enc_input = src_word_emb + src_pos_enc
if self.dropout_rate:
out = F.dropout(
x=enc_input, p=self.dropout_rate, mode="downscale_in_infer")
else:
out = enc_input
return out
class PrepareDecoder(nn.Layer):
def __init__(self,
src_vocab_size,
src_emb_dim,
src_max_len,
dropout_rate=0,
bos_idx=0,
word_emb_param_name=None,
pos_enc_param_name=None):
super(PrepareDecoder, self).__init__()
self.src_emb_dim = src_emb_dim
"""
self.emb0 = Embedding(num_embeddings=src_vocab_size,
embedding_dim=src_emb_dim)
"""
self.emb0 = paddle.nn.Embedding(
num_embeddings=src_vocab_size,
embedding_dim=self.src_emb_dim,
padding_idx=bos_idx,
weight_attr=paddle.ParamAttr(
name=word_emb_param_name,
initializer=nn.initializer.Normal(0., src_emb_dim**-0.5)))
self.emb1 = paddle.nn.Embedding(
num_embeddings=src_max_len,
embedding_dim=self.src_emb_dim,
weight_attr=paddle.ParamAttr(name=pos_enc_param_name))
self.dropout_rate = dropout_rate
def forward(self, src_word, src_pos):
src_word = fluid.layers.cast(src_word, 'int64')
src_word = paddle.squeeze(src_word, axis=-1)
src_word_emb = self.emb0(src_word)
src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5)
src_pos = paddle.squeeze(src_pos, axis=-1)
src_pos_enc = self.emb1(src_pos)
src_pos_enc.stop_gradient = True
enc_input = src_word_emb + src_pos_enc
if self.dropout_rate:
out = F.dropout(
x=enc_input, p=self.dropout_rate, mode="downscale_in_infer")
else:
out = enc_input
return out
class FFN(nn.Layer):
"""
Feed-Forward Network
"""
def __init__(self, d_inner_hid, d_model, dropout_rate):
super(FFN, self).__init__()
self.dropout_rate = dropout_rate
self.fc1 = paddle.nn.Linear(
in_features=d_model, out_features=d_inner_hid)
self.fc2 = paddle.nn.Linear(
in_features=d_inner_hid, out_features=d_model)
def forward(self, x):
hidden = self.fc1(x)
hidden = F.relu(hidden)
if self.dropout_rate:
hidden = F.dropout(
hidden, p=self.dropout_rate, mode="downscale_in_infer")
out = self.fc2(hidden)
return out
@@ -26,11 +26,12 @@ def build_post_process(config, global_config=None):
    from .db_postprocess import DBPostProcess
    from .east_postprocess import EASTPostProcess
    from .sast_postprocess import SASTPostProcess
-   from .rec_postprocess import CTCLabelDecode, AttnLabelDecode
+   from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode
    from .cls_postprocess import ClsPostProcess

    support_dict = [
-       'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode', 'AttnLabelDecode', 'ClsPostProcess'
+       'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode',
+       'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode'
    ]

    config = copy.deepcopy(config)
...
@@ -33,6 +33,9 @@ class BaseRecLabelDecode(object):
        assert character_type in support_character_type, "Only {} are supported now but get {}".format(
            support_character_type, character_type)

+       self.beg_str = "sos"
+       self.end_str = "eos"
+
        if character_type == "en":
            self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
            dict_character = list(self.character_str)
@@ -109,7 +112,6 @@ class CTCLabelDecode(BaseRecLabelDecode):
    def __call__(self, preds, label=None, *args, **kwargs):
        if isinstance(preds, paddle.Tensor):
            preds = preds.numpy()
-
        preds_idx = preds.argmax(axis=2)
        preds_prob = preds.max(axis=2)
        text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True)
@@ -133,16 +135,143 @@ class AttnLabelDecode(BaseRecLabelDecode):
                 **kwargs):
        super(AttnLabelDecode, self).__init__(character_dict_path,
                                              character_type, use_space_char)
-       self.beg_str = "sos"
-       self.end_str = "eos"

    def add_special_char(self, dict_character):
-       dict_character = [self.beg_str, self.end_str] + dict_character
+       self.beg_str = "sos"
+       self.end_str = "eos"
+       dict_character = dict_character
+       dict_character = [self.beg_str] + dict_character + [self.end_str]
        return dict_character

-   def __call__(self, text):
+   def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
+       """ convert text-index into text-label. """
+       result_list = []
+       ignored_tokens = self.get_ignored_tokens()
+       [beg_idx, end_idx] = self.get_ignored_tokens()
+       batch_size = len(text_index)
+       for batch_idx in range(batch_size):
+           char_list = []
+           conf_list = []
+           for idx in range(len(text_index[batch_idx])):
+               if text_index[batch_idx][idx] in ignored_tokens:
+                   continue
+               if int(text_index[batch_idx][idx]) == int(end_idx):
+                   break
+               if is_remove_duplicate:
+                   # only for predict
+                   if idx > 0 and text_index[batch_idx][idx - 1] == text_index[
+                           batch_idx][idx]:
+                       continue
+               char_list.append(self.character[int(text_index[batch_idx][
+                   idx])])
+               if text_prob is not None:
+                   conf_list.append(text_prob[batch_idx][idx])
+               else:
+                   conf_list.append(1)
+           text = ''.join(char_list)
+           result_list.append((text, np.mean(conf_list)))
+       return result_list

+   def __call__(self, preds, label=None, *args, **kwargs):
+       """
        text = self.decode(text)
+       if label is None:
+           return text
+       else:
+           label = self.decode(label, is_remove_duplicate=False)
+           return text, label
+       """
+       if isinstance(preds, paddle.Tensor):
+           preds = preds.numpy()
+
+       preds_idx = preds.argmax(axis=2)
+       preds_prob = preds.max(axis=2)
+       text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False)
+       if label is None:
+           return text
+       label = self.decode(label, is_remove_duplicate=False)
+       return text, label

+   def get_ignored_tokens(self):
+       beg_idx = self.get_beg_end_flag_idx("beg")
+       end_idx = self.get_beg_end_flag_idx("end")
+       return [beg_idx, end_idx]

+   def get_beg_end_flag_idx(self, beg_or_end):
+       if beg_or_end == "beg":
+           idx = np.array(self.dict[self.beg_str])
+       elif beg_or_end == "end":
+           idx = np.array(self.dict[self.end_str])
+       else:
+           assert False, "unsupport type %s in get_beg_end_flag_idx" \
+               % beg_or_end
+       return idx


+class SRNLabelDecode(BaseRecLabelDecode):
+   """ Convert between text-label and text-index """

+   def __init__(self,
+                character_dict_path=None,
+                character_type='en',
+                use_space_char=False,
+                **kwargs):
+       super(SRNLabelDecode, self).__init__(character_dict_path,
+                                            character_type, use_space_char)

+   def __call__(self, preds, label=None, *args, **kwargs):
+       pred = preds['predict']
+       char_num = len(self.character_str) + 2
+       if isinstance(pred, paddle.Tensor):
+           pred = pred.numpy()
+       pred = np.reshape(pred, [-1, char_num])

+       preds_idx = np.argmax(pred, axis=1)
+       preds_prob = np.max(pred, axis=1)

+       preds_idx = np.reshape(preds_idx, [-1, 25])
+       preds_prob = np.reshape(preds_prob, [-1, 25])

+       text = self.decode(preds_idx, preds_prob)

+       if label is None:
+           text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False)
+           return text
+       label = self.decode(label)
+       return text, label

+   def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
+       """ convert text-index into text-label. """
+       result_list = []
+       ignored_tokens = self.get_ignored_tokens()
+       batch_size = len(text_index)

+       for batch_idx in range(batch_size):
+           char_list = []
+           conf_list = []
+           for idx in range(len(text_index[batch_idx])):
+               if text_index[batch_idx][idx] in ignored_tokens:
+                   continue
+               if is_remove_duplicate:
+                   # only for predict
+                   if idx > 0 and text_index[batch_idx][idx - 1] == text_index[
+                           batch_idx][idx]:
+                       continue
+               char_list.append(self.character[int(text_index[batch_idx][
+                   idx])])
+               if text_prob is not None:
+                   conf_list.append(text_prob[batch_idx][idx])
+               else:
+                   conf_list.append(1)
+           text = ''.join(char_list)
+           result_list.append((text, np.mean(conf_list)))
+       return result_list

+   def add_special_char(self, dict_character):
+       dict_character = dict_character + [self.beg_str, self.end_str]
+       return dict_character

    def get_ignored_tokens(self):
        beg_idx = self.get_beg_end_flag_idx("beg")
...
@@ -31,6 +31,14 @@ from ppocr.utils.logging import get_logger
from tools.program import load_config, merge_config, ArgsParser

+def parse_args():
+   parser = argparse.ArgumentParser()
+   parser.add_argument("-c", "--config", help="configuration file to use")
+   parser.add_argument(
+       "-o", "--output_path", type=str, default='./output/infer/')
+   return parser.parse_args()

def main():
    FLAGS = ArgsParser().parse_args()
    config = load_config(FLAGS.config)
@@ -52,6 +60,22 @@ def main():
    save_path = '{}/inference'.format(config['Global']['save_inference_dir'])

+   if config['Architecture']['algorithm'] == "SRN":
+       other_shape = [
+           paddle.static.InputSpec(
+               shape=[None, 1, 64, 256], dtype='float32'), [
+                   paddle.static.InputSpec(
+                       shape=[None, 256, 1],
+                       dtype="int64"), paddle.static.InputSpec(
+                           shape=[None, 25, 1],
+                           dtype="int64"), paddle.static.InputSpec(
+                               shape=[None, 8, 25, 25], dtype="int64"),
+                   paddle.static.InputSpec(
+                       shape=[None, 8, 25, 25], dtype="int64")
+               ]
+       ]
+       model = to_static(model, input_spec=other_shape)
+   else:
        infer_shape = [3, -1, -1]
        if config['Architecture']['model_type'] == "rec":
            infer_shape = [3, 32, -1]  # for rec model, H must be 32
@@ -62,13 +86,13 @@ def main():
                'When there is tps in the network, variable length input is not supported, and the input size needs to be the same as during training'
            )
            infer_shape[-1] = 100
        model = to_static(
            model,
            input_spec=[
                paddle.static.InputSpec(
                    shape=[None] + infer_shape, dtype='float32')
            ])

    paddle.jit.save(model, save_path)
    logger.info('inference model is saved to {}'.format(save_path))
...
@@ -25,6 +25,7 @@ import numpy as np
import math
import time
import traceback
+import paddle

import tools.infer.utility as utility
from ppocr.postprocess import build_post_process
@@ -46,6 +47,13 @@ class TextRecognizer(object):
            "character_dict_path": args.rec_char_dict_path,
            "use_space_char": args.use_space_char
        }
+       if self.rec_algorithm == "SRN":
+           postprocess_params = {
+               'name': 'SRNLabelDecode',
+               "character_type": args.rec_char_type,
+               "character_dict_path": args.rec_char_dict_path,
+               "use_space_char": args.use_space_char
+           }
        self.postprocess_op = build_post_process(postprocess_params)
        self.predictor, self.input_tensor, self.output_tensors = \
            utility.create_predictor(args, 'rec', logger)
@@ -70,6 +78,78 @@ class TextRecognizer(object):
            padding_im[:, :, 0:resized_w] = resized_image
        return padding_im
def resize_norm_img_srn(self, img, image_shape):
imgC, imgH, imgW = image_shape
img_black = np.zeros((imgH, imgW))
im_hei = img.shape[0]
im_wid = img.shape[1]
if im_wid <= im_hei * 1:
img_new = cv2.resize(img, (imgH * 1, imgH))
elif im_wid <= im_hei * 2:
img_new = cv2.resize(img, (imgH * 2, imgH))
elif im_wid <= im_hei * 3:
img_new = cv2.resize(img, (imgH * 3, imgH))
else:
img_new = cv2.resize(img, (imgW, imgH))
img_np = np.asarray(img_new)
img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
img_black[:, 0:img_np.shape[1]] = img_np
img_black = img_black[:, :, np.newaxis]
row, col, c = img_black.shape
c = 1
return np.reshape(img_black, (c, row, col)).astype(np.float32)
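    # Example of the width bucketing above (illustrative numbers): with the
    # SRN shape [1, 64, 256], a 48x80 crop has width <= 2x its height, so it
    # is resized to 64x128, converted to grayscale, pasted into the black
    # 64x256 canvas, and returned as a [1, 64, 256] float32 array.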
def srn_other_inputs(self, image_shape, num_heads, max_text_length):
imgC, imgH, imgW = image_shape
feature_dim = int((imgH / 8) * (imgW / 8))
encoder_word_pos = np.array(range(0, feature_dim)).reshape(
(feature_dim, 1)).astype('int64')
gsrm_word_pos = np.array(range(0, max_text_length)).reshape(
(max_text_length, 1)).astype('int64')
gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
[-1, 1, max_text_length, max_text_length])
gsrm_slf_attn_bias1 = np.tile(
gsrm_slf_attn_bias1,
[1, num_heads, 1, 1]).astype('float32') * [-1e9]
gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
[-1, 1, max_text_length, max_text_length])
gsrm_slf_attn_bias2 = np.tile(
gsrm_slf_attn_bias2,
[1, num_heads, 1, 1]).astype('float32') * [-1e9]
encoder_word_pos = encoder_word_pos[np.newaxis, :]
gsrm_word_pos = gsrm_word_pos[np.newaxis, :]
return [
encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2
]
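    # The two bias tensors above are the GSRM's causal masks: np.triu(., 1)
    # marks strictly-future positions and np.tril(., -1) strictly-past ones;
    # scaling by -1e9 turns the marked logits into near-zero attention
    # weights after softmax. Both end up shaped [1, num_heads, 25, 25] with
    # the max_text_length of 25 passed by the caller.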
def process_image_srn(self, img, image_shape, num_heads, max_text_length):
norm_img = self.resize_norm_img_srn(img, image_shape)
norm_img = norm_img[np.newaxis, :]
[encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \
self.srn_other_inputs(image_shape, num_heads, max_text_length)
gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
encoder_word_pos = encoder_word_pos.astype(np.int64)
gsrm_word_pos = gsrm_word_pos.astype(np.int64)
return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2)
    def __call__(self, img_list):
        img_num = len(img_list)
        # Calculate the aspect ratio of all text bars
@@ -93,21 +173,64 @@ class TextRecognizer(object):
                wh_ratio = w * 1.0 / h
                max_wh_ratio = max(max_wh_ratio, wh_ratio)
            for ino in range(beg_img_no, end_img_no):
-               # norm_img = self.resize_norm_img(img_list[ino], max_wh_ratio)
+               if self.rec_algorithm != "SRN":
                    norm_img = self.resize_norm_img(img_list[indices[ino]],
                                                    max_wh_ratio)
                    norm_img = norm_img[np.newaxis, :]
                    norm_img_batch.append(norm_img)
+               else:
+                   norm_img = self.process_image_srn(
+                       img_list[indices[ino]], self.rec_image_shape, 8, 25)
+                   encoder_word_pos_list = []
+                   gsrm_word_pos_list = []
+                   gsrm_slf_attn_bias1_list = []
+                   gsrm_slf_attn_bias2_list = []
+                   encoder_word_pos_list.append(norm_img[1])
+                   gsrm_word_pos_list.append(norm_img[2])
+                   gsrm_slf_attn_bias1_list.append(norm_img[3])
+                   gsrm_slf_attn_bias2_list.append(norm_img[4])
+                   norm_img_batch.append(norm_img[0])
            norm_img_batch = np.concatenate(norm_img_batch)
            norm_img_batch = norm_img_batch.copy()

+           if self.rec_algorithm == "SRN":
+               starttime = time.time()
+               encoder_word_pos_list = np.concatenate(encoder_word_pos_list)
+               gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list)
+               gsrm_slf_attn_bias1_list = np.concatenate(
+                   gsrm_slf_attn_bias1_list)
+               gsrm_slf_attn_bias2_list = np.concatenate(
+                   gsrm_slf_attn_bias2_list)
+
+               inputs = [
+                   norm_img_batch,
+                   encoder_word_pos_list,
+                   gsrm_word_pos_list,
+                   gsrm_slf_attn_bias1_list,
+                   gsrm_slf_attn_bias2_list,
+               ]
+               input_names = self.predictor.get_input_names()
+               for i in range(len(input_names)):
+                   input_tensor = self.predictor.get_input_handle(input_names[
+                       i])
+                   input_tensor.copy_from_cpu(inputs[i])
+               self.predictor.run()
+               outputs = []
+               for output_tensor in self.output_tensors:
+                   output = output_tensor.copy_to_cpu()
+                   outputs.append(output)
+               preds = {"predict": outputs[2]}
+           else:
                starttime = time.time()
                self.input_tensor.copy_from_cpu(norm_img_batch)
                self.predictor.run()
                outputs = []
                for output_tensor in self.output_tensors:
                    output = output_tensor.copy_to_cpu()
                    outputs.append(output)
                preds = outputs[0]
            rec_result = self.postprocess_op(preds)
            for rno in range(len(rec_result)):
                rec_res[indices[beg_img_no + rno]] = rec_result[rno]
...
@@ -62,6 +62,12 @@ def main():
        elif op_name in ['RecResizeImg']:
            op[op_name]['infer_mode'] = True
        elif op_name == 'KeepKeys':
+           if config['Architecture']['algorithm'] == "SRN":
+               op[op_name]['keep_keys'] = [
+                   'image', 'encoder_word_pos', 'gsrm_word_pos',
+                   'gsrm_slf_attn_bias1', 'gsrm_slf_attn_bias2'
+               ]
+           else:
                op[op_name]['keep_keys'] = ['image']
        transforms.append(op)
    global_config['infer_mode'] = True
@@ -74,9 +80,24 @@ def main():
        img = f.read()
        data = {'image': img}
        batch = transform(data, ops)
+       if config['Architecture']['algorithm'] == "SRN":
+           encoder_word_pos_list = np.expand_dims(batch[1], axis=0)
+           gsrm_word_pos_list = np.expand_dims(batch[2], axis=0)
+           gsrm_slf_attn_bias1_list = np.expand_dims(batch[3], axis=0)
+           gsrm_slf_attn_bias2_list = np.expand_dims(batch[4], axis=0)
+
+           others = [
+               paddle.to_tensor(encoder_word_pos_list),
+               paddle.to_tensor(gsrm_word_pos_list),
+               paddle.to_tensor(gsrm_slf_attn_bias1_list),
+               paddle.to_tensor(gsrm_slf_attn_bias2_list)
+           ]
+
        images = np.expand_dims(batch[0], axis=0)
        images = paddle.to_tensor(images)
+       if config['Architecture']['algorithm'] == "SRN":
+           preds = model(images, others)
+       else:
            preds = model(images)
        post_result = post_process_class(preds)
        for rec_reuslt in post_result:
...
@@ -174,6 +174,7 @@ def train(config,
    best_model_dict = {main_indicator: 0}
    best_model_dict.update(pre_best_model_dict)
    train_stats = TrainingStats(log_smooth_window, ['lr'])
+   model_average = False
    model.train()

    if 'start_epoch' in best_model_dict:
@@ -194,6 +195,11 @@ def train(config,
                break
            lr = optimizer.get_lr()
            images = batch[0]
+           if config['Architecture']['algorithm'] == "SRN":
+               others = batch[-4:]
+               preds = model(images, others)
+               model_average = True
+           else:
                preds = model(images)
            loss = loss_class(preds, batch)
            avg_loss = loss['loss']
@@ -216,8 +222,8 @@ def train(config,
                batch = [item.numpy() for item in batch]
                post_result = post_process_class(preds, batch[1])
                eval_class(post_result, batch)
-               metirc = eval_class.get_metric()
-               train_stats.update(metirc)
+               metric = eval_class.get_metric()
+               train_stats.update(metric)

            if vdl_writer is not None and dist.get_rank() == 0:
                for k, v in train_stats.get().items():
@@ -238,6 +244,13 @@ def train(config,
            # eval
            if global_step > start_eval_step and \
                    (global_step - start_eval_step) % eval_batch_step == 0 and dist.get_rank() == 0:
+               if model_average:
+                   Model_Average = paddle.incubate.optimizer.ModelAverage(
+                       0.15,
+                       parameters=model.parameters(),
+                       min_average_window=10000,
+                       max_average_window=15625)
+                   Model_Average.apply()
                cur_metric = eval(model, valid_dataloader, post_process_class,
                                  eval_class)
                cur_metric_str = 'cur metric, {}'.format(', '.join(
@@ -273,6 +286,7 @@ def train(config,
                        best_model_dict[main_indicator],
                        global_step)
            global_step += 1
+           optimizer.clear_grad()
            batch_start = time.time()
        if dist.get_rank() == 0:
            save_model(
@@ -313,6 +327,10 @@ def eval(model, valid_dataloader, post_process_class, eval_class):
                break
            images = batch[0]
            start = time.time()
+           if "SRN" in str(model.head):
+               others = batch[-4:]
+               preds = model(images, others)
+           else:
                preds = model(images)
            batch = [item.numpy() for item in batch]
...
-# for paddle.__version__ >= 2.0rc1
+# recommended paddle.__version__ == 2.0.0
python3 -m paddle.distributed.launch --gpus '0,1,2,3,4,5,6,7' tools/train.py -c configs/rec/rec_mv3_none_bilstm_ctc.yml
-# for paddle.__version__ < 2.0rc1
-# python3 -m paddle.distributed.launch --selected_gpus '0,1,2,3,4,5,6,7' tools/train.py -c configs/rec/rec_mv3_none_bilstm_ctc.yml