first commit

f1506916 · sugon_cxj · 55c28ed5 · f1506916 · f1506916 · f1506916
Commit f1506916 authored May 18, 2023 by sugon_cxj
20 changed files
--- a/ppocr/modeling/necks/db_fpn.py
+++ b/ppocr/modeling/necks/db_fpn.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+import os
+import sys
+
+# Extend sys.path with this file's directory and with the directory three
+# levels above it, so the absolute `ppocr.*` import that follows can resolve
+# when the package is not installed (presumably the repository root, given
+# this file lives at ppocr/modeling/necks/ -- confirm).
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(__dir__)
+sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../../..')))
+
+from ppocr.modeling.backbones.det_mobilenet_v3 import SEModule
+
+
+class DSConv(nn.Layer):
+    """Grouped conv -> BN -> 1x1 expand (x4) -> BN -> act -> 1x1 project.
+
+    ``conv1`` keeps the channel count and is depthwise by default (``groups``
+    falls back to ``in_channels``). ``conv2`` expands to ``4 * in_channels``,
+    ``conv3`` projects to ``out_channels``. When ``in_channels`` differs from
+    ``out_channels`` a 1x1 projection of the block input is added to the
+    output.
+
+    Args:
+        in_channels (int): channels of the input feature map.
+        out_channels (int): channels of the output feature map.
+        kernel_size (int): kernel size of the first (grouped) convolution.
+        padding (int): padding of the first convolution.
+        stride (int): stride of the first convolution. Default: 1.
+        groups (int|None): groups of the first convolution; ``None`` means
+            ``in_channels``. Default: None.
+        if_act (bool): whether to apply the activation after ``bn2``.
+            Default: True.
+        act (str): ``"relu"`` or ``"hardswish"``. Default: ``"relu"``.
+        **kwargs: accepted and ignored, so the layer can be constructed with
+            the same keyword arguments as ``nn.Conv2D`` (LKPAN passes
+            ``weight_attr``/``bias_attr`` to whichever layer it selected).
+
+    Raises:
+        ValueError: from ``forward`` when ``if_act`` is set and ``act`` is
+            not one of the supported names.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 padding,
+                 stride=1,
+                 groups=None,
+                 if_act=True,
+                 act="relu",
+                 **kwargs):
+        super(DSConv, self).__init__()
+        # Identity test for the None sentinel; `== None` defers to __eq__.
+        if groups is None:
+            groups = in_channels
+        self.if_act = if_act
+        self.act = act
+        expanded_channels = int(in_channels * 4)
+
+        self.conv1 = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias_attr=False)
+
+        self.bn1 = nn.BatchNorm(num_channels=in_channels, act=None)
+
+        self.conv2 = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=expanded_channels,
+            kernel_size=1,
+            stride=1,
+            bias_attr=False)
+
+        self.bn2 = nn.BatchNorm(num_channels=expanded_channels, act=None)
+
+        self.conv3 = nn.Conv2D(
+            in_channels=expanded_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            bias_attr=False)
+        # Remembered so forward() knows whether the projection shortcut exists.
+        self._c = [in_channels, out_channels]
+        if in_channels != out_channels:
+            # NOTE(review): this shortcut always uses stride 1, so it presumably
+            # only matches the main branch spatially when `stride` is 1 -- confirm.
+            self.conv_end = nn.Conv2D(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+                stride=1,
+                bias_attr=False)
+
+    def forward(self, inputs):
+        """Run the block on ``inputs`` and return the projected feature map."""
+        x = self.bn1(self.conv1(inputs))
+        x = self.bn2(self.conv2(x))
+
+        if self.if_act:
+            if self.act == "relu":
+                x = F.relu(x)
+            elif self.act == "hardswish":
+                x = F.hardswish(x)
+            else:
+                # Raise instead of print + exit(): a layer must not terminate
+                # the whole interpreter, and the error should be catchable.
+                raise ValueError(
+                    "The activation function({}) is selected incorrectly.".
+                    format(self.act))
+
+        x = self.conv3(x)
+        if self._c[0] != self._c[1]:
+            x = x + self.conv_end(inputs)
+        return x
+
+
+class DBFPN(nn.Layer):
+    """FPN neck that fuses four backbone levels into one feature map.
+
+    Each input level goes through a 1x1 lateral conv to ``out_channels``,
+    the levels are merged top-down by nearest upsampling and addition, each
+    merged level is reduced to ``out_channels // 4`` by a 3x3 conv, and the
+    four results are upsampled to the finest level and concatenated on the
+    channel axis.
+
+    Args:
+        in_channels (list[int]): channels of the four inputs, finest first.
+        out_channels (int): channels of the lateral convs; the fused output
+            has ``4 * (out_channels // 4)`` channels.
+        **kwargs: accepted and ignored.
+    """
+
+    def __init__(self, in_channels, out_channels, **kwargs):
+        super(DBFPN, self).__init__()
+        self.out_channels = out_channels
+        kaiming = paddle.nn.initializer.KaimingUniform()
+
+        def lateral_conv(num_in):
+            # 1x1 conv bringing one backbone level to the common width.
+            return nn.Conv2D(
+                in_channels=num_in,
+                out_channels=self.out_channels,
+                kernel_size=1,
+                weight_attr=ParamAttr(initializer=kaiming),
+                bias_attr=False)
+
+        def reduce_conv():
+            # 3x3 conv shrinking a merged level to a quarter of the width.
+            return nn.Conv2D(
+                in_channels=self.out_channels,
+                out_channels=self.out_channels // 4,
+                kernel_size=3,
+                padding=1,
+                weight_attr=ParamAttr(initializer=kaiming),
+                bias_attr=False)
+
+        # Attribute names and creation order are kept as they were, since
+        # both determine how the sublayers are registered.
+        self.in2_conv = lateral_conv(in_channels[0])
+        self.in3_conv = lateral_conv(in_channels[1])
+        self.in4_conv = lateral_conv(in_channels[2])
+        self.in5_conv = lateral_conv(in_channels[3])
+        self.p5_conv = reduce_conv()
+        self.p4_conv = reduce_conv()
+        self.p3_conv = reduce_conv()
+        self.p2_conv = reduce_conv()
+
+    def forward(self, x):
+        """Fuse the four feature maps in ``x`` (finest first) into one."""
+
+        def enlarge(feat, factor):
+            return F.upsample(
+                feat, scale_factor=factor, mode="nearest", align_mode=1)
+
+        c2, c3, c4, c5 = x
+
+        in5 = self.in5_conv(c5)
+        in4 = self.in4_conv(c4)
+        in3 = self.in3_conv(c3)
+        in2 = self.in2_conv(c2)
+
+        # Top-down merge; the original annotations mark these as the
+        # 1/16, 1/8 and 1/4 scales respectively.
+        out4 = in4 + enlarge(in5, 2)
+        out3 = in3 + enlarge(out4, 2)
+        out2 = in2 + enlarge(out3, 2)
+
+        p5 = enlarge(self.p5_conv(in5), 8)
+        p4 = enlarge(self.p4_conv(out4), 4)
+        p3 = enlarge(self.p3_conv(out3), 2)
+        p2 = self.p2_conv(out2)
+
+        return paddle.concat([p5, p4, p3, p2], axis=1)
+
+
+class RSELayer(nn.Layer):
+    """Convolution followed by an SE block, optionally residual.
+
+    Args:
+        in_channels (int): channels of the input feature map.
+        out_channels (int): channels produced by the convolution.
+        kernel_size (int): kernel size; padding is ``kernel_size // 2``.
+        shortcut (bool): when True the conv output is added to the SE
+            output, otherwise only the SE output is returned. Default: True.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, shortcut=True):
+        super(RSELayer, self).__init__()
+        kaiming = paddle.nn.initializer.KaimingUniform()
+        self.out_channels = out_channels
+        self.in_conv = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=self.out_channels,
+            kernel_size=kernel_size,
+            padding=int(kernel_size // 2),
+            weight_attr=ParamAttr(initializer=kaiming),
+            bias_attr=False)
+        self.se_block = SEModule(self.out_channels)
+        self.shortcut = shortcut
+
+    def forward(self, ins):
+        """Apply conv then SE to ``ins``; add the conv output if residual."""
+        conv_out = self.in_conv(ins)
+        se_out = self.se_block(conv_out)
+        if not self.shortcut:
+            return se_out
+        return conv_out + se_out
+
+
+class RSEFPN(nn.Layer):
+    """FPN neck whose lateral and reduction convs are RSELayers.
+
+    Args:
+        in_channels (list[int]): channels of the inputs, finest first;
+            ``forward`` expects exactly four feature maps.
+        out_channels (int): width of the lateral layers; each reduction
+            layer outputs ``out_channels // 4`` channels.
+        shortcut (bool): forwarded to every RSELayer. Default: True.
+        **kwargs: accepted and ignored.
+    """
+
+    def __init__(self, in_channels, out_channels, shortcut=True, **kwargs):
+        super(RSEFPN, self).__init__()
+        self.out_channels = out_channels
+        self.ins_conv = nn.LayerList()
+        self.inp_conv = nn.LayerList()
+
+        # One lateral (1x1) and one reduction (3x3) layer per input level,
+        # created alternately in level order.
+        for level_channels in in_channels:
+            lateral = RSELayer(
+                level_channels, out_channels, kernel_size=1, shortcut=shortcut)
+            self.ins_conv.append(lateral)
+            reduction = RSELayer(
+                out_channels,
+                out_channels // 4,
+                kernel_size=3,
+                shortcut=shortcut)
+            self.inp_conv.append(reduction)
+
+    def forward(self, x):
+        """Fuse the four feature maps in ``x`` (finest first) into one."""
+
+        def enlarge(feat, factor):
+            return F.upsample(
+                feat, scale_factor=factor, mode="nearest", align_mode=1)
+
+        c2, c3, c4, c5 = x
+
+        in5 = self.ins_conv[3](c5)
+        in4 = self.ins_conv[2](c4)
+        in3 = self.ins_conv[1](c3)
+        in2 = self.ins_conv[0](c2)
+
+        # Top-down merge; annotated upstream as 1/16, 1/8 and 1/4 scale.
+        out4 = in4 + enlarge(in5, 2)
+        out3 = in3 + enlarge(out4, 2)
+        out2 = in2 + enlarge(out3, 2)
+
+        p5 = self.inp_conv[3](in5)
+        p4 = self.inp_conv[2](out4)
+        p3 = self.inp_conv[1](out3)
+        p2 = self.inp_conv[0](out2)
+
+        # Bring every level to the resolution of the finest one.
+        p5 = enlarge(p5, 8)
+        p4 = enlarge(p4, 4)
+        p3 = enlarge(p3, 2)
+
+        return paddle.concat([p5, p4, p3, p2], axis=1)
+
+
+class LKPAN(nn.Layer):
+    """FPN top-down path followed by a bottom-up PAN path with 9x9 convs.
+
+    Args:
+        in_channels (list[int]): channels of the inputs, finest first;
+            ``forward`` expects exactly four feature maps.
+        out_channels (int): width of the 1x1 lateral convs; all later
+            layers work on ``out_channels // 4`` channels.
+        mode (str): ``'lite'`` builds the 9x9 layers with DSConv,
+            ``'large'`` with ``nn.Conv2D`` (case-insensitive).
+            Default: ``'large'``.
+        **kwargs: accepted and ignored.
+
+    Raises:
+        ValueError: if ``mode`` is neither ``'lite'`` nor ``'large'``.
+    """
+
+    def __init__(self, in_channels, out_channels, mode='large', **kwargs):
+        super(LKPAN, self).__init__()
+        self.out_channels = out_channels
+        kaiming = paddle.nn.initializer.KaimingUniform()
+
+        self.ins_conv = nn.LayerList()
+        self.inp_conv = nn.LayerList()
+        # pan head
+        self.pan_head_conv = nn.LayerList()
+        self.pan_lat_conv = nn.LayerList()
+
+        large_kernel_layers = {'lite': DSConv, 'large': nn.Conv2D}
+        mode_key = mode.lower()
+        if mode_key not in large_kernel_layers:
+            raise ValueError(
+                "mode can only be one of ['lite', 'large'], but received {}".
+                format(mode))
+        p_layer = large_kernel_layers[mode_key]
+
+        quarter_channels = self.out_channels // 4
+        for level, level_channels in enumerate(in_channels):
+            # 1x1 lateral conv to the common width.
+            self.ins_conv.append(
+                nn.Conv2D(
+                    in_channels=level_channels,
+                    out_channels=self.out_channels,
+                    kernel_size=1,
+                    weight_attr=ParamAttr(initializer=kaiming),
+                    bias_attr=False))
+
+            # 9x9 layer reducing the merged level to a quarter of the width.
+            self.inp_conv.append(
+                p_layer(
+                    in_channels=self.out_channels,
+                    out_channels=quarter_channels,
+                    kernel_size=9,
+                    padding=4,
+                    weight_attr=ParamAttr(initializer=kaiming),
+                    bias_attr=False))
+
+            # Stride-2 conv feeding level `level` from the level below it;
+            # the finest level has nothing below, hence one fewer entry.
+            if level > 0:
+                self.pan_head_conv.append(
+                    nn.Conv2D(
+                        in_channels=quarter_channels,
+                        out_channels=quarter_channels,
+                        kernel_size=3,
+                        padding=1,
+                        stride=2,
+                        weight_attr=ParamAttr(initializer=kaiming),
+                        bias_attr=False))
+
+            # 9x9 layer applied after the bottom-up merge.
+            self.pan_lat_conv.append(
+                p_layer(
+                    in_channels=quarter_channels,
+                    out_channels=quarter_channels,
+                    kernel_size=9,
+                    padding=4,
+                    weight_attr=ParamAttr(initializer=kaiming),
+                    bias_attr=False))
+
+    def forward(self, x):
+        """Fuse the four feature maps in ``x`` (finest first) into one."""
+
+        def enlarge(feat, factor):
+            return F.upsample(
+                feat, scale_factor=factor, mode="nearest", align_mode=1)
+
+        c2, c3, c4, c5 = x
+
+        in5 = self.ins_conv[3](c5)
+        in4 = self.ins_conv[2](c4)
+        in3 = self.ins_conv[1](c3)
+        in2 = self.ins_conv[0](c2)
+
+        # Top-down merge; annotated upstream as 1/16, 1/8 and 1/4 scale.
+        out4 = in4 + enlarge(in5, 2)
+        out3 = in3 + enlarge(out4, 2)
+        out2 = in2 + enlarge(out3, 2)
+
+        f5 = self.inp_conv[3](in5)
+        f4 = self.inp_conv[2](out4)
+        f3 = self.inp_conv[1](out3)
+        f2 = self.inp_conv[0](out2)
+
+        # Bottom-up merge: each level receives the downsampled level below.
+        pan3 = f3 + self.pan_head_conv[0](f2)
+        pan4 = f4 + self.pan_head_conv[1](pan3)
+        pan5 = f5 + self.pan_head_conv[2](pan4)
+
+        p2 = self.pan_lat_conv[0](f2)
+        p3 = self.pan_lat_conv[1](pan3)
+        p4 = self.pan_lat_conv[2](pan4)
+        p5 = self.pan_lat_conv[3](pan5)
+
+        # Bring every level to the resolution of the finest one.
+        p5 = enlarge(p5, 8)
+        p4 = enlarge(p4, 4)
+        p3 = enlarge(p3, 2)
+
+        return paddle.concat([p5, p4, p3, p2], axis=1)
--- a/ppocr/modeling/necks/east_fpn.py
+++ b/ppocr/modeling/necks/east_fpn.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+
+
+class ConvBNLayer(nn.Layer):
+    """2-D convolution (no bias) followed by batch normalization.
+
+    Args:
+        in_channels (int): channels of the input feature map.
+        out_channels (int): channels of the output feature map.
+        kernel_size (int): convolution kernel size.
+        stride (int): convolution stride.
+        padding (int): convolution padding.
+        groups (int): convolution groups. Default: 1.
+        if_act (bool): stored on the instance but not consulted here; the
+            activation is controlled by ``act`` alone. Default: True.
+        act (str|None): activation handed to ``nn.BatchNorm``, which applies
+            it after normalization. Default: None.
+        name (str|None): prefix used to build explicit parameter names
+            (``<name>_weights``, ``bn_<name>_scale``, ...). With ``None`` the
+            parameters are left unnamed. Default: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 groups=1,
+                 if_act=True,
+                 act=None,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+        self.if_act = if_act
+        self.act = act
+
+        # The signature allows name=None, but concatenating None with a str
+        # raises TypeError. Build explicit names only when a prefix is given
+        # and otherwise pass None so paddle picks the names itself.
+        if name is None:
+            conv_weight_name = None
+            bn_scale_name = bn_offset_name = None
+            bn_mean_name = bn_variance_name = None
+        else:
+            conv_weight_name = name + '_weights'
+            bn_scale_name = "bn_" + name + "_scale"
+            bn_offset_name = "bn_" + name + "_offset"
+            bn_mean_name = "bn_" + name + "_mean"
+            bn_variance_name = "bn_" + name + "_variance"
+
+        self.conv = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            weight_attr=ParamAttr(name=conv_weight_name),
+            bias_attr=False)
+
+        self.bn = nn.BatchNorm(
+            num_channels=out_channels,
+            act=act,
+            param_attr=ParamAttr(name=bn_scale_name),
+            bias_attr=ParamAttr(name=bn_offset_name),
+            moving_mean_name=bn_mean_name,
+            moving_variance_name=bn_variance_name)
+
+    def forward(self, x):
+        """Return ``bn(conv(x))``."""
+        x = self.conv(x)
+        x = self.bn(x)
+        return x
+
+
+class DeConvBNLayer(nn.Layer):
+    """2-D transposed convolution (no bias) followed by batch normalization.
+
+    Args:
+        in_channels (int): channels of the input feature map.
+        out_channels (int): channels of the output feature map.
+        kernel_size (int): transposed-convolution kernel size.
+        stride (int): transposed-convolution stride.
+        padding (int): transposed-convolution padding.
+        groups (int): convolution groups. Default: 1.
+        if_act (bool): stored on the instance but not consulted here; the
+            activation is controlled by ``act`` alone. Default: True.
+        act (str|None): activation handed to ``nn.BatchNorm``, which applies
+            it after normalization. Default: None.
+        name (str|None): prefix used to build explicit parameter names
+            (``<name>_weights``, ``bn_<name>_scale``, ...). With ``None`` the
+            parameters are left unnamed. Default: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 groups=1,
+                 if_act=True,
+                 act=None,
+                 name=None):
+        super(DeConvBNLayer, self).__init__()
+        self.if_act = if_act
+        self.act = act
+
+        # The signature allows name=None, but concatenating None with a str
+        # raises TypeError. Build explicit names only when a prefix is given
+        # and otherwise pass None so paddle picks the names itself.
+        if name is None:
+            deconv_weight_name = None
+            bn_scale_name = bn_offset_name = None
+            bn_mean_name = bn_variance_name = None
+        else:
+            deconv_weight_name = name + '_weights'
+            bn_scale_name = "bn_" + name + "_scale"
+            bn_offset_name = "bn_" + name + "_offset"
+            bn_mean_name = "bn_" + name + "_mean"
+            bn_variance_name = "bn_" + name + "_variance"
+
+        self.deconv = nn.Conv2DTranspose(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            weight_attr=ParamAttr(name=deconv_weight_name),
+            bias_attr=False)
+        self.bn = nn.BatchNorm(
+            num_channels=out_channels,
+            act=act,
+            param_attr=ParamAttr(name=bn_scale_name),
+            bias_attr=ParamAttr(name=bn_offset_name),
+            moving_mean_name=bn_mean_name,
+            moving_variance_name=bn_variance_name)
+
+    def forward(self, x):
+        """Return ``bn(deconv(x))``."""
+        x = self.deconv(x)
+        x = self.bn(x)
+        return x
+
+
+class EASTFPN(nn.Layer):
+    """U-shaped neck: upsample the deepest feature and merge the others.
+
+    Starting from the last input, each stage upsamples with a stride-2
+    transposed conv, concatenates the next input on the channel axis and
+    merges with a 3x3 conv; a final 3x3 conv produces the output.
+
+    Args:
+        in_channels (list[int]): channels of the four inputs in the order
+            they are passed to ``forward``; stored reversed.
+        model_name (str): ``"large"`` selects 128 output channels, anything
+            else selects 64.
+        **kwargs: accepted and ignored.
+    """
+
+    def __init__(self, in_channels, model_name, **kwargs):
+        super(EASTFPN, self).__init__()
+        self.model_name = model_name
+        self.out_channels = 128 if self.model_name == "large" else 64
+        self.in_channels = in_channels[::-1]
+
+        def merge_conv(skip_channels, name):
+            # 3x3 conv over the concatenation [upsampled, skip feature].
+            return ConvBNLayer(
+                in_channels=self.out_channels+skip_channels,
+                out_channels=self.out_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                if_act=True,
+                act='relu',
+                name=name)
+
+        def upsample_deconv(num_in, name):
+            # Stride-2 transposed conv (kernel 4, padding 1).
+            return DeConvBNLayer(
+                in_channels=num_in,
+                out_channels=self.out_channels,
+                kernel_size=4,
+                stride=2,
+                padding=1,
+                if_act=True,
+                act='relu',
+                name=name)
+
+        # Creation order and parameter-name prefixes are unchanged.
+        self.h1_conv = merge_conv(self.in_channels[1], "unet_h_1")
+        self.h2_conv = merge_conv(self.in_channels[2], "unet_h_2")
+        self.h3_conv = merge_conv(self.in_channels[3], "unet_h_3")
+        self.g0_deconv = upsample_deconv(self.in_channels[0], "unet_g_0")
+        self.g1_deconv = upsample_deconv(self.out_channels, "unet_g_1")
+        self.g2_deconv = upsample_deconv(self.out_channels, "unet_g_2")
+        self.g3_conv = ConvBNLayer(
+            in_channels=self.out_channels,
+            out_channels=self.out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            if_act=True,
+            act='relu',
+            name="unet_g_3")
+
+    def forward(self, x):
+        """Merge the feature maps in ``x`` and return the final map."""
+        feats = x[::-1]
+
+        stages = (
+            (self.g0_deconv, self.h1_conv),
+            (self.g1_deconv, self.h2_conv),
+            (self.g2_deconv, self.h3_conv), )
+
+        merged = feats[0]
+        for skip_index, (deconv, conv) in enumerate(stages, 1):
+            upsampled = deconv(merged)
+            stacked = paddle.concat([upsampled, feats[skip_index]], axis=1)
+            merged = conv(stacked)
+
+        return self.g3_conv(merged)
\ No newline at end of file
--- a/ppocr/modeling/necks/fce_fpn.py
+++ b/ppocr/modeling/necks/fce_fpn.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is adapted from:
+https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.3/ppdet/modeling/necks/fpn.py
+"""
+
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.nn.initializer import XavierUniform
+from paddle.nn.initializer import Normal
+from paddle.regularizer import L2Decay
+
+# Names exported by `from <module> import *`: only the neck class itself.
+__all__ = ['FCEFPN']
+
+
+class ConvNormLayer(nn.Layer):
+    """Bias-free 2-D convolution followed by a normalization layer.
+
+    Args:
+        ch_in (int): channels of the input feature map.
+        ch_out (int): channels of the output feature map.
+        filter_size (int): kernel size; padding is ``(filter_size - 1) // 2``.
+        stride (int): convolution stride.
+        groups (int): convolution groups. Default: 1.
+        norm_type (str): one of ``'bn'``, ``'sync_bn'``, ``'gn'``.
+            Default: ``'bn'``.
+        norm_decay (float|None): L2 decay for the norm scale and bias;
+            ``None`` disables the regularizer. Default: 0.
+        norm_groups (int): group count, used only for ``'gn'``. Default: 32.
+        lr_scale (float): accepted but not used by this layer. Default: 1.
+        freeze_norm (bool): when True the norm parameters get learning
+            rate 0. Default: False.
+        initializer: weight initializer of the convolution.
+            Default: ``Normal(mean=0., std=0.01)``.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size,
+                 stride,
+                 groups=1,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 norm_groups=32,
+                 lr_scale=1.,
+                 freeze_norm=False,
+                 initializer=Normal(
+                     mean=0., std=0.01)):
+        super(ConvNormLayer, self).__init__()
+        assert norm_type in ['bn', 'sync_bn', 'gn']
+
+        self.conv = nn.Conv2D(
+            in_channels=ch_in,
+            out_channels=ch_out,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            weight_attr=ParamAttr(
+                initializer=initializer, learning_rate=1.),
+            bias_attr=False)
+
+        # A zero learning rate keeps the norm parameters fixed.
+        norm_lr = 0. if freeze_norm else 1.
+
+        def norm_param_attr():
+            # Fresh attribute (and regularizer) object on every call.
+            if norm_decay is not None:
+                decay = L2Decay(norm_decay)
+            else:
+                decay = None
+            return ParamAttr(learning_rate=norm_lr, regularizer=decay)
+
+        scale_attr = norm_param_attr()
+        shift_attr = norm_param_attr()
+
+        if norm_type == 'gn':
+            self.norm = nn.GroupNorm(
+                num_groups=norm_groups,
+                num_channels=ch_out,
+                weight_attr=scale_attr,
+                bias_attr=shift_attr)
+        elif norm_type in ('bn', 'sync_bn'):
+            if norm_type == 'bn':
+                batch_norm_cls = nn.BatchNorm2D
+            else:
+                batch_norm_cls = nn.SyncBatchNorm
+            self.norm = batch_norm_cls(
+                ch_out, weight_attr=scale_attr, bias_attr=shift_attr)
+
+    def forward(self, inputs):
+        """Return ``norm(conv(inputs))``."""
+        return self.norm(self.conv(inputs))
+
+
+class FCEFPN(nn.Layer):
+    """
+    Feature Pyramid Network, see https://arxiv.org/abs/1612.03144
+
+    Builds one lateral 1x1 conv and one 3x3 output conv per input level,
+    merges the laterals top-down with nearest upsampling, and optionally
+    appends extra coarser levels (max-pool or stride-2 convs).
+
+    Args:
+        in_channels (list[int]): input channels of each level, which can be
+            derived from the output shape of the backbone by from_config.
+        out_channels (int): output channels of each level. NOTE(review):
+            previously documented as list[int], but the code multiplies it
+            into ``fan`` and passes it directly to the conv layers, so a
+            single int appears to be expected -- confirm against callers.
+        spatial_scales (list[float]): the spatial scales between the input
+            feature maps and the original input image, which can be derived
+            from the output shape of the backbone by from_config.
+            default [0.25, 0.125, 0.0625, 0.03125]
+        has_extra_convs (bool): whether to add extra convs to the last level.
+            default False
+        extra_stage (int): the number of extra stages added to the last level.
+            default 1
+        use_c5 (bool): whether to use c5 as the input of the extra stage,
+            otherwise p5 is used. default True
+        norm_type (string|None): the normalization type in the FPN module. If
+            norm_type is None, no norm is used after the convs; if it is a
+            string, bn, gn and sync_bn are available. default None
+        norm_decay (float): weight decay for normalization layer weights.
+            default 0.
+        freeze_norm (bool): whether to freeze the normalization layers.
+            default False
+        relu_before_extra_convs (bool): whether to apply relu before the
+            second and later extra convs. default True
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 spatial_scales=[0.25, 0.125, 0.0625, 0.03125],
+                 has_extra_convs=False,
+                 extra_stage=1,
+                 use_c5=True,
+                 norm_type=None,
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 relu_before_extra_convs=True):
+        super(FCEFPN, self).__init__()
+        self.out_channels = out_channels
+        # Append one halved scale per extra stage. The `+` builds a new list
+        # each time, so the shared default list is never mutated.
+        for s in range(extra_stage):
+            spatial_scales = spatial_scales + [spatial_scales[-1] / 2.]
+        self.spatial_scales = spatial_scales
+        self.has_extra_convs = has_extra_convs
+        self.extra_stage = extra_stage
+        self.use_c5 = use_c5
+        self.relu_before_extra_convs = relu_before_extra_convs
+        self.norm_type = norm_type
+        self.norm_decay = norm_decay
+        self.freeze_norm = freeze_norm
+
+        # Plain lists used for positional lookup in forward(); the layers
+        # themselves are registered by name through add_sublayer below.
+        self.lateral_convs = []
+        self.fpn_convs = []
+        # Value handed to XavierUniform(fan_out=...) for every 3x3 conv.
+        fan = out_channels * 3 * 3
+
+        # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone
+        # 0 <= st_stage < ed_stage <= 3
+        st_stage = 4 - len(in_channels)
+        ed_stage = st_stage + len(in_channels) - 1
+        # Lateral 1x1 convs, one per input level.
+        for i in range(st_stage, ed_stage + 1):
+            if i == 3:
+                lateral_name = 'fpn_inner_res5_sum'
+            else:
+                lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2)
+            in_c = in_channels[i - st_stage]
+            if self.norm_type is not None:
+                lateral = self.add_sublayer(
+                    lateral_name,
+                    ConvNormLayer(
+                        ch_in=in_c,
+                        ch_out=out_channels,
+                        filter_size=1,
+                        stride=1,
+                        norm_type=self.norm_type,
+                        norm_decay=self.norm_decay,
+                        freeze_norm=self.freeze_norm,
+                        initializer=XavierUniform(fan_out=in_c)))
+            else:
+                lateral = self.add_sublayer(
+                    lateral_name,
+                    nn.Conv2D(
+                        in_channels=in_c,
+                        out_channels=out_channels,
+                        kernel_size=1,
+                        weight_attr=ParamAttr(
+                            initializer=XavierUniform(fan_out=in_c))))
+            self.lateral_convs.append(lateral)
+
+        # Output 3x3 convs, one per input level.
+        for i in range(st_stage, ed_stage + 1):
+            fpn_name = 'fpn_res{}_sum'.format(i + 2)
+            if self.norm_type is not None:
+                fpn_conv = self.add_sublayer(
+                    fpn_name,
+                    ConvNormLayer(
+                        ch_in=out_channels,
+                        ch_out=out_channels,
+                        filter_size=3,
+                        stride=1,
+                        norm_type=self.norm_type,
+                        norm_decay=self.norm_decay,
+                        freeze_norm=self.freeze_norm,
+                        initializer=XavierUniform(fan_out=fan)))
+            else:
+                fpn_conv = self.add_sublayer(
+                    fpn_name,
+                    nn.Conv2D(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        kernel_size=3,
+                        padding=1,
+                        weight_attr=ParamAttr(
+                            initializer=XavierUniform(fan_out=fan))))
+            self.fpn_convs.append(fpn_conv)
+
+        # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)
+        # These stride-2 convs are appended to fpn_convs after the per-level
+        # convs, which is why forward() indexes them from num_levels onwards.
+        if self.has_extra_convs:
+            for i in range(self.extra_stage):
+                lvl = ed_stage + 1 + i
+                # Only the first extra conv can read the raw backbone feature;
+                # later ones always consume an out_channels-wide map.
+                if i == 0 and self.use_c5:
+                    in_c = in_channels[-1]
+                else:
+                    in_c = out_channels
+                extra_fpn_name = 'fpn_{}'.format(lvl + 2)
+                if self.norm_type is not None:
+                    extra_fpn_conv = self.add_sublayer(
+                        extra_fpn_name,
+                        ConvNormLayer(
+                            ch_in=in_c,
+                            ch_out=out_channels,
+                            filter_size=3,
+                            stride=2,
+                            norm_type=self.norm_type,
+                            norm_decay=self.norm_decay,
+                            freeze_norm=self.freeze_norm,
+                            initializer=XavierUniform(fan_out=fan)))
+                else:
+                    extra_fpn_conv = self.add_sublayer(
+                        extra_fpn_name,
+                        nn.Conv2D(
+                            in_channels=in_c,
+                            out_channels=out_channels,
+                            kernel_size=3,
+                            stride=2,
+                            padding=1,
+                            weight_attr=ParamAttr(
+                                initializer=XavierUniform(fan_out=fan))))
+                self.fpn_convs.append(extra_fpn_conv)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        """Derive constructor kwargs from the backbone output description.
+
+        Reads ``channels`` and ``stride`` from every entry of ``input_shape``;
+        ``cfg`` is not used. Returns a dict with ``in_channels`` and
+        ``spatial_scales``.
+        """
+        return {
+            'in_channels': [i.channels for i in input_shape],
+            'spatial_scales': [1.0 / i.stride for i in input_shape],
+        }
+
+    def forward(self, body_feats):
+        """Return the list of pyramid outputs for the features in body_feats.
+
+        The list holds one output per input level, followed by any extra
+        levels when ``extra_stage`` is greater than 0.
+        """
+        laterals = []
+        num_levels = len(body_feats)
+
+        for i in range(num_levels):
+            laterals.append(self.lateral_convs[i](body_feats[i]))
+
+        # Top-down pathway: walk from the last level to the first, adding the
+        # 2x nearest-upsampled lateral into the level before it.
+        for i in range(1, num_levels):
+            lvl = num_levels - i
+            upsample = F.interpolate(
+                laterals[lvl],
+                scale_factor=2.,
+                mode='nearest', )
+            laterals[lvl - 1] += upsample
+
+        fpn_output = []
+        for lvl in range(num_levels):
+            fpn_output.append(self.fpn_convs[lvl](laterals[lvl]))
+
+        if self.extra_stage > 0:
+            # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN)
+            if not self.has_extra_convs:
+                assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has not extra convs'
+                # Kernel size 1 with stride 2 subsamples the last output.
+                fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2))
+            # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)
+            else:
+                if self.use_c5:
+                    extra_source = body_feats[-1]
+                else:
+                    extra_source = fpn_output[-1]
+                fpn_output.append(self.fpn_convs[num_levels](extra_source))
+
+                # Remaining extra stages chain on the previous extra output.
+                for i in range(1, self.extra_stage):
+                    if self.relu_before_extra_convs:
+                        fpn_output.append(self.fpn_convs[num_levels + i](F.relu(
+                            fpn_output[-1])))
+                    else:
+                        fpn_output.append(self.fpn_convs[num_levels + i](
+                            fpn_output[-1]))
+        return fpn_output
--- a/ppocr/modeling/necks/fpn.py
+++ b/ppocr/modeling/necks/fpn.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is adapted from:
+https://github.com/whai362/PSENet/blob/python3/models/neck/fpn.py
+"""
+
+import paddle.nn as nn
+import paddle
+import math
+import paddle.nn.functional as F
+
+
+class Conv_BN_ReLU(nn.Layer):
+    """Bias-free convolution, batch normalization and ReLU in sequence.
+
+    After the sublayers are built their parameters are replaced: the conv
+    weight is drawn from ``Normal(0, sqrt(2 / n))`` with
+    ``n = kernel_h * kernel_w * out_channels``, the batch-norm weight is set
+    to 1 and its bias to 0.
+
+    Args:
+        in_planes (int): channels of the input feature map.
+        out_planes (int): channels of the output feature map.
+        kernel_size (int): convolution kernel size. Default: 1.
+        stride (int): convolution stride. Default: 1.
+        padding (int): convolution padding. Default: 0.
+    """
+
+    def __init__(self,
+                 in_planes,
+                 out_planes,
+                 kernel_size=1,
+                 stride=1,
+                 padding=0):
+        super(Conv_BN_ReLU, self).__init__()
+        self.conv = nn.Conv2D(
+            in_planes,
+            out_planes,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            bias_attr=False)
+        self.bn = nn.BatchNorm2D(out_planes, momentum=0.1)
+        self.relu = nn.ReLU()
+
+        for layer in self.sublayers():
+            if isinstance(layer, nn.Conv2D):
+                kernel_h = layer._kernel_size[0]
+                kernel_w = layer._kernel_size[1]
+                fan_out = kernel_h * kernel_w * layer._out_channels
+                conv_init = paddle.nn.initializer.Normal(
+                    0, math.sqrt(2. / fan_out))
+                layer.weight = paddle.create_parameter(
+                    shape=layer.weight.shape,
+                    dtype='float32',
+                    default_initializer=conv_init)
+            elif isinstance(layer, nn.BatchNorm2D):
+                ones_init = paddle.nn.initializer.Constant(1.0)
+                layer.weight = paddle.create_parameter(
+                    shape=layer.weight.shape,
+                    dtype='float32',
+                    default_initializer=ones_init)
+                zeros_init = paddle.nn.initializer.Constant(0.0)
+                layer.bias = paddle.create_parameter(
+                    shape=layer.bias.shape,
+                    dtype='float32',
+                    default_initializer=zeros_init)
+
+    def forward(self, x):
+        """Return ``relu(bn(conv(x)))``."""
+        conv_out = self.conv(x)
+        normed = self.bn(conv_out)
+        return self.relu(normed)
+
+
+class FPN(nn.Layer):
+    """Top-down feature pyramid that fuses four feature maps into one.
+
+    Args:
+        in_channels: indexable with at least four entries; entry ``i`` is the
+            channel count of the i-th input feature map passed to ``forward``.
+        out_channels: channel count of every pyramid level. The fused output
+            concatenates four levels, so ``self.out_channels`` is four times
+            this value.
+    """
+
+    def __init__(self, in_channels, out_channels):
+        super(FPN, self).__init__()
+
+        def project(channels):
+            # 1x1 projection onto the shared pyramid width.
+            return Conv_BN_ReLU(
+                channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+        def smooth():
+            # 3x3 convolution applied after each top-down merge.
+            return Conv_BN_ReLU(
+                out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+        # Top layer
+        self.toplayer_ = project(in_channels[3])
+        # Lateral layers
+        self.latlayer1_ = project(in_channels[2])
+        self.latlayer2_ = project(in_channels[1])
+        self.latlayer3_ = project(in_channels[0])
+        # Smooth layers
+        self.smooth1_ = smooth()
+        self.smooth2_ = smooth()
+        self.smooth3_ = smooth()
+
+        self.out_channels = out_channels * 4
+
+        # Re-draw the parameters of every nested conv / batch-norm layer.
+        for layer in self.sublayers():
+            if isinstance(layer, nn.Conv2D):
+                fan = (layer._kernel_size[0] * layer._kernel_size[1] *
+                       layer._out_channels)
+                layer.weight = paddle.create_parameter(
+                    shape=layer.weight.shape,
+                    dtype='float32',
+                    default_initializer=paddle.nn.initializer.Normal(
+                        0, math.sqrt(2. / fan)))
+                continue
+            if not isinstance(layer, nn.BatchNorm2D):
+                continue
+            layer.weight = paddle.create_parameter(
+                shape=layer.weight.shape,
+                dtype='float32',
+                default_initializer=paddle.nn.initializer.Constant(1.0))
+            layer.bias = paddle.create_parameter(
+                shape=layer.bias.shape,
+                dtype='float32',
+                default_initializer=paddle.nn.initializer.Constant(0.0))
+
+    def _upsample(self, x, scale=1):
+        """Bilinearly upsample ``x`` by ``scale``."""
+        return F.upsample(x, scale_factor=scale, mode='bilinear')
+
+    def _upsample_add(self, x, y, scale=1):
+        """Bilinearly upsample ``x`` by ``scale`` and add ``y`` to it."""
+        enlarged = F.upsample(x, scale_factor=scale, mode='bilinear')
+        return enlarged + y
+
+    def forward(self, x):
+        """Fuse four feature maps (unpacked in the order f2, f3, f4, f5).
+
+        Returns the channel-wise concatenation of the four pyramid levels,
+        the three coarser ones upsampled by 2, 4 and 8 respectively.
+        """
+        c2, c3, c4, c5 = x
+        level5 = self.toplayer_(c5)
+
+        lateral4 = self.latlayer1_(c4)
+        level4 = self.smooth1_(self._upsample_add(level5, lateral4, 2))
+
+        lateral3 = self.latlayer2_(c3)
+        level3 = self.smooth2_(self._upsample_add(level4, lateral3, 2))
+
+        lateral2 = self.latlayer3_(c2)
+        level2 = self.smooth3_(self._upsample_add(level3, lateral2, 2))
+
+        level3 = self._upsample(level3, 2)
+        level4 = self._upsample(level4, 4)
+        level5 = self._upsample(level5, 8)
+
+        return paddle.concat([level2, level3, level4, level5], axis=1)
--- a/ppocr/modeling/necks/pg_fpn.py
+++ b/ppocr/modeling/necks/pg_fpn.py
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free Conv2D followed by ``nn.BatchNorm`` with named parameters.
+
+    Args:
+        in_channels: input channel count of the convolution.
+        out_channels: output channel count of the convolution and batch norm.
+        kernel_size: integer kernel size; padding is ``(kernel_size - 1) // 2``.
+        stride: convolution stride.
+        groups: convolution groups.
+        is_vd_mode: stored on the instance; not read by ``forward``.
+        act: activation name forwarded to ``nn.BatchNorm``.
+        name: prefix used to build explicit parameter names. When ``None``
+            (the default) no explicit names are set and Paddle assigns its
+            own; previously the default raised ``TypeError``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 groups=1,
+                 is_vd_mode=False,
+                 act=None,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+
+        def with_suffix(prefix, suffix):
+            # ``None`` means "let Paddle pick the name".
+            return None if prefix is None else prefix + suffix
+
+        self.is_vd_mode = is_vd_mode
+        # Kept for parity with the original layer; forward() does not use it.
+        self._pool2d_avg = nn.AvgPool2D(
+            kernel_size=2, stride=2, padding=0, ceil_mode=True)
+        self._conv = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=(kernel_size - 1) // 2,
+            groups=groups,
+            weight_attr=ParamAttr(name=with_suffix(name, "_weights")),
+            bias_attr=False)
+        # Batch-norm names: "conv1" -> "bn_conv1"; any other name has its
+        # first three characters replaced by "bn".
+        if name is None:
+            bn_name = None
+        elif name == "conv1":
+            bn_name = "bn_" + name
+        else:
+            bn_name = "bn" + name[3:]
+        self._batch_norm = nn.BatchNorm(
+            out_channels,
+            act=act,
+            param_attr=ParamAttr(name=with_suffix(bn_name, '_scale')),
+            bias_attr=ParamAttr(with_suffix(bn_name, '_offset')),
+            moving_mean_name=with_suffix(bn_name, '_mean'),
+            moving_variance_name=with_suffix(bn_name, '_variance'),
+            use_global_stats=False)
+
+    def forward(self, inputs):
+        """Apply the convolution and then batch norm (with its activation)."""
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        return y
+
+
+class DeConvBNLayer(nn.Layer):
+    """Bias-free Conv2DTranspose followed by ``nn.BatchNorm``.
+
+    Args:
+        in_channels: input channel count of the transposed convolution.
+        out_channels: output channel count of the deconv and batch norm.
+        kernel_size: transposed-convolution kernel size.
+        stride: transposed-convolution stride.
+        padding: transposed-convolution padding.
+        groups: transposed-convolution groups.
+        if_act: stored on the instance; not read by ``forward``.
+        act: activation name forwarded to ``nn.BatchNorm``.
+        name: prefix used to build explicit parameter names. When ``None``
+            (the default) no explicit names are set and Paddle assigns its
+            own; previously the default raised ``TypeError``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=4,
+                 stride=2,
+                 padding=1,
+                 groups=1,
+                 if_act=True,
+                 act=None,
+                 name=None):
+        super(DeConvBNLayer, self).__init__()
+
+        def named(prefix, suffix):
+            # ``None`` means "let Paddle pick the name".
+            return None if name is None else prefix + name + suffix
+
+        self.if_act = if_act
+        self.act = act
+        self.deconv = nn.Conv2DTranspose(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            weight_attr=ParamAttr(name=named("", '_weights')),
+            bias_attr=False)
+        self.bn = nn.BatchNorm(
+            num_channels=out_channels,
+            act=act,
+            param_attr=ParamAttr(name=named("bn_", "_scale")),
+            bias_attr=ParamAttr(name=named("bn_", "_offset")),
+            moving_mean_name=named("bn_", "_mean"),
+            moving_variance_name=named("bn_", "_variance"),
+            use_global_stats=False)
+
+    def forward(self, x):
+        """Apply the transposed convolution and then batch norm."""
+        x = self.deconv(x)
+        x = self.bn(x)
+        return x
+
+
+class PGFPN(nn.Layer):
+    """Neck that sums a down-sampling fusion path and an up-sampling fusion path.
+
+    ``forward`` takes seven feature maps (c0..c6). The first three feed a
+    strided-convolution "down" path, the last five (c6 down to c2) feed a
+    transposed-convolution "up" path, and the two 128-channel results are
+    added and passed through ReLU.
+
+    All channel widths are hard-coded below; the ``in_channels`` argument is
+    accepted but not read.
+    """
+
+    def __init__(self, in_channels, **kwargs):
+        super(PGFPN, self).__init__()
+        # Input / output widths of the up-fusion 1x1 projections, ordered to
+        # match [c6, c5, c4, c3, c2] in forward().
+        num_inputs = [2048, 2048, 1024, 512, 256]
+        num_outputs = [256, 256, 192, 192, 128]
+        self.out_channels = 128
+        # --- down-fusion path: per-input 3x3 projections (applied to c0, c1, c2) ---
+        # conv_bn_layer_1 expects 3 input channels; presumably c0 is the raw
+        # image -- confirm against the backbone.
+        self.conv_bn_layer_1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=32,
+            kernel_size=3,
+            stride=1,
+            act=None,
+            name='FPN_d1')
+        self.conv_bn_layer_2 = ConvBNLayer(
+            in_channels=64,
+            out_channels=64,
+            kernel_size=3,
+            stride=1,
+            act=None,
+            name='FPN_d2')
+        self.conv_bn_layer_3 = ConvBNLayer(
+            in_channels=256,
+            out_channels=128,
+            kernel_size=3,
+            stride=1,
+            act=None,
+            name='FPN_d3')
+        # --- down-fusion path: stride-2 reductions and refinement convs ---
+        self.conv_bn_layer_4 = ConvBNLayer(
+            in_channels=32,
+            out_channels=64,
+            kernel_size=3,
+            stride=2,
+            act=None,
+            name='FPN_d4')
+        self.conv_bn_layer_5 = ConvBNLayer(
+            in_channels=64,
+            out_channels=64,
+            kernel_size=3,
+            stride=1,
+            act='relu',
+            name='FPN_d5')
+        self.conv_bn_layer_6 = ConvBNLayer(
+            in_channels=64,
+            out_channels=128,
+            kernel_size=3,
+            stride=2,
+            act=None,
+            name='FPN_d6')
+        self.conv_bn_layer_7 = ConvBNLayer(
+            in_channels=128,
+            out_channels=128,
+            kernel_size=3,
+            stride=1,
+            act='relu',
+            name='FPN_d7')
+        self.conv_bn_layer_8 = ConvBNLayer(
+            in_channels=128,
+            out_channels=128,
+            kernel_size=1,
+            stride=1,
+            act=None,
+            name='FPN_d8')
+
+        # --- up-fusion path: 1x1 projections of c6, c5, c4, c3, c2 ---
+        self.conv_h0 = ConvBNLayer(
+            in_channels=num_inputs[0],
+            out_channels=num_outputs[0],
+            kernel_size=1,
+            stride=1,
+            act=None,
+            name="conv_h{}".format(0))
+        self.conv_h1 = ConvBNLayer(
+            in_channels=num_inputs[1],
+            out_channels=num_outputs[1],
+            kernel_size=1,
+            stride=1,
+            act=None,
+            name="conv_h{}".format(1))
+        self.conv_h2 = ConvBNLayer(
+            in_channels=num_inputs[2],
+            out_channels=num_outputs[2],
+            kernel_size=1,
+            stride=1,
+            act=None,
+            name="conv_h{}".format(2))
+        self.conv_h3 = ConvBNLayer(
+            in_channels=num_inputs[3],
+            out_channels=num_outputs[3],
+            kernel_size=1,
+            stride=1,
+            act=None,
+            name="conv_h{}".format(3))
+        self.conv_h4 = ConvBNLayer(
+            in_channels=num_inputs[4],
+            out_channels=num_outputs[4],
+            kernel_size=1,
+            stride=1,
+            act=None,
+            name="conv_h{}".format(4))
+
+        # --- up-fusion path: transposed convs, each mapping one level's
+        # width to the next level's width ---
+        self.dconv0 = DeConvBNLayer(
+            in_channels=num_outputs[0],
+            out_channels=num_outputs[0 + 1],
+            name="dconv_{}".format(0))
+        self.dconv1 = DeConvBNLayer(
+            in_channels=num_outputs[1],
+            out_channels=num_outputs[1 + 1],
+            act=None,
+            name="dconv_{}".format(1))
+        self.dconv2 = DeConvBNLayer(
+            in_channels=num_outputs[2],
+            out_channels=num_outputs[2 + 1],
+            act=None,
+            name="dconv_{}".format(2))
+        self.dconv3 = DeConvBNLayer(
+            in_channels=num_outputs[3],
+            out_channels=num_outputs[3 + 1],
+            act=None,
+            name="dconv_{}".format(3))
+        # --- up-fusion path: 3x3 refinement convs applied after each merge ---
+        self.conv_g1 = ConvBNLayer(
+            in_channels=num_outputs[1],
+            out_channels=num_outputs[1],
+            kernel_size=3,
+            stride=1,
+            act='relu',
+            name="conv_g{}".format(1))
+        self.conv_g2 = ConvBNLayer(
+            in_channels=num_outputs[2],
+            out_channels=num_outputs[2],
+            kernel_size=3,
+            stride=1,
+            act='relu',
+            name="conv_g{}".format(2))
+        self.conv_g3 = ConvBNLayer(
+            in_channels=num_outputs[3],
+            out_channels=num_outputs[3],
+            kernel_size=3,
+            stride=1,
+            act='relu',
+            name="conv_g{}".format(3))
+        self.conv_g4 = ConvBNLayer(
+            in_channels=num_outputs[4],
+            out_channels=num_outputs[4],
+            kernel_size=3,
+            stride=1,
+            act='relu',
+            name="conv_g{}".format(4))
+        # Final 1x1 projection of the up-fusion result.
+        self.convf = ConvBNLayer(
+            in_channels=num_outputs[4],
+            out_channels=num_outputs[4],
+            kernel_size=1,
+            stride=1,
+            act=None,
+            name="conv_f{}".format(4))
+
+    def forward(self, x):
+        """Fuse seven feature maps into a single map.
+
+        Args:
+            x: iterable of exactly seven tensors, unpacked as c0..c6.
+
+        Returns:
+            ReLU of the element-wise sum of the down-fusion and up-fusion
+            results.
+        """
+        c0, c1, c2, c3, c4, c5, c6 = x
+        # Down fusion: project c0..c2, then merge from c0 towards c2 using
+        # stride-2 convolutions.
+        f = [c0, c1, c2]
+        g = [None, None, None]
+        h = [None, None, None]
+        h[0] = self.conv_bn_layer_1(f[0])
+        h[1] = self.conv_bn_layer_2(f[1])
+        h[2] = self.conv_bn_layer_3(f[2])
+
+        g[0] = self.conv_bn_layer_4(h[0])
+        g[1] = paddle.add(g[0], h[1])
+        g[1] = F.relu(g[1])
+        g[1] = self.conv_bn_layer_5(g[1])
+        g[1] = self.conv_bn_layer_6(g[1])
+
+        g[2] = paddle.add(g[1], h[2])
+        g[2] = F.relu(g[2])
+        g[2] = self.conv_bn_layer_7(g[2])
+        f_down = self.conv_bn_layer_8(g[2])
+
+        # Up fusion: project c6..c2, then merge from c6 towards c2 using
+        # transposed convolutions. g and h are rebound to fresh lists here.
+        f1 = [c6, c5, c4, c3, c2]
+        g = [None, None, None, None, None]
+        h = [None, None, None, None, None]
+        h[0] = self.conv_h0(f1[0])
+        h[1] = self.conv_h1(f1[1])
+        h[2] = self.conv_h2(f1[2])
+        h[3] = self.conv_h3(f1[3])
+        h[4] = self.conv_h4(f1[4])
+
+        g[0] = self.dconv0(h[0])
+        g[1] = paddle.add(g[0], h[1])
+        g[1] = F.relu(g[1])
+        g[1] = self.conv_g1(g[1])
+        g[1] = self.dconv1(g[1])
+
+        g[2] = paddle.add(g[1], h[2])
+        g[2] = F.relu(g[2])
+        g[2] = self.conv_g2(g[2])
+        g[2] = self.dconv2(g[2])
+
+        g[3] = paddle.add(g[2], h[3])
+        g[3] = F.relu(g[3])
+        g[3] = self.conv_g3(g[3])
+        g[3] = self.dconv3(g[3])
+
+        g[4] = paddle.add(x=g[3], y=h[4])
+        g[4] = F.relu(g[4])
+        g[4] = self.conv_g4(g[4])
+        f_up = self.convf(g[4])
+        # Combine both paths.
+        f_common = paddle.add(f_down, f_up)
+        f_common = F.relu(f_common)
+        return f_common
--- a/ppocr/modeling/necks/pren_fpn.py
+++ b/ppocr/modeling/necks/pren_fpn.py
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Code is adapted from:
+https://github.com/RuijieJ/pren/blob/main/Nets/Aggregation.py
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+
+class PoolAggregate(nn.Layer):
+    """Run ``n_r`` parallel conv branches and globally average-pool each one.
+
+    Args:
+        n_r: number of parallel branches.
+        d_in: input channel count.
+        d_middle: hidden channel count of each branch; falls back to ``d_in``
+            when falsy.
+        d_out: output channel count of each branch; falls back to ``d_in``
+            when falsy.
+    """
+
+    def __init__(self, n_r, d_in, d_middle=None, d_out=None):
+        super(PoolAggregate, self).__init__()
+        self.d_in = d_in
+        self.d_middle = d_middle if d_middle else d_in
+        self.d_out = d_out if d_out else d_in
+        # One activation instance, shared by every branch.
+        self.act = nn.Swish()
+
+        self.n_r = n_r
+        self.aggs = self._build_aggs()
+
+    def _build_aggs(self):
+        """Create the branches and register each under its index as name."""
+        branches = []
+        for index in range(self.n_r):
+            branch = nn.Sequential(
+                ('conv1', nn.Conv2D(
+                    self.d_in, self.d_middle, 3, 2, 1, bias_attr=False)),
+                ('bn1', nn.BatchNorm(self.d_middle)),
+                ('act', self.act),
+                ('conv2', nn.Conv2D(
+                    self.d_middle, self.d_out, 3, 2, 1, bias_attr=False)),
+                ('bn2', nn.BatchNorm(self.d_out)))
+            registered = self.add_sublayer('{}'.format(index), branch)
+            branches.append(registered)
+        return branches
+
+    def forward(self, x):
+        """Return the pooled branch outputs concatenated along axis 1.
+
+        Each branch output is pooled to 1x1 and reshaped to
+        ``(batch, 1, d_out)`` before concatenation.
+        """
+        batch = x.shape[0]
+        pooled = []
+        for branch in self.aggs:
+            summary = F.adaptive_avg_pool2d(branch(x), 1)
+            pooled.append(summary.reshape((batch, 1, self.d_out)))
+        return paddle.concat(pooled, 1)
+
+
+class WeightAggregate(nn.Layer):
+    """Aggregate a feature map with ``n_r`` learned sigmoid weight maps.
+
+    Args:
+        n_r: number of weight maps (rows of the result).
+        d_in: input channel count.
+        d_middle: hidden width of the feature branch; falls back to ``d_in``
+            when falsy.
+        d_out: output width of the feature branch; falls back to ``d_in``
+            when falsy.
+    """
+
+    def __init__(self, n_r, d_in, d_middle=None, d_out=None):
+        super(WeightAggregate, self).__init__()
+        d_middle = d_middle if d_middle else d_in
+        d_out = d_out if d_out else d_in
+
+        self.n_r = n_r
+        self.d_out = d_out
+        # One activation instance, shared by both branches.
+        self.act = nn.Swish()
+
+        # Weight-map branch: d_in -> d_in -> n_r, squashed by a sigmoid.
+        self.conv_n = nn.Sequential(
+            ('conv1', nn.Conv2D(d_in, d_in, 3, 1, 1, bias_attr=False)),
+            ('bn1', nn.BatchNorm(d_in)),
+            ('act1', self.act),
+            ('conv2', nn.Conv2D(d_in, n_r, 1, bias_attr=False)),
+            ('bn2', nn.BatchNorm(n_r)),
+            ('act2', nn.Sigmoid()))
+        # Feature branch: d_in -> d_middle -> d_out.
+        self.conv_d = nn.Sequential(
+            ('conv1', nn.Conv2D(d_in, d_middle, 3, 1, 1, bias_attr=False)),
+            ('bn1', nn.BatchNorm(d_middle)),
+            ('act1', self.act),
+            ('conv2', nn.Conv2D(d_middle, d_out, 1, bias_attr=False)),
+            ('bn2', nn.BatchNorm(d_out)))
+
+    def forward(self, x):
+        """Return the batched product of the weight maps and the features.
+
+        ``x`` must be 4-D. The weight maps are flattened to
+        ``(b, n_r, h*w)`` and the features to ``(b, h*w, d_out)`` before
+        ``paddle.bmm``.
+        """
+        batch, _, height, width = x.shape
+        positions = height * width
+
+        weight_maps = self.conv_n(x)
+        features = self.conv_d(x)
+        flat_weights = weight_maps.reshape((batch, self.n_r, positions))
+        flat_features = features.reshape((batch, self.d_out, positions))
+        return paddle.bmm(flat_weights, flat_features.transpose((0, 2, 1)))
+
+
+class GCN(nn.Layer):
+    """Kernel-size-1 Conv1D over axis 1, then Linear, Dropout and Swish.
+
+    Args:
+        d_in: input feature width (last axis).
+        n_in: input channel count of the Conv1D (axis 1).
+        d_out: output feature width; falls back to ``d_in`` when falsy.
+        n_out: output channel count of the Conv1D; falls back to ``d_in``
+            when falsy.
+        dropout: dropout probability applied after the linear layer.
+    """
+
+    def __init__(self, d_in, n_in, d_out=None, n_out=None, dropout=0.1):
+        super(GCN, self).__init__()
+        d_out = d_out if d_out else d_in
+        # NOTE(review): the fallback for n_out is d_in, not n_in -- confirm
+        # this is intended before relying on the default.
+        n_out = n_out if n_out else d_in
+
+        self.conv_n = nn.Conv1D(n_in, n_out, 1)
+        self.linear = nn.Linear(d_in, d_out)
+        self.dropout = nn.Dropout(dropout)
+        self.act = nn.Swish()
+
+    def forward(self, x):
+        """Apply conv -> linear -> dropout -> activation."""
+        mixed = self.conv_n(x)
+        projected = self.linear(mixed)
+        return self.act(self.dropout(projected))
+
+
+class PRENFPN(nn.Layer):
+    """Neck combining pooling- and weight-based aggregation of three maps.
+
+    Args:
+        in_channels: sequence of exactly three channel counts, one per input
+            feature map.
+        n_r: number of aggregation rows produced per feature map.
+        d_model: output feature width; must be divisible by 3 because each
+            input contributes ``d_model // 3`` features.
+        max_len: output channel count of the two GCN Conv1D layers.
+        dropout: dropout probability used inside the GCN layers.
+    """
+
+    def __init__(self, in_channels, n_r, d_model, max_len, dropout):
+        super(PRENFPN, self).__init__()
+        assert len(in_channels) == 3, "in_channels' length must be 3."
+        c1, c2, c3 = in_channels  # the depths are from big to small
+        # build fpn
+        assert d_model % 3 == 0, "{} can't be divided by 3.".format(d_model)
+        d_branch = d_model // 3
+
+        self.agg_p1 = PoolAggregate(n_r, c1, d_out=d_branch)
+        self.agg_p2 = PoolAggregate(n_r, c2, d_out=d_branch)
+        self.agg_p3 = PoolAggregate(n_r, c3, d_out=d_branch)
+
+        self.agg_w1 = WeightAggregate(n_r, c1, 4 * c1, d_branch)
+        self.agg_w2 = WeightAggregate(n_r, c2, 4 * c2, d_branch)
+        self.agg_w3 = WeightAggregate(n_r, c3, 4 * c3, d_branch)
+
+        self.gcn_pool = GCN(d_model, n_r, d_model, max_len, dropout)
+        self.gcn_weight = GCN(d_model, n_r, d_model, max_len, dropout)
+
+        self.out_channels = d_model
+
+    def forward(self, inputs):
+        """Aggregate three feature maps and average the two GCN outputs."""
+        f3, f5, f7 = inputs
+
+        pooled_parts = [self.agg_p1(f3), self.agg_p2(f5), self.agg_p3(f7)]
+        pooled = paddle.concat(pooled_parts, 2)  # [b,nr,d]
+
+        weighted_parts = [self.agg_w1(f3), self.agg_w2(f5), self.agg_w3(f7)]
+        weighted = paddle.concat(weighted_parts, 2)  # [b,nr,d]
+
+        from_pool = self.gcn_pool(pooled)
+        from_weight = self.gcn_weight(weighted)
+        return 0.5 * (from_pool + from_weight)  # [b,max_len,d]
--- a/ppocr/modeling/necks/rnn.py
+++ b/ppocr/modeling/necks/rnn.py
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddle import nn
+
+from ppocr.modeling.heads.rec_ctc_head import get_para_bias_attr
+from ppocr.modeling.backbones.rec_svtrnet import Block, ConvBNLayer, trunc_normal_, zeros_, ones_
+
+
+class Im2Seq(nn.Layer):
+    """Turn a height-1 4-D feature map into a (batch, width, channels) sequence."""
+
+    def __init__(self, in_channels, **kwargs):
+        super(Im2Seq, self).__init__()
+        # Reshaping keeps the channel count unchanged.
+        self.out_channels = in_channels
+
+    def forward(self, x):
+        """Drop axis 2 (which must have size 1) and move channels last."""
+        _, _, height, _ = x.shape
+        assert height == 1
+        squeezed = x.squeeze(axis=2)
+        # (NTC)(batch, width, channels)
+        return squeezed.transpose([0, 2, 1])
+
+
+class EncoderWithRNN(nn.Layer):
+    """Two-layer bidirectional LSTM sequence encoder."""
+
+    def __init__(self, in_channels, hidden_size):
+        super(EncoderWithRNN, self).__init__()
+        # Forward and backward states are concatenated, doubling the width.
+        self.out_channels = hidden_size * 2
+        self.lstm = nn.LSTM(
+            in_channels,
+            hidden_size,
+            direction='bidirectional',
+            num_layers=2)
+
+    def forward(self, x):
+        """Return the LSTM output sequence; the final states are discarded."""
+        outputs, _final_states = self.lstm(x)
+        return outputs
+
+
+class EncoderWithFC(nn.Layer):
+    """Single fully-connected layer mapping ``in_channels`` to ``hidden_size``."""
+
+    def __init__(self, in_channels, hidden_size):
+        super(EncoderWithFC, self).__init__()
+        self.out_channels = hidden_size
+        # Weight / bias attributes come from the CTC head helper.
+        fc_weight_attr, fc_bias_attr = get_para_bias_attr(
+            l2_decay=0.00001, k=in_channels)
+        self.fc = nn.Linear(
+            in_channels,
+            hidden_size,
+            weight_attr=fc_weight_attr,
+            bias_attr=fc_bias_attr,
+            name='reduce_encoder_fea')
+
+    def forward(self, x):
+        """Apply the linear projection."""
+        return self.fc(x)
+
+
+class EncoderWithSVTR(nn.Layer):
+    """Conv -> global-mixing SVTR blocks -> conv encoder with a shortcut.
+
+    The input is reduced to ``hidden_dims`` channels, flattened into tokens,
+    passed through ``depth`` global ``Block`` layers, reshaped back to a map,
+    concatenated with the shortcut and projected to ``dims`` channels.
+    """
+
+    def __init__(
+            self,
+            in_channels,
+            dims=64,  # XS
+            depth=2,
+            hidden_dims=120,
+            use_guide=False,
+            num_heads=8,
+            qkv_bias=True,
+            mlp_ratio=2.0,
+            drop_rate=0.1,
+            attn_drop_rate=0.1,
+            drop_path=0.,
+            qk_scale=None):
+        super(EncoderWithSVTR, self).__init__()
+        reduced_channels = in_channels // 8
+
+        self.depth = depth
+        self.use_guide = use_guide
+        self.conv1 = ConvBNLayer(
+            in_channels, reduced_channels, padding=1, act=nn.Swish)
+        self.conv2 = ConvBNLayer(
+            reduced_channels, hidden_dims, kernel_size=1, act=nn.Swish)
+
+        blocks = []
+        for _ in range(depth):
+            blocks.append(
+                Block(
+                    dim=hidden_dims,
+                    num_heads=num_heads,
+                    mixer='Global',
+                    HW=None,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop_rate,
+                    act_layer=nn.Swish,
+                    attn_drop=attn_drop_rate,
+                    drop_path=drop_path,
+                    norm_layer='nn.LayerNorm',
+                    epsilon=1e-05,
+                    prenorm=False))
+        self.svtr_block = nn.LayerList(blocks)
+        self.norm = nn.LayerNorm(hidden_dims, epsilon=1e-6)
+        self.conv3 = ConvBNLayer(
+            hidden_dims, in_channels, kernel_size=1, act=nn.Swish)
+        # last conv-nxn, the input is concat of input tensor and conv3 output tensor
+        self.conv4 = ConvBNLayer(
+            2 * in_channels, reduced_channels, padding=1, act=nn.Swish)
+
+        self.conv1x1 = ConvBNLayer(
+            reduced_channels, dims, kernel_size=1, act=nn.Swish)
+        self.out_channels = dims
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        """Initialise Linear and LayerNorm sublayers; leave the rest alone."""
+        if isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+            return
+        if not isinstance(m, nn.Linear):
+            return
+        trunc_normal_(m.weight)
+        if m.bias is not None:
+            zeros_(m.bias)
+
+    def forward(self, x):
+        """Encode a 4-D feature map and return a map with ``dims`` channels."""
+        # for use guide
+        if self.use_guide:
+            feat = x.clone()
+            feat.stop_gradient = True
+        else:
+            feat = x
+        # for short cut
+        shortcut = feat
+        # reduce dim
+        feat = self.conv2(self.conv1(feat))
+        # SVTR global block
+        _, channels, height, width = feat.shape
+        tokens = feat.flatten(2).transpose([0, 2, 1])
+        for block in self.svtr_block:
+            tokens = block(tokens)
+        tokens = self.norm(tokens)
+        # last stage: a leading 0 in reshape keeps the batch dimension as is
+        feat = tokens.reshape([0, height, width, channels])
+        feat = feat.transpose([0, 3, 1, 2])
+        feat = self.conv3(feat)
+        fused = paddle.concat((shortcut, feat), axis=1)
+        return self.conv1x1(self.conv4(fused))
+
+
+class SequenceEncoder(nn.Layer):
+    """Reshape a feature map into a sequence and optionally encode it.
+
+    Args:
+        in_channels: channel count of the incoming feature map.
+        encoder_type: one of ``'reshape'``, ``'fc'``, ``'rnn'``, ``'svtr'``.
+        hidden_size: hidden width passed to the ``fc`` / ``rnn`` encoders.
+        **kwargs: forwarded to ``EncoderWithSVTR`` when ``encoder_type`` is
+            ``'svtr'``; ignored otherwise.
+    """
+
+    def __init__(self, in_channels, encoder_type, hidden_size=48, **kwargs):
+        super(SequenceEncoder, self).__init__()
+        self.encoder_reshape = Im2Seq(in_channels)
+        self.out_channels = self.encoder_reshape.out_channels
+        self.encoder_type = encoder_type
+        self.only_reshape = encoder_type == 'reshape'
+        if self.only_reshape:
+            # Nothing else to build: forward() only reshapes.
+            return
+
+        support_encoder_dict = {
+            'reshape': Im2Seq,
+            'fc': EncoderWithFC,
+            'rnn': EncoderWithRNN,
+            'svtr': EncoderWithSVTR
+        }
+        assert encoder_type in support_encoder_dict, '{} must in {}'.format(
+            encoder_type, support_encoder_dict.keys())
+
+        encoder_cls = support_encoder_dict[encoder_type]
+        seq_channels = self.encoder_reshape.out_channels
+        if encoder_type == "svtr":
+            self.encoder = encoder_cls(seq_channels, **kwargs)
+        else:
+            self.encoder = encoder_cls(seq_channels, hidden_size)
+        self.out_channels = self.encoder.out_channels
+
+    def forward(self, x):
+        """Encode ``x``; 'svtr' encodes before reshaping, others after."""
+        if self.encoder_type == 'svtr':
+            encoded = self.encoder(x)
+            return self.encoder_reshape(encoded)
+
+        sequence = self.encoder_reshape(x)
+        if self.only_reshape:
+            return sequence
+        return self.encoder(sequence)
--- a/ppocr/modeling/necks/sast_fpn.py
+++ b/ppocr/modeling/necks/sast_fpn.py
+# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free Conv2D followed by ``nn.BatchNorm`` with named parameters.
+
+    Args:
+        in_channels: input channel count of the convolution.
+        out_channels: output channel count of the convolution and batch norm.
+        kernel_size: integer kernel size; padding is ``(kernel_size - 1) // 2``.
+        stride: convolution stride.
+        groups: convolution groups.
+        if_act: stored on the instance; not read by ``forward``.
+        act: activation name forwarded to ``nn.BatchNorm``.
+        name: prefix used to build explicit parameter names. When ``None``
+            (the default) no explicit names are set and Paddle assigns its
+            own; previously the default raised ``TypeError``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 groups=1,
+                 if_act=True,
+                 act=None,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+
+        def named(prefix, suffix):
+            # ``None`` means "let Paddle pick the name".
+            return None if name is None else prefix + name + suffix
+
+        self.if_act = if_act
+        self.act = act
+        self.conv = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=(kernel_size - 1) // 2,
+            groups=groups,
+            weight_attr=ParamAttr(name=named("", '_weights')),
+            bias_attr=False)
+
+        self.bn = nn.BatchNorm(
+            num_channels=out_channels,
+            act=act,
+            param_attr=ParamAttr(name=named("bn_", "_scale")),
+            bias_attr=ParamAttr(name=named("bn_", "_offset")),
+            moving_mean_name=named("bn_", "_mean"),
+            moving_variance_name=named("bn_", "_variance"))
+
+    def forward(self, x):
+        """Apply the convolution and then batch norm (with its activation)."""
+        x = self.conv(x)
+        x = self.bn(x)
+        return x
+
+
+class DeConvBNLayer(nn.Layer):
+    """Bias-free Conv2DTranspose followed by ``nn.BatchNorm``.
+
+    Args:
+        in_channels: input channel count of the transposed convolution.
+        out_channels: output channel count of the deconv and batch norm.
+        kernel_size: integer kernel size; padding is ``(kernel_size - 1) // 2``.
+        stride: transposed-convolution stride.
+        groups: transposed-convolution groups.
+        if_act: stored on the instance; not read by ``forward``.
+        act: activation name forwarded to ``nn.BatchNorm``.
+        name: prefix used to build explicit parameter names. When ``None``
+            (the default) no explicit names are set and Paddle assigns its
+            own; previously the default raised ``TypeError``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 groups=1,
+                 if_act=True,
+                 act=None,
+                 name=None):
+        super(DeConvBNLayer, self).__init__()
+
+        def named(prefix, suffix):
+            # ``None`` means "let Paddle pick the name".
+            return None if name is None else prefix + name + suffix
+
+        self.if_act = if_act
+        self.act = act
+        self.deconv = nn.Conv2DTranspose(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=(kernel_size - 1) // 2,
+            groups=groups,
+            weight_attr=ParamAttr(name=named("", '_weights')),
+            bias_attr=False)
+        self.bn = nn.BatchNorm(
+            num_channels=out_channels,
+            act=act,
+            param_attr=ParamAttr(name=named("bn_", "_scale")),
+            bias_attr=ParamAttr(name=named("bn_", "_offset")),
+            moving_mean_name=named("bn_", "_mean"),
+            moving_variance_name=named("bn_", "_variance"))
+
+    def forward(self, x):
+        """Apply the transposed convolution and then batch norm."""
+        x = self.deconv(x)
+        x = self.bn(x)
+        return x
+
+
+class FPN_Up_Fusion(nn.Layer):
+    """Merge five feature maps with transposed convs, coarse level first.
+
+    ``in_channels`` is reversed before use, so its last entry describes the
+    first level that gets processed. The output has 128 channels.
+    """
+
+    def __init__(self, in_channels):
+        super(FPN_Up_Fusion, self).__init__()
+        reversed_channels = in_channels[::-1]
+        widths = [256, 256, 192, 192, 128]
+
+        # 1x1 projections, one per level.
+        self.h0_conv = ConvBNLayer(
+            reversed_channels[0], widths[0], 1, 1, act=None, name='fpn_up_h0')
+        self.h1_conv = ConvBNLayer(
+            reversed_channels[1], widths[1], 1, 1, act=None, name='fpn_up_h1')
+        self.h2_conv = ConvBNLayer(
+            reversed_channels[2], widths[2], 1, 1, act=None, name='fpn_up_h2')
+        self.h3_conv = ConvBNLayer(
+            reversed_channels[3], widths[3], 1, 1, act=None, name='fpn_up_h3')
+        self.h4_conv = ConvBNLayer(
+            reversed_channels[4], widths[4], 1, 1, act=None, name='fpn_up_h4')
+
+        # Stride-2 transposed conv from the first level to the second.
+        self.g0_conv = DeConvBNLayer(
+            widths[0], widths[1], 4, 2, act=None, name='fpn_up_g0')
+
+        # Each stage: 3x3 refinement, then stride-2 transposed conv.
+        self.g1_conv = nn.Sequential(
+            ConvBNLayer(
+                widths[1], widths[1], 3, 1, act='relu', name='fpn_up_g1_1'),
+            DeConvBNLayer(
+                widths[1], widths[2], 4, 2, act=None, name='fpn_up_g1_2'))
+        self.g2_conv = nn.Sequential(
+            ConvBNLayer(
+                widths[2], widths[2], 3, 1, act='relu', name='fpn_up_g2_1'),
+            DeConvBNLayer(
+                widths[2], widths[3], 4, 2, act=None, name='fpn_up_g2_2'))
+        self.g3_conv = nn.Sequential(
+            ConvBNLayer(
+                widths[3], widths[3], 3, 1, act='relu', name='fpn_up_g3_1'),
+            DeConvBNLayer(
+                widths[3], widths[4], 4, 2, act=None, name='fpn_up_g3_2'))
+
+        # Final fusion: 3x3 conv with ReLU followed by a 1x1 conv.
+        self.g4_conv = nn.Sequential(
+            ConvBNLayer(
+                widths[4], widths[4], 3, 1, act='relu',
+                name='fpn_up_fusion_1'),
+            ConvBNLayer(
+                widths[4], widths[4], 1, 1, act=None, name='fpn_up_fusion_2'))
+
+    def _add_relu(self, x1, x2):
+        """Return ReLU(x1 + x2)."""
+        return F.relu(paddle.add(x=x1, y=x2))
+
+    def forward(self, x):
+        """Fuse ``x[2:]`` (taken in reverse order) into one feature map."""
+        levels = x[2:][::-1]
+        h0 = self.h0_conv(levels[0])
+        h1 = self.h1_conv(levels[1])
+        h2 = self.h2_conv(levels[2])
+        h3 = self.h3_conv(levels[3])
+        h4 = self.h4_conv(levels[4])
+
+        merged = self.g0_conv(h0)
+        merged = self.g1_conv(self._add_relu(merged, h1))
+        merged = self.g2_conv(self._add_relu(merged, h2))
+        merged = self.g3_conv(self._add_relu(merged, h3))
+        merged = self.g4_conv(self._add_relu(merged, h4))
+
+        return merged
+
+
+class FPN_Down_Fusion(nn.Layer):
+    def __init__(self, in_channels):
+        super(FPN_Down_Fusion, self).__init__()
+        out_channels = [32, 64, 128]
+
+        self.h0_conv = ConvBNLayer(in_channels[0], out_channels[0], 3, 1, act=None, name='fpn_down_h0')
+        self.h1_conv = ConvBNLayer(in_channels[1], out_channels[1], 3, 1, act=None, name='fpn_down_h1')
+        self.h2_conv = ConvBNLayer(in_channels[2], out_channels[2], 3, 1, act=None, name='fpn_down_h2')
+
+        self.g0_conv = ConvBNLayer(out_channels[0], out_channels[1], 3, 2, act=None, name='fpn_down_g0')
+
+        self.g1_conv = nn.Sequential(
+            ConvBNLayer(out_channels[1], out_channels[1], 3, 1, act='relu', name='fpn_down_g1_1'),
+            ConvBNLayer(out_channels[1], out_channels[2], 3, 2, act=None, name='fpn_down_g1_2')            
+        )
+
+        self.g2_conv = nn.Sequential(
+            ConvBNLayer(out_channels[2], out_channels[2], 3, 1, act='relu', name='fpn_down_fusion_1'),
+            ConvBNLayer(out_channels[2], out_channels[2], 1, 1, act=None, name='fpn_down_fusion_2')            
+        )
+
+    def forward(self, x):
+        f = x[:3]
+        h0 = self.h0_conv(f[0])
+        h1 = self.h1_conv(f[1])
+        h2 = self.h2_conv(f[2])
+        g0 = self.g0_conv(h0)
+        g1 = paddle.add(x=g0, y=h1)
+        g1 = F.relu(g1)
+        g1 = self.g1_conv(g1)
+        g2 = paddle.add(x=g1, y=h2)
+        g2 = F.relu(g2)
+        g2 = self.g2_conv(g2)
+        return g2
+
+
+class Cross_Attention(nn.Layer):
+    def __init__(self, in_channels):
+        super(Cross_Attention, self).__init__()
+        self.theta_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_theta')
+        self.phi_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_phi')
+        self.g_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_g')
+
+        self.fh_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_weight')
+        self.fh_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_sc')
+
+        self.fv_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_weight')
+        self.fv_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_sc')
+
+        self.f_attn_conv = ConvBNLayer(in_channels * 2, in_channels, 1, 1, act='relu', name='f_attn')
+
+    def _cal_fweight(self, f, shape):
+        f_theta, f_phi, f_g = f
+        #flatten
+        f_theta = paddle.transpose(f_theta, [0, 2, 3, 1])
+        f_theta = paddle.reshape(f_theta, [shape[0] * shape[1], shape[2], 128])
+        f_phi = paddle.transpose(f_phi, [0, 2, 3, 1])
+        f_phi = paddle.reshape(f_phi, [shape[0] * shape[1], shape[2], 128])
+        f_g = paddle.transpose(f_g, [0, 2, 3, 1])
+        f_g = paddle.reshape(f_g, [shape[0] * shape[1], shape[2], 128])
+        #correlation
+        f_attn = paddle.matmul(f_theta, paddle.transpose(f_phi, [0, 2, 1]))
+        #scale
+        f_attn = f_attn / (128**0.5)
+        f_attn = F.softmax(f_attn)
+        #weighted sum
+        f_weight = paddle.matmul(f_attn, f_g)
+        f_weight = paddle.reshape(
+            f_weight, [shape[0], shape[1], shape[2], 128])
+        return f_weight
+
+    def forward(self, f_common):
+        f_shape = paddle.shape(f_common)
+        # print('f_shape: ', f_shape)
+
+        f_theta = self.theta_conv(f_common)
+        f_phi = self.phi_conv(f_common)
+        f_g = self.g_conv(f_common)
+
+        ######## horizon ########
+        fh_weight = self._cal_fweight([f_theta, f_phi, f_g], 
+                                        [f_shape[0], f_shape[2], f_shape[3]])
+        fh_weight = paddle.transpose(fh_weight, [0, 3, 1, 2])
+        fh_weight = self.fh_weight_conv(fh_weight)
+        #short cut
+        fh_sc = self.fh_sc_conv(f_common)
+        f_h = F.relu(fh_weight + fh_sc)
+
+        ######## vertical ########
+        fv_theta = paddle.transpose(f_theta, [0, 1, 3, 2])
+        fv_phi = paddle.transpose(f_phi, [0, 1, 3, 2])
+        fv_g = paddle.transpose(f_g, [0, 1, 3, 2])
+        fv_weight = self._cal_fweight([fv_theta, fv_phi, fv_g], 
+                                        [f_shape[0], f_shape[3], f_shape[2]])
+        fv_weight = paddle.transpose(fv_weight, [0, 3, 2, 1])
+        fv_weight = self.fv_weight_conv(fv_weight)
+        #short cut
+        fv_sc = self.fv_sc_conv(f_common)
+        f_v = F.relu(fv_weight + fv_sc)
+
+        ######## merge ########
+        f_attn = paddle.concat([f_h, f_v], axis=1)
+        f_attn = self.f_attn_conv(f_attn)
+        return f_attn
+
+
+class SASTFPN(nn.Layer):
+    def __init__(self, in_channels, with_cab=False, **kwargs):
+        super(SASTFPN, self).__init__()
+        self.in_channels = in_channels
+        self.with_cab = with_cab
+        self.FPN_Down_Fusion = FPN_Down_Fusion(self.in_channels)
+        self.FPN_Up_Fusion = FPN_Up_Fusion(self.in_channels)
+        self.out_channels = 128
+        self.cross_attention = Cross_Attention(self.out_channels)
+
+    def forward(self, x):
+        #down fpn
+        f_down = self.FPN_Down_Fusion(x)
+
+        #up fpn
+        f_up = self.FPN_Up_Fusion(x)
+
+        #fusion
+        f_common = paddle.add(x=f_down, y=f_up)
+        f_common = F.relu(f_common)
+
+        if self.with_cab:
+            # print('enhence f_common with CAB.')
+            f_common = self.cross_attention(f_common)
+
+        return f_common
--- a/ppocr/modeling/necks/table_fpn.py
+++ b/ppocr/modeling/necks/table_fpn.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+
+
+class TableFPN(nn.Layer):
+    def __init__(self, in_channels, out_channels, **kwargs):
+        super(TableFPN, self).__init__()
+        self.out_channels = 512
+        weight_attr = paddle.nn.initializer.KaimingUniform()
+        self.in2_conv = nn.Conv2D(
+            in_channels=in_channels[0],
+            out_channels=self.out_channels,
+            kernel_size=1,
+            weight_attr=ParamAttr(initializer=weight_attr),
+            bias_attr=False)
+        self.in3_conv = nn.Conv2D(
+            in_channels=in_channels[1],
+            out_channels=self.out_channels,
+            kernel_size=1,
+            stride = 1,
+            weight_attr=ParamAttr(initializer=weight_attr),
+            bias_attr=False)
+        self.in4_conv = nn.Conv2D(
+            in_channels=in_channels[2],
+            out_channels=self.out_channels,
+            kernel_size=1,
+            weight_attr=ParamAttr(initializer=weight_attr),
+            bias_attr=False)
+        self.in5_conv = nn.Conv2D(
+            in_channels=in_channels[3],
+            out_channels=self.out_channels,
+            kernel_size=1,
+            weight_attr=ParamAttr(initializer=weight_attr),
+            bias_attr=False)
+        self.p5_conv = nn.Conv2D(
+            in_channels=self.out_channels,
+            out_channels=self.out_channels // 4,
+            kernel_size=3,
+            padding=1,
+            weight_attr=ParamAttr(initializer=weight_attr),
+            bias_attr=False)
+        self.p4_conv = nn.Conv2D(
+            in_channels=self.out_channels,
+            out_channels=self.out_channels // 4,
+            kernel_size=3,
+            padding=1,
+            weight_attr=ParamAttr(initializer=weight_attr),
+            bias_attr=False)
+        self.p3_conv = nn.Conv2D(
+            in_channels=self.out_channels,
+            out_channels=self.out_channels // 4,
+            kernel_size=3,
+            padding=1,
+            weight_attr=ParamAttr(initializer=weight_attr),
+            bias_attr=False)
+        self.p2_conv = nn.Conv2D(
+            in_channels=self.out_channels,
+            out_channels=self.out_channels // 4,
+            kernel_size=3,
+            padding=1,
+            weight_attr=ParamAttr(initializer=weight_attr),
+            bias_attr=False)
+        self.fuse_conv = nn.Conv2D(
+            in_channels=self.out_channels * 4,
+            out_channels=512,
+            kernel_size=3,
+            padding=1,
+            weight_attr=ParamAttr(initializer=weight_attr), bias_attr=False)
+
+    def forward(self, x):
+        c2, c3, c4, c5 = x
+
+        in5 = self.in5_conv(c5)
+        in4 = self.in4_conv(c4)
+        in3 = self.in3_conv(c3)
+        in2 = self.in2_conv(c2)
+
+        out4 = in4 + F.upsample(
+            in5, size=in4.shape[2:4], mode="nearest", align_mode=1)  # 1/16
+        out3 = in3 + F.upsample(
+            out4, size=in3.shape[2:4], mode="nearest", align_mode=1)  # 1/8
+        out2 = in2 + F.upsample(
+            out3, size=in2.shape[2:4], mode="nearest", align_mode=1)  # 1/4
+
+        p4 = F.upsample(out4, size=in5.shape[2:4], mode="nearest", align_mode=1)
+        p3 = F.upsample(out3, size=in5.shape[2:4], mode="nearest", align_mode=1)
+        p2 = F.upsample(out2, size=in5.shape[2:4], mode="nearest", align_mode=1)
+        fuse = paddle.concat([in5, p4, p3, p2], axis=1)
+        fuse_conv = self.fuse_conv(fuse) * 0.005
+        return [c5 + fuse_conv]
--- a/ppocr/modeling/transforms/__init__.py
+++ b/ppocr/modeling/transforms/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['build_transform']
+
+
+def build_transform(config):
+    from .tps import TPS
+    from .stn import STN_ON
+
+    support_dict = ['TPS', 'STN_ON']
+
+    module_name = config.pop('name')
+    assert module_name in support_dict, Exception(
+        'transform only support {}'.format(support_dict))
+    module_class = eval(module_name)(**config)
+    return module_class
--- a/ppocr/modeling/transforms/__pycache__/__init__.cpython-37.pyc
+++ b/ppocr/modeling/transforms/__pycache__/__init__.cpython-37.pyc
--- a/ppocr/modeling/transforms/stn.py
+++ b/ppocr/modeling/transforms/stn.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on:
+https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/stn_head.py
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import paddle
+from paddle import nn, ParamAttr
+from paddle.nn import functional as F
+import numpy as np
+
+from .tps_spatial_transformer import TPSSpatialTransformer
+
+
+def conv3x3_block(in_channels, out_channels, stride=1):
+    n = 3 * 3 * out_channels
+    w = math.sqrt(2. / n)
+    conv_layer = nn.Conv2D(
+        in_channels,
+        out_channels,
+        kernel_size=3,
+        stride=stride,
+        padding=1,
+        weight_attr=nn.initializer.Normal(
+            mean=0.0, std=w),
+        bias_attr=nn.initializer.Constant(0))
+    block = nn.Sequential(conv_layer, nn.BatchNorm2D(out_channels), nn.ReLU())
+    return block
+
+
+class STN(nn.Layer):
+    def __init__(self, in_channels, num_ctrlpoints, activation='none'):
+        super(STN, self).__init__()
+        self.in_channels = in_channels
+        self.num_ctrlpoints = num_ctrlpoints
+        self.activation = activation
+        self.stn_convnet = nn.Sequential(
+            conv3x3_block(in_channels, 32),  #32x64
+            nn.MaxPool2D(
+                kernel_size=2, stride=2),
+            conv3x3_block(32, 64),  #16x32
+            nn.MaxPool2D(
+                kernel_size=2, stride=2),
+            conv3x3_block(64, 128),  # 8*16
+            nn.MaxPool2D(
+                kernel_size=2, stride=2),
+            conv3x3_block(128, 256),  # 4*8
+            nn.MaxPool2D(
+                kernel_size=2, stride=2),
+            conv3x3_block(256, 256),  # 2*4,
+            nn.MaxPool2D(
+                kernel_size=2, stride=2),
+            conv3x3_block(256, 256))  # 1*2
+        self.stn_fc1 = nn.Sequential(
+            nn.Linear(
+                2 * 256,
+                512,
+                weight_attr=nn.initializer.Normal(0, 0.001),
+                bias_attr=nn.initializer.Constant(0)),
+            nn.BatchNorm1D(512),
+            nn.ReLU())
+        fc2_bias = self.init_stn()
+        self.stn_fc2 = nn.Linear(
+            512,
+            num_ctrlpoints * 2,
+            weight_attr=nn.initializer.Constant(0.0),
+            bias_attr=nn.initializer.Assign(fc2_bias))
+
+    def init_stn(self):
+        margin = 0.01
+        sampling_num_per_side = int(self.num_ctrlpoints / 2)
+        ctrl_pts_x = np.linspace(margin, 1. - margin, sampling_num_per_side)
+        ctrl_pts_y_top = np.ones(sampling_num_per_side) * margin
+        ctrl_pts_y_bottom = np.ones(sampling_num_per_side) * (1 - margin)
+        ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
+        ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
+        ctrl_points = np.concatenate(
+            [ctrl_pts_top, ctrl_pts_bottom], axis=0).astype(np.float32)
+        if self.activation == 'none':
+            pass
+        elif self.activation == 'sigmoid':
+            ctrl_points = -np.log(1. / ctrl_points - 1.)
+        ctrl_points = paddle.to_tensor(ctrl_points)
+        fc2_bias = paddle.reshape(
+            ctrl_points, shape=[ctrl_points.shape[0] * ctrl_points.shape[1]])
+        return fc2_bias
+
+    def forward(self, x):
+        x = self.stn_convnet(x)
+        batch_size, _, h, w = x.shape
+        x = paddle.reshape(x, shape=(batch_size, -1))
+        img_feat = self.stn_fc1(x)
+        x = self.stn_fc2(0.1 * img_feat)
+        if self.activation == 'sigmoid':
+            x = F.sigmoid(x)
+        x = paddle.reshape(x, shape=[-1, self.num_ctrlpoints, 2])
+        return img_feat, x
+
+
+class STN_ON(nn.Layer):
+    def __init__(self, in_channels, tps_inputsize, tps_outputsize,
+                 num_control_points, tps_margins, stn_activation):
+        super(STN_ON, self).__init__()
+        self.tps = TPSSpatialTransformer(
+            output_image_size=tuple(tps_outputsize),
+            num_control_points=num_control_points,
+            margins=tuple(tps_margins))
+        self.stn_head = STN(in_channels=in_channels,
+                            num_ctrlpoints=num_control_points,
+                            activation=stn_activation)
+        self.tps_inputsize = tps_inputsize
+        self.out_channels = in_channels
+
+    def forward(self, image):
+        stn_input = paddle.nn.functional.interpolate(
+            image, self.tps_inputsize, mode="bilinear", align_corners=True)
+        stn_img_feat, ctrl_points = self.stn_head(stn_input)
+        x, _ = self.tps(image, ctrl_points)
+        return x
--- a/ppocr/modeling/transforms/tps.py
+++ b/ppocr/modeling/transforms/tps.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on:
+https://github.com/clovaai/deep-text-recognition-benchmark/blob/master/modules/transformation.py
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import paddle
+from paddle import nn, ParamAttr
+from paddle.nn import functional as F
+import numpy as np
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 groups=1,
+                 act=None,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+        self.conv = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=(kernel_size - 1) // 2,
+            groups=groups,
+            weight_attr=ParamAttr(name=name + "_weights"),
+            bias_attr=False)
+        bn_name = "bn_" + name
+        self.bn = nn.BatchNorm(
+            out_channels,
+            act=act,
+            param_attr=ParamAttr(name=bn_name + '_scale'),
+            bias_attr=ParamAttr(bn_name + '_offset'),
+            moving_mean_name=bn_name + '_mean',
+            moving_variance_name=bn_name + '_variance')
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        return x
+
+
+class LocalizationNetwork(nn.Layer):
+    """Predict fiducial point coordinates from an input tensor.
+
+    A stack of conv/pool blocks is followed by two linear layers; ``forward``
+    reshapes the result to ``[-1, num_fiducial, 2]``.
+    """
+
+    def __init__(self, in_channels, num_fiducial, loc_lr, model_name):
+        """Build the conv stack and the two fully connected layers.
+
+        Args:
+            in_channels: channel count of the input to the first conv.
+            num_fiducial: number of fiducial points (stored as ``self.F``).
+            loc_lr: learning-rate value passed to the ``ParamAttr`` of the
+                fc1 weight and of the fc2 weight and bias.
+            model_name: ``"large"`` selects the wider configuration; any
+                other value selects the smaller one.
+        """
+        super(LocalizationNetwork, self).__init__()
+        self.F = num_fiducial
+        # Local shorthand; within __init__ it shadows the module-level ``F``
+        # (paddle.nn.functional), which this method does not use.
+        F = num_fiducial
+        if model_name == "large":
+            num_filters_list = [64, 128, 256, 512]
+            fc_dim = 256
+        else:
+            num_filters_list = [16, 32, 64, 128]
+            fc_dim = 64
+
+        # Alternating conv and pool layers, applied in order by forward().
+        self.block_list = []
+        for fno in range(0, len(num_filters_list)):
+            num_filters = num_filters_list[fno]
+            name = "loc_conv%d" % fno
+            conv = self.add_sublayer(
+                name,
+                ConvBNLayer(
+                    in_channels=in_channels,
+                    out_channels=num_filters,
+                    kernel_size=3,
+                    act='relu',
+                    name=name))
+            self.block_list.append(conv)
+            # The last conv is followed by global average pooling to 1x1;
+            # every earlier conv is followed by a 2x2 max-pool.
+            if fno == len(num_filters_list) - 1:
+                pool = nn.AdaptiveAvgPool2D(1)
+            else:
+                pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
+            in_channels = num_filters
+            # Pools are only kept in the plain list (not via add_sublayer).
+            self.block_list.append(pool)
+        name = "loc_fc1"
+        # Bound of the uniform init: 1 / sqrt(width of the last conv).
+        stdv = 1.0 / math.sqrt(num_filters_list[-1] * 1.0)
+        self.fc1 = nn.Linear(
+            in_channels,
+            fc_dim,
+            weight_attr=ParamAttr(
+                learning_rate=loc_lr,
+                name=name + "_w",
+                initializer=nn.initializer.Uniform(-stdv, stdv)),
+            bias_attr=ParamAttr(name=name + '.b_0'),
+            name=name)
+
+        # Init fc2 in LocalizationNetwork
+        # Zero weights plus a bias equal to the initial fiducials: right
+        # after construction fc2 outputs those fiducials for any input.
+        initial_bias = self.get_initial_fiducials()
+        initial_bias = initial_bias.reshape(-1)
+        name = "loc_fc2"
+        param_attr = ParamAttr(
+            learning_rate=loc_lr,
+            initializer=nn.initializer.Assign(np.zeros([fc_dim, F * 2])),
+            name=name + "_w")
+        bias_attr = ParamAttr(
+            learning_rate=loc_lr,
+            initializer=nn.initializer.Assign(initial_bias),
+            name=name + "_b")
+        self.fc2 = nn.Linear(
+            fc_dim,
+            F * 2,
+            weight_attr=param_attr,
+            bias_attr=bias_attr,
+            name=name)
+        self.out_channels = F * 2
+
+    def forward(self, x):
+        """
+        Estimate the parameters of the geometric transformation.
+
+        Args:
+            x: input tensor; after the conv/pool stack its axes 2 and 3 are
+                squeezed away before the linear layers.
+        Return:
+            batch_C_prime: predicted fiducial points, reshaped to
+                ``[-1, self.F, 2]``
+        """
+        # NOTE(review): B and i are assigned but not used below.
+        B = x.shape[0]
+        i = 0
+        for block in self.block_list:
+            x = block(x)
+        # Drop the two spatial axes left at size 1 by the adaptive pooling.
+        x = x.squeeze(axis=2).squeeze(axis=2)
+        x = self.fc1(x)
+
+        x = F.relu(x)
+        x = self.fc2(x)
+        x = x.reshape(shape=[-1, self.F, 2])
+        return x
+
+    def get_initial_fiducials(self):
+        """Return the initial fiducial points; see RARE paper Fig. 6 (a).
+
+        The result is a numpy array with two columns (x, y): the first
+        ``int(F / 2)`` rows have y running from 0.0 to -1.0, the remaining
+        rows have y running from 1.0 to 0.0; x runs from -1.0 to 1.0 in both
+        halves.
+        """
+        F = self.F
+        ctrl_pts_x = np.linspace(-1.0, 1.0, int(F / 2))
+        ctrl_pts_y_top = np.linspace(0.0, -1.0, num=int(F / 2))
+        ctrl_pts_y_bottom = np.linspace(1.0, 0.0, num=int(F / 2))
+        ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
+        ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
+        initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
+        return initial_bias
+
+
+class GridGenerator(nn.Layer):
+    """Turn predicted fiducial points into a sampling grid.
+
+    The fixed matrices (C, P, inv_delta_C, P_hat) are rebuilt in float64 on
+    every ``forward`` call and cast to float32 before the matrix products.
+    """
+
+    def __init__(self, in_channels, num_fiducial):
+        """
+        Args:
+            in_channels: input width of the internal linear layer; forward
+                feeds it the flattened fiducial points.
+            num_fiducial: number of fiducial points (stored as ``self.F``).
+        """
+        super(GridGenerator, self).__init__()
+        # Added inside the log in build_P_hat_paddle to avoid log(0).
+        self.eps = 1e-6
+        self.F = num_fiducial
+
+        name = "ex_fc"
+        # Weight and bias are initialised to zero with learning_rate=0.0.
+        initializer = nn.initializer.Constant(value=0.0)
+        param_attr = ParamAttr(
+            learning_rate=0.0, initializer=initializer, name=name + "_w")
+        bias_attr = ParamAttr(
+            learning_rate=0.0, initializer=initializer, name=name + "_b")
+        self.fc = nn.Linear(
+            in_channels,
+            6,
+            weight_attr=param_attr,
+            bias_attr=bias_attr,
+            name=name)
+
+    def forward(self, batch_C_prime, I_r_size):
+        """
+        Generate the grid for the grid_sampler.
+        Args:
+            batch_C_prime: the matrix of the geometric transformation
+            I_r_size: the shape of the input image
+        Return:
+            batch_P_prime: the grid for the grid_sampler
+        """
+        C = self.build_C_paddle()
+        P = self.build_P_paddle(I_r_size)
+
+        inv_delta_C_tensor = self.build_inv_delta_C_paddle(C).astype('float32')
+        # NOTE(review): P is already the tensor returned by build_P_paddle
+        # when it is passed to paddle.to_tensor here.
+        P_hat_tensor = self.build_P_hat_paddle(
+            C, paddle.to_tensor(P)).astype('float32')
+
+        # The precomputed matrices are constants: no gradient through them.
+        inv_delta_C_tensor.stop_gradient = True
+        P_hat_tensor.stop_gradient = True
+
+        # Three extra rows per sample, appended below so the row count
+        # matches the (F+3) x (F+3) inverse matrix.
+        batch_C_ex_part_tensor = self.get_expand_tensor(batch_C_prime)
+
+        batch_C_ex_part_tensor.stop_gradient = True
+
+        batch_C_prime_with_zeros = paddle.concat(
+            [batch_C_prime, batch_C_ex_part_tensor], axis=1)
+        batch_T = paddle.matmul(inv_delta_C_tensor, batch_C_prime_with_zeros)
+        batch_P_prime = paddle.matmul(P_hat_tensor, batch_T)
+        return batch_P_prime
+
+    def build_C_paddle(self):
+        """ Return coordinates of fiducial points in I_r; C """
+        F = self.F
+        # x evenly spaced in [-1, 1]; top row at y = -1, bottom row at y = +1.
+        ctrl_pts_x = paddle.linspace(-1.0, 1.0, int(F / 2), dtype='float64')
+        ctrl_pts_y_top = -1 * paddle.ones([int(F / 2)], dtype='float64')
+        ctrl_pts_y_bottom = paddle.ones([int(F / 2)], dtype='float64')
+        ctrl_pts_top = paddle.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
+        ctrl_pts_bottom = paddle.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
+        C = paddle.concat([ctrl_pts_top, ctrl_pts_bottom], axis=0)
+        return C  # F x 2
+
+    def build_P_paddle(self, I_r_size):
+        """Return the (x, y) coordinates of the output grid as an n x 2 tensor.
+
+        Args:
+            I_r_size: ``(height, width)`` of the grid.
+        """
+        I_r_height, I_r_width = I_r_size
+        # (arange(-W, W, 2) + 1) / W along x
+        I_r_grid_x = (paddle.arange(
+            -I_r_width, I_r_width, 2, dtype='float64') + 1.0
+                      ) / paddle.to_tensor(np.array([I_r_width]))
+
+        # (arange(-H, H, 2) + 1) / H along y
+        I_r_grid_y = (paddle.arange(
+            -I_r_height, I_r_height, 2, dtype='float64') + 1.0
+                      ) / paddle.to_tensor(np.array([I_r_height]))
+
+        # P: self.I_r_width x self.I_r_height x 2
+        P = paddle.stack(paddle.meshgrid(I_r_grid_x, I_r_grid_y), axis=2)
+        P = paddle.transpose(P, perm=[1, 0, 2])
+        # n (= self.I_r_width x self.I_r_height) x 2
+        return P.reshape([-1, 2])
+
+    def build_inv_delta_C_paddle(self, C):
+        """ Return inv_delta_C which is needed to calculate T """
+        F = self.F
+        hat_eye = paddle.eye(F, dtype='float64')  # F x F
+        # Pairwise distances between fiducials; adding the identity puts 1 on
+        # the diagonal so that r^2 * log(r) evaluates to 0 there.
+        hat_C = paddle.norm(
+            C.reshape([1, F, 2]) - C.reshape([F, 1, 2]), axis=2) + hat_eye
+        hat_C = (hat_C**2) * paddle.log(hat_C)
+        delta_C = paddle.concat(  # F+3 x F+3
+            [
+                paddle.concat(
+                    [paddle.ones(
+                        (F, 1), dtype='float64'), C, hat_C], axis=1),  # F x F+3
+                paddle.concat(
+                    [
+                        paddle.zeros(
+                            (2, 3), dtype='float64'), paddle.transpose(
+                                C, perm=[1, 0])
+                    ],
+                    axis=1),  # 2 x F+3
+                paddle.concat(
+                    [
+                        paddle.zeros(
+                            (1, 3), dtype='float64'), paddle.ones(
+                                (1, F), dtype='float64')
+                    ],
+                    axis=1)  # 1 x F+3
+            ],
+            axis=0)
+        inv_delta_C = paddle.inverse(delta_C)
+        return inv_delta_C  # F+3 x F+3
+
+    def build_P_hat_paddle(self, C, P):
+        """Return P_hat = [1, P, rbf(P, C)] with one row per grid point.
+
+        Args:
+            C: F x 2 fiducial coordinates.
+            P: n x 2 grid coordinates.
+        """
+        F = self.F
+        eps = self.eps
+        n = P.shape[0]  # n (= self.I_r_width x self.I_r_height)
+        # P_tile: n x 2 -> n x 1 x 2 -> n x F x 2
+        P_tile = paddle.tile(paddle.unsqueeze(P, axis=1), (1, F, 1))
+        C_tile = paddle.unsqueeze(C, axis=0)  # 1 x F x 2
+        P_diff = P_tile - C_tile  # n x F x 2
+        # rbf_norm: n x F
+        rbf_norm = paddle.norm(P_diff, p=2, axis=2, keepdim=False)
+
+        # rbf: n x F
+        # r^2 * log(r + eps)
+        rbf = paddle.multiply(
+            paddle.square(rbf_norm), paddle.log(rbf_norm + eps))
+        P_hat = paddle.concat(
+            [paddle.ones(
+                (n, 1), dtype='float64'), P, rbf], axis=1)
+        return P_hat  # n x F+3
+
+    def get_expand_tensor(self, batch_C_prime):
+        """Return the ``[-1, 3, 2]`` tensor appended to the fiducials.
+
+        The flattened fiducials are passed through ``self.fc``, whose weight
+        and bias are zero-initialised with a learning rate of 0.0.
+        """
+        B, H, C = batch_C_prime.shape
+        batch_C_prime = batch_C_prime.reshape([B, H * C])
+        batch_C_ex_part_tensor = self.fc(batch_C_prime)
+        batch_C_ex_part_tensor = batch_C_ex_part_tensor.reshape([-1, 3, 2])
+        return batch_C_ex_part_tensor
+
+
+class TPS(nn.Layer):
+    def __init__(self, in_channels, num_fiducial, loc_lr, model_name):
+        super(TPS, self).__init__()
+        self.loc_net = LocalizationNetwork(in_channels, num_fiducial, loc_lr,
+                                           model_name)
+        self.grid_generator = GridGenerator(self.loc_net.out_channels,
+                                            num_fiducial)
+        self.out_channels = in_channels
+
+    def forward(self, image):
+        image.stop_gradient = False
+        batch_C_prime = self.loc_net(image)
+        batch_P_prime = self.grid_generator(batch_C_prime, image.shape[2:])
+        batch_P_prime = batch_P_prime.reshape(
+            [-1, image.shape[2], image.shape[3], 2])
+        batch_I_r = F.grid_sample(x=image, grid=batch_P_prime)
+        return batch_I_r
--- a/ppocr/modeling/transforms/tps_spatial_transformer.py
+++ b/ppocr/modeling/transforms/tps_spatial_transformer.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on:
+https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/tps_spatial_transformer.py
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import paddle
+from paddle import nn, ParamAttr
+from paddle.nn import functional as F
+import numpy as np
+import itertools
+
+
+def grid_sample(input, grid, canvas=None):
+    input.stop_gradient = False
+    output = F.grid_sample(input, grid)
+    if canvas is None:
+        return output
+    else:
+        input_mask = paddle.ones(shape=input.shape)
+        output_mask = F.grid_sample(input_mask, grid)
+        padded_output = output * output_mask + canvas * (1 - output_mask)
+        return padded_output
+
+
+# phi(x1, x2) = r^2 * log(r), where r = ||x1 - x2||_2
+def compute_partial_repr(input_points, control_points):
+    N = input_points.shape[0]
+    M = control_points.shape[0]
+    pairwise_diff = paddle.reshape(
+        input_points, shape=[N, 1, 2]) - paddle.reshape(
+            control_points, shape=[1, M, 2])
+    # original implementation, very slow
+    # pairwise_dist = torch.sum(pairwise_diff ** 2, dim = 2) # square of distance
+    pairwise_diff_square = pairwise_diff * pairwise_diff
+    pairwise_dist = pairwise_diff_square[:, :, 0] + pairwise_diff_square[:, :,
+                                                                         1]
+    repr_matrix = 0.5 * pairwise_dist * paddle.log(pairwise_dist)
+    # fix numerical error for 0 * log(0), substitute all nan with 0
+    mask = np.array(repr_matrix != repr_matrix)
+    repr_matrix[mask] = 0
+    return repr_matrix
+
+
+# output_ctrl_pts are specified, according to our task.
+def build_output_control_points(num_control_points, margins):
+    margin_x, margin_y = margins
+    num_ctrl_pts_per_side = num_control_points // 2
+    ctrl_pts_x = np.linspace(margin_x, 1.0 - margin_x, num_ctrl_pts_per_side)
+    ctrl_pts_y_top = np.ones(num_ctrl_pts_per_side) * margin_y
+    ctrl_pts_y_bottom = np.ones(num_ctrl_pts_per_side) * (1.0 - margin_y)
+    ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
+    ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
+    output_ctrl_pts_arr = np.concatenate(
+        [ctrl_pts_top, ctrl_pts_bottom], axis=0)
+    output_ctrl_pts = paddle.to_tensor(output_ctrl_pts_arr)
+    return output_ctrl_pts
+
+
+class TPSSpatialTransformer(nn.Layer):
+    """Warp an input with a TPS mapping driven by source control points.
+
+    The constructor precomputes everything that depends only on the fixed
+    target control points and the output size: the inverse of the padded
+    (N + 3) x (N + 3) kernel matrix and the representation of every output
+    pixel coordinate. ``forward`` then turns a batch of source control
+    points into a sampling grid with two matrix products.
+    """
+
+    def __init__(self,
+                 output_image_size=None,
+                 num_control_points=None,
+                 margins=None):
+        """Precompute the kernel inverse and the output-pixel representation.
+
+        Args:
+            output_image_size: pair unpacked as (target_height, target_width).
+            num_control_points: total number of control points N, forwarded
+                to ``build_output_control_points``. Presumably must be even,
+                since that helper creates N // 2 points per edge — TODO
+                confirm.
+            margins: pair (margin_x, margin_y), forwarded to
+                ``build_output_control_points``.
+
+        NOTE(review): all three arguments default to None, but they are
+        unpacked / used in arithmetic below, so they are effectively
+        required.
+        """
+        super(TPSSpatialTransformer, self).__init__()
+        self.output_image_size = output_image_size
+        self.num_control_points = num_control_points
+        self.margins = margins
+
+        self.target_height, self.target_width = output_image_size
+        target_control_points = build_output_control_points(num_control_points,
+                                                            margins)
+        N = num_control_points
+
+        # create padded kernel matrix: the top-left N x N block holds the
+        # pairwise partial representation of the target control points, and
+        # the last three rows/columns hold a column of ones and the (x, y)
+        # coordinates (and their transposes); the bottom-right 3 x 3 block
+        # stays zero.
+        forward_kernel = paddle.zeros(shape=[N + 3, N + 3])
+        target_control_partial_repr = compute_partial_repr(
+            target_control_points, target_control_points)
+        # cast to the kernel dtype before the slice assignment
+        target_control_partial_repr = paddle.cast(target_control_partial_repr,
+                                                  forward_kernel.dtype)
+        forward_kernel[:N, :N] = target_control_partial_repr
+        forward_kernel[:N, -3] = 1
+        forward_kernel[-3, :N] = 1
+        # from here on target_control_points uses the kernel dtype as well
+        target_control_points = paddle.cast(target_control_points,
+                                            forward_kernel.dtype)
+        forward_kernel[:N, -2:] = target_control_points
+        forward_kernel[-2:, :N] = paddle.transpose(
+            target_control_points, perm=[1, 0])
+        # compute inverse matrix
+        inverse_kernel = paddle.inverse(forward_kernel)
+
+        # create target coordinate matrix: one row per output pixel,
+        # enumerated as (y, x) index pairs in row-major order
+        HW = self.target_height * self.target_width
+        target_coordinate = list(
+            itertools.product(
+                range(self.target_height), range(self.target_width)))
+        target_coordinate = paddle.to_tensor(target_coordinate)  # HW x 2
+        Y, X = paddle.split(
+            target_coordinate, target_coordinate.shape[1], axis=1)
+        # scale pixel indices so the first/last row and column map to 0 and 1
+        Y = Y / (self.target_height - 1)
+        X = X / (self.target_width - 1)
+        target_coordinate = paddle.concat(
+            [X, Y], axis=1)  # convert from (y, x) to (x, y)
+        target_coordinate_partial_repr = compute_partial_repr(
+            target_coordinate, target_control_points)
+        # columns mirror the kernel layout: partial representation, a column
+        # of ones, then the (x, y) coordinates
+        target_coordinate_repr = paddle.concat(
+            [
+                target_coordinate_partial_repr, paddle.ones(shape=[HW, 1]),
+                target_coordinate
+            ],
+            axis=1)
+
+        # register precomputed matrices
+        self.inverse_kernel = inverse_kernel
+        # zero rows appended to the source control points in forward() so
+        # they line up with the three extra rows of the padded kernel
+        self.padding_matrix = paddle.zeros(shape=[3, 2])
+        self.target_coordinate_repr = target_coordinate_repr
+        self.target_control_points = target_control_points
+
+    def forward(self, input, source_control_points):
+        """Sample ``input`` on the grid induced by ``source_control_points``.
+
+        Args:
+            input: passed unchanged to ``grid_sample``; presumably a batch of
+                images or feature maps — TODO confirm expected layout.
+            source_control_points: tensor of shape
+                (batch, num_control_points, 2), enforced by the asserts
+                below. Presumably (x, y) pairs in the same [0, 1]
+                normalization as the target points — TODO confirm.
+
+        Returns:
+            Tuple ``(output_maps, source_coordinate)``: the result of
+            ``grid_sample`` and the source coordinates computed for every
+            output pixel, before reshaping and clipping.
+        """
+        assert source_control_points.ndimension() == 3
+        assert source_control_points.shape[1] == self.num_control_points
+        assert source_control_points.shape[2] == 2
+        batch_size = paddle.shape(source_control_points)[0]
+
+        # append three zero rows per sample to match the padded kernel size
+        padding_matrix = paddle.expand(
+            self.padding_matrix, shape=[batch_size, 3, 2])
+        Y = paddle.concat([source_control_points, padding_matrix], 1)
+        # solve for the mapping parameters with the precomputed inverse,
+        # then evaluate the mapping at every output pixel
+        mapping_matrix = paddle.matmul(self.inverse_kernel, Y)
+        source_coordinate = paddle.matmul(self.target_coordinate_repr,
+                                          mapping_matrix)
+
+        grid = paddle.reshape(
+            source_coordinate,
+            shape=[-1, self.target_height, self.target_width, 2])
+        grid = paddle.clip(grid, 0,
+                           1)  # the source_control_points may be out of [0, 1].
+        # the input to grid_sample is normalized [-1, 1], but what we get is [0, 1]
+        grid = 2.0 * grid - 1.0
+        output_maps = grid_sample(input, grid, canvas=None)
+        return output_maps, source_coordinate
--- a/ppocr/optimizer/__init__.py
+++ b/ppocr/optimizer/__init__.py
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+import copy
+import paddle
+
+__all__ = ['build_optimizer']
+
+
+def build_lr_scheduler(lr_config, epochs, step_each_epoch):
+    from . import learning_rate
+    lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch})
+    lr_name = lr_config.pop('name', 'Const')
+    lr = getattr(learning_rate, lr_name)(**lr_config)()
+    return lr
+
+
+def build_optimizer(config, epochs, step_each_epoch, model):
+    from . import regularizer, optimizer
+    config = copy.deepcopy(config)
+    # step1 build lr
+    lr = build_lr_scheduler(config.pop('lr'), epochs, step_each_epoch)
+
+    # step2 build regularization
+    if 'regularizer' in config and config['regularizer'] is not None:
+        reg_config = config.pop('regularizer')
+        reg_name = reg_config.pop('name')
+        if not hasattr(regularizer, reg_name):
+            reg_name += 'Decay'
+        reg = getattr(regularizer, reg_name)(**reg_config)()
+    elif 'weight_decay' in config:
+        reg = config.pop('weight_decay')
+    else:
+        reg = None
+
+    # step3 build optimizer
+    optim_name = config.pop('name')
+    if 'clip_norm' in config:
+        clip_norm = config.pop('clip_norm')
+        grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm)
+    else:
+        grad_clip = None
+    optim = getattr(optimizer, optim_name)(learning_rate=lr,
+                                           weight_decay=reg,
+                                           grad_clip=grad_clip,
+                                           **config)
+    return optim(model), lr
--- a/ppocr/optimizer/__pycache__/__init__.cpython-37.pyc
+++ b/ppocr/optimizer/__pycache__/__init__.cpython-37.pyc
--- a/ppocr/optimizer/__pycache__/learning_rate.cpython-37.pyc
+++ b/ppocr/optimizer/__pycache__/learning_rate.cpython-37.pyc
--- a/ppocr/optimizer/__pycache__/lr_scheduler.cpython-37.pyc
+++ b/ppocr/optimizer/__pycache__/lr_scheduler.cpython-37.pyc
--- a/ppocr/optimizer/__pycache__/optimizer.cpython-37.pyc
+++ b/ppocr/optimizer/__pycache__/optimizer.cpython-37.pyc
--- a/ppocr/optimizer/__pycache__/regularizer.cpython-37.pyc
+++ b/ppocr/optimizer/__pycache__/regularizer.cpython-37.pyc