Commit aad3093a authored by WenmuZhou
Browse files

dygraph first commit

parent 10f7e519
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ["ResNet"]
class ResNet(object):
    """ResNet backbone for the detection module (static-graph / fluid API).

    Uses a "deep stem" of three stacked 3x3 convs instead of a single 7x7
    conv, and for the 34/50-layer configs appends an extra 5th residual
    stage. Every intermediate feature map is returned in a dict so the
    detection neck can pick the scales it needs.
    """

    def __init__(self, params):
        """
        Args:
            params(dict): the super parameters for network build;
                must contain 'layers' (one of 18/34/50/101/152).
        """
        self.layers = params['layers']
        supported_layers = [18, 34, 50, 101, 152]
        assert self.layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, self.layers)
        # True: three-conv 3x3 stem; False: classic single 7x7 stem.
        self.is_3x3 = True

    def __call__(self, input):
        """Build the network.

        Args:
            input: NCHW image variable.

        Returns:
            dict: 'block_0' is the raw input, 'block_1' the stem output,
            and 'block_k' (k >= 2) the output of residual stage k-2.
        """
        layers = self.layers
        is_3x3 = self.is_3x3
        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            # This detection variant appends an extra 3-block stage.
            depth = [3, 4, 6, 3, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        num_filters = [64, 128, 256, 512, 512]
        blocks = {}
        blocks['block_0'] = input
        if not is_3x3:
            conv = self.conv_bn_layer(
                input=input,
                num_filters=64,
                filter_size=7,
                stride=2,
                act='relu')
        else:
            conv = self.conv_bn_layer(
                input=input,
                num_filters=32,
                filter_size=3,
                stride=2,
                act='relu',
                name='conv1_1')
            conv = self.conv_bn_layer(
                input=conv,
                num_filters=32,
                filter_size=3,
                stride=1,
                act='relu',
                name='conv1_2')
            conv = self.conv_bn_layer(
                input=conv,
                num_filters=64,
                filter_size=3,
                stride=1,
                act='relu',
                name='conv1_3')
        blocks['block_1'] = conv
        conv = fluid.layers.pool2d(
            input=conv,
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max')
        if layers >= 50:
            for block in range(len(depth)):
                for i in range(depth[block]):
                    # Deep nets name the long stage-3 blocks "res4b<i>"
                    # to match released pretrained weight names.
                    if layers in [101, 152, 200] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)
                    conv = self.bottleneck_block(
                        input=conv,
                        num_filters=num_filters[block],
                        stride=2 if i == 0 and block != 0 else 1,
                        if_first=block == i == 0,
                        name=conv_name)
                    # Overwritten each iteration; final value is the
                    # stage output.
                    blocks['block_' + str(block + 2)] = conv
        else:
            for block in range(len(depth)):
                for i in range(depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    conv = self.basic_block(
                        input=conv,
                        num_filters=num_filters[block],
                        stride=2 if i == 0 and block != 0 else 1,
                        if_first=block == i == 0,
                        name=conv_name)
                    blocks['block_' + str(block + 2)] = conv
        return blocks

    def conv_bn_layer(self,
                      input,
                      num_filters,
                      filter_size,
                      stride=1,
                      groups=1,
                      act=None,
                      name=None):
        """Bias-free conv with 'same'-style padding, followed by BatchNorm."""
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            # e.g. "res2a_branch2a" -> "bn2a_branch2a" (pretrained naming).
            bn_name = "bn" + name[3:]
        return fluid.layers.batch_norm(
            input=conv,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def conv_bn_layer_new(self,
                          input,
                          num_filters,
                          filter_size,
                          stride=1,
                          groups=1,
                          act=None,
                          name=None):
        """Downsample path used by shortcuts: 2x2 avg-pool, then a
        stride-1 conv + BN (the ResNet-vd trick).

        NOTE(review): the `stride` parameter is accepted but unused (the
        pool is fixed at 2x2 and the conv stride at 1) — kept for
        signature compatibility with conv_bn_layer.
        """
        pool = fluid.layers.pool2d(
            input=input,
            pool_size=2,
            pool_stride=2,
            pool_padding=0,
            pool_type='avg',
            ceil_mode=True)
        conv = fluid.layers.conv2d(
            input=pool,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=1,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        return fluid.layers.batch_norm(
            input=conv,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def shortcut(self, input, ch_out, stride, name, if_first=False):
        """Identity when channels and stride already match, otherwise a
        1x1 projection (pooled vd-style projection except on the very
        first block)."""
        ch_in = input.shape[1]
        if ch_in != ch_out or stride != 1:
            if if_first:
                return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
            else:
                return self.conv_bn_layer_new(
                    input, ch_out, 1, stride, name=name)
        elif if_first:
            return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
        else:
            return input

    def bottleneck_block(self, input, num_filters, stride, name, if_first):
        """1x1 reduce -> 3x3 (carries the stride) -> 1x1 expand (x4),
        plus residual shortcut; ReLU applied after the add."""
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=1,
            act='relu',
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        conv2 = self.conv_bn_layer(
            input=conv1,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None,
            name=name + "_branch2c")
        short = self.shortcut(
            input,
            num_filters * 4,
            stride,
            if_first=if_first,
            name=name + "_branch1")
        return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')

    def basic_block(self, input, num_filters, stride, name, if_first):
        """Two 3x3 convs with a residual shortcut (ResNet-18/34 block)."""
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=3,
            act='relu',
            stride=stride,
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            act=None,
            name=name + "_branch2b")
        short = self.shortcut(
            input,
            num_filters,
            stride,
            if_first=if_first,
            name=name + "_branch1")
        return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import nn
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
from ppocr.modeling.backbones.det_mobilenet_v3 import ResidualUnit, ConvBNLayer, make_divisible
__all__ = [
'MobileNetV3', 'MobileNetV3_small_x0_35', 'MobileNetV3_small_x0_5',
'MobileNetV3_small_x0_75', 'MobileNetV3_small_x1_0',
'MobileNetV3_small_x1_25', 'MobileNetV3_large_x0_35',
'MobileNetV3_large_x0_5', 'MobileNetV3_large_x0_75',
'MobileNetV3_large_x1_0', 'MobileNetV3_large_x1_25'
]
__all__ = ['MobileNetV3']
class MobileNetV3():
def __init__(self, params):
self.scale = params.get("scale", 0.5)
model_name = params.get("model_name", "small")
large_stride = params.get("large_stride", [1, 2, 2, 2])
small_stride = params.get("small_stride", [2, 2, 2, 2])
class MobileNetV3(nn.Layer):
def __init__(self,
in_channels=3,
model_name='small',
scale=0.5,
large_stride=None,
small_stride=None,
**kwargs):
super(MobileNetV3, self).__init__()
if small_stride is None:
small_stride = [2, 2, 2, 2]
if large_stride is None:
large_stride = [1, 2, 2, 2]
assert isinstance(large_stride, list), "large_stride type must " \
"be list but got {}".format(type(large_stride))
"be list but got {}".format(type(large_stride))
assert isinstance(small_stride, list), "small_stride type must " \
"be list but got {}".format(type(small_stride))
"be list but got {}".format(type(small_stride))
assert len(large_stride) == 4, "large_stride length must be " \
"4 but got {}".format(len(large_stride))
"4 but got {}".format(len(large_stride))
assert len(small_stride) == 4, "small_stride length must be " \
"4 but got {}".format(len(small_stride))
"4 but got {}".format(len(small_stride))
self.inplanes = 16
if model_name == "large":
self.cfg = [
cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, False, 'relu', large_stride[0]],
[3, 64, 24, False, 'relu', (large_stride[1], 1)],
......@@ -65,10 +61,9 @@ class MobileNetV3():
[5, 960, 160, True, 'hard_swish', 1],
[5, 960, 160, True, 'hard_swish', 1],
]
self.cls_ch_squeeze = 960
self.cls_ch_expand = 1280
cls_ch_squeeze = 960
elif model_name == "small":
self.cfg = [
cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, True, 'relu', (small_stride[0], 1)],
[3, 72, 24, False, 'relu', (small_stride[1], 1)],
......@@ -82,186 +77,72 @@ class MobileNetV3():
[5, 576, 96, True, 'hard_swish', 1],
[5, 576, 96, True, 'hard_swish', 1],
]
self.cls_ch_squeeze = 576
self.cls_ch_expand = 1280
cls_ch_squeeze = 576
else:
raise NotImplementedError("mode[" + model_name +
"_model] is not implemented!")
supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
assert self.scale in supported_scale, \
"supported scales are {} but input scale is {}".format(supported_scale, self.scale)
def __call__(self, input):
scale = self.scale
inplanes = self.inplanes
cfg = self.cfg
cls_ch_squeeze = self.cls_ch_squeeze
cls_ch_expand = self.cls_ch_expand
#conv1
conv = self.conv_bn_layer(
input,
filter_size=3,
num_filters=self.make_divisible(inplanes * scale),
assert scale in supported_scale, \
"supported scales are {} but input scale is {}".format(supported_scale, scale)
inplanes = 16
# conv1
self.conv1 = ConvBNLayer(
in_channels=in_channels,
out_channels=make_divisible(inplanes * scale),
kernel_size=3,
stride=2,
padding=1,
num_groups=1,
groups=1,
if_act=True,
act='hard_swish',
name='conv1')
i = 0
inplanes = self.make_divisible(inplanes * scale)
for layer_cfg in cfg:
conv = self.residual_unit(
input=conv,
num_in_filter=inplanes,
num_mid_filter=self.make_divisible(scale * layer_cfg[1]),
num_out_filter=self.make_divisible(scale * layer_cfg[2]),
act=layer_cfg[4],
stride=layer_cfg[5],
filter_size=layer_cfg[0],
use_se=layer_cfg[3],
name='conv' + str(i + 2))
inplanes = self.make_divisible(scale * layer_cfg[2])
block_list = []
inplanes = make_divisible(inplanes * scale)
for (k, exp, c, se, nl, s) in cfg:
block_list.append(
ResidualUnit(
in_channels=inplanes,
mid_channels=make_divisible(scale * exp),
out_channels=make_divisible(scale * c),
kernel_size=k,
stride=s,
use_se=se,
act=nl,
name='conv' + str(i + 2)))
inplanes = make_divisible(scale * c)
i += 1
self.blocks = nn.Sequential(*block_list)
conv = self.conv_bn_layer(
input=conv,
filter_size=1,
num_filters=self.make_divisible(scale * cls_ch_squeeze),
self.conv2 = ConvBNLayer(
in_channels=inplanes,
out_channels=make_divisible(scale * cls_ch_squeeze),
kernel_size=1,
stride=1,
padding=0,
num_groups=1,
groups=1,
if_act=True,
act='hard_swish',
name='conv_last')
conv = fluid.layers.pool2d(
input=conv,
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='max')
return conv
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride,
padding,
num_groups=1,
if_act=True,
act=None,
name=None,
use_cudnn=True,
res_last_bn_init=False):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(name=name + '_weights'),
bias_attr=False)
bn_name = name + '_bn'
bn = fluid.layers.batch_norm(
input=conv,
param_attr=ParamAttr(
name=bn_name + "_scale",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0)),
bias_attr=ParamAttr(
name=bn_name + "_offset",
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0)),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
if if_act:
if act == 'relu':
bn = fluid.layers.relu(bn)
elif act == 'hard_swish':
bn = fluid.layers.hard_swish(bn)
return bn
def make_divisible(self, v, divisor=8, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
def se_block(self, input, num_out_filter, ratio=4, name=None):
num_mid_filter = num_out_filter // ratio
pool = fluid.layers.pool2d(
input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
conv1 = fluid.layers.conv2d(
input=pool,
filter_size=1,
num_filters=num_mid_filter,
act='relu',
param_attr=ParamAttr(name=name + '_1_weights'),
bias_attr=ParamAttr(name=name + '_1_offset'))
conv2 = fluid.layers.conv2d(
input=conv1,
filter_size=1,
num_filters=num_out_filter,
act='hard_sigmoid',
param_attr=ParamAttr(name=name + '_2_weights'),
bias_attr=ParamAttr(name=name + '_2_offset'))
scale = fluid.layers.elementwise_mul(x=input, y=conv2, axis=0)
return scale
def residual_unit(self,
input,
num_in_filter,
num_mid_filter,
num_out_filter,
stride,
filter_size,
act=None,
use_se=False,
name=None):
conv0 = self.conv_bn_layer(
input=input,
filter_size=1,
num_filters=num_mid_filter,
stride=1,
padding=0,
if_act=True,
act=act,
name=name + '_expand')
conv1 = self.conv_bn_layer(
input=conv0,
filter_size=filter_size,
num_filters=num_mid_filter,
stride=stride,
padding=int((filter_size - 1) // 2),
if_act=True,
act=act,
num_groups=num_mid_filter,
use_cudnn=False,
name=name + '_depthwise')
if use_se:
conv1 = self.se_block(
input=conv1, num_out_filter=num_mid_filter, name=name + '_se')
conv2 = self.conv_bn_layer(
input=conv1,
filter_size=1,
num_filters=num_out_filter,
stride=1,
padding=0,
if_act=False,
name=name + '_linear',
res_last_bn_init=True)
if num_in_filter != num_out_filter or stride != 1:
return conv2
else:
return fluid.layers.elementwise_add(x=input, y=conv2, act=None)
self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
self.out_channels = make_divisible(scale * cls_ch_squeeze)
def forward(self, x):
x = self.conv1(x)
x = self.blocks(x)
x = self.conv2(x)
x = self.pool(x)
return x
if __name__ == '__main__':
    # Smoke test: push a zero image (N=1, C=3, H=32, W=320) through the
    # small MobileNetV3 backbone in dygraph mode and print the output shape.
    import paddle
    paddle.disable_static()
    x = paddle.zeros((1, 3, 32, 320))
    # NOTE(review): paddle.to_variable was the 2.0-beta API; newer Paddle
    # releases use paddle.to_tensor — left unchanged for this code's era.
    x = paddle.to_variable(x)
    net = MobileNetV3(model_name='small', small_stride=[1, 2, 2, 2])
    y = net(x)
    print(y.shape)
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"ResNet", "ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"
]
# Global switch: whether backbone parameters are updated during training.
Trainable = True
# Shared ParamAttr reused by the FPN fusion layers below; "nolr" suggests
# it was once a zero-learning-rate attr — as written it only carries the
# trainable flag. TODO confirm intent.
w_nolr = fluid.ParamAttr(trainable=Trainable)
# Default ImageNet-style training hyper-parameters (input normalization
# stats and a piecewise-decay LR schedule); stored on the backbone as
# self.params.
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
# ResNet backbone with an FPN-style top-down fusion head (static-graph /
# fluid API), used by the SRN recognition model.
# NOTE(review): the original indentation of this block was lost in this
# dump; statements are preserved byte-for-byte, with comments added. In
# particular, whether F.append(conv) sat inside the per-block loop or at
# stage level cannot be recovered from here — confirm against upstream
# before re-indenting.
class ResNet():
def __init__(self, params):
# 'layers' selects the ResNet depth (one of 18/34/50/101/152).
self.layers = params['layers']
# Module-level default training hyper-parameters.
self.params = train_parameters
def __call__(self, input):
layers = self.layers
supported_layers = [18, 34, 50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
# Residual blocks per stage for each depth.
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
# First-block stride per stage (bottleneck path only); the last two
# stages keep spatial resolution.
stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)]
num_filters = [64, 128, 256, 512]
# Stem: a single 7x7 stride-2 conv + BN + ReLU.
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name="conv1")
# F collects feature maps consumed by the top-down fusion below.
F = []
if layers >= 50:
for block in range(len(depth)):
for i in range(depth[block]):
# Deep nets name stage-3 blocks "res4b<i>" to match
# released pretrained weight names.
if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=stride_list[block] if i == 0 else 1,
name=conv_name)
F.append(conv)
else:
for block in range(len(depth)):
for i in range(depth[block]):
conv_name = "res" + str(block + 2) + chr(97 + i)
# Basic-block path halves only the width on stage entry
# (text images are short in height).
if i == 0 and block != 0:
stride = (2, 1)
else:
stride = (1, 1)
conv = self.basic_block(
input=conv,
num_filters=num_filters[block],
stride=stride,
if_first=block == i == 0,
name=conv_name)
F.append(conv)
# Top-down fusion: start from the deepest map and merge the two
# previous entries of F into it.
base = F[-1]
for i in [-2, -3]:
# NOTE(review): names suggest (w, h) but NCHW shape order is
# (batch, channel, height, width) — behavior only relies on the
# equality test, so the naming is harmless.
b, c, w, h = F[i].shape
if (w, h) == base.shape[2:]:
# Same resolution: no upsampling needed (no-op kept as-is).
base = base
else:
# Upsample 2x with a learned deconv, then BN + ReLU.
base = fluid.layers.conv2d_transpose(
input=base,
num_filters=c,
filter_size=4,
stride=2,
padding=1,
act=None,
param_attr=w_nolr,
bias_attr=w_nolr)
base = fluid.layers.batch_norm(
base, act="relu", param_attr=w_nolr, bias_attr=w_nolr)
# Concat along channels, then 1x1 + 3x3 convs to re-mix.
base = fluid.layers.concat([base, F[i]], axis=1)
base = fluid.layers.conv2d(
base,
num_filters=c,
filter_size=1,
param_attr=w_nolr,
bias_attr=w_nolr)
base = fluid.layers.conv2d(
base,
num_filters=c,
filter_size=3,
padding=1,
param_attr=w_nolr,
bias_attr=w_nolr)
base = fluid.layers.batch_norm(
base, act="relu", param_attr=w_nolr, bias_attr=w_nolr)
# Project the fused map to a fixed 512 channels.
base = fluid.layers.conv2d(
base,
num_filters=512,
filter_size=1,
bias_attr=w_nolr,
param_attr=w_nolr)
return base
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
# When stride == (1, 1) the conv is swapped for a 2x2 kernel with
# dilation 2 — presumably to widen the receptive field without
# downsampling; note padding still uses the original filter_size.
# TODO confirm this against the SRN reference implementation.
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=2 if stride == (1, 1) else filter_size,
dilation=2 if stride == (1, 1) else 1,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(
name=name + "_weights", trainable=Trainable),
bias_attr=False,
name=name + '.conv2d.output.1')
if name == "conv1":
bn_name = "bn_" + name
else:
# e.g. "res2a_branch2a" -> "bn2a_branch2a" (pretrained naming).
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
name=bn_name + '.output.1',
param_attr=ParamAttr(
name=bn_name + '_scale', trainable=Trainable),
bias_attr=ParamAttr(
bn_name + '_offset', trainable=Trainable),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance', )
def shortcut(self, input, ch_out, stride, is_first, name):
# 1x1 projection whenever channels/stride change or on the first
# block; identity otherwise. (Style nit: `is_first == True` could be
# just `is_first` — left unchanged.)
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1 or is_first == True:
if stride == (1, 1):
return self.conv_bn_layer(input, ch_out, 1, 1, name=name)
else: #stride == (2,2)
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters, stride, name, if_first):
# 1x1 reduce -> 3x3 (carries stride) -> 1x1 expand (x4) + shortcut.
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_branch2c")
short = self.shortcut(
input,
num_filters * 4,
stride,
is_first=False,
name=name + "_branch1")
return fluid.layers.elementwise_add(
x=short, y=conv2, act='relu', name=name + ".add.output.5")
def basic_block(self, input, num_filters, stride, is_first, name):
# Two 3x3 convs with a residual shortcut (ResNet-18/34 block).
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
stride=stride,
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
act=None,
name=name + "_branch2b")
short = self.shortcut(
input, num_filters, stride, is_first, name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from paddle import nn, ParamAttr
from paddle.nn import functional as F
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ["ResNet"]
__all__ = [
"ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd",
"ResNet152_vd", "ResNet200_vd"
]
class ResNet(nn.Layer):
def __init__(self, in_channels=3, layers=34):
super(ResNet, self).__init__()
supported_layers = {
18: {
'depth': [2, 2, 2, 2],
'block_class': BasicBlock
},
34: {
'depth': [3, 4, 6, 3],
'block_class': BasicBlock
},
50: {
'depth': [3, 4, 6, 3],
'block_class': BottleneckBlock
},
101: {
'depth': [3, 4, 23, 3],
'block_class': BottleneckBlock
},
152: {
'depth': [3, 8, 36, 3],
'block_class': BottleneckBlock
},
200: {
'depth': [3, 12, 48, 3],
'block_class': BottleneckBlock
}
}
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers.keys(), layers)
is_3x3 = True
class ResNet():
def __init__(self, params):
self.layers = params['layers']
self.is_3x3 = True
supported_layers = [18, 34, 50, 101, 152, 200]
assert self.layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, self.layers)
def __call__(self, input):
is_3x3 = self.is_3x3
layers = self.layers
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
elif layers == 200:
depth = [3, 12, 48, 3]
num_filters = [64, 128, 256, 512]
depth = supported_layers[layers]['depth']
block_class = supported_layers[layers]['block_class']
conv = []
if is_3x3 == False:
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=1,
act='relu')
conv.append(
ConvBNLayer(
in_channels=in_channels,
out_channels=64,
kernel_size=7,
stride=1,
act='relu'))
else:
conv = self.conv_bn_layer(
input=input,
num_filters=32,
filter_size=3,
stride=1,
act='relu',
name='conv1_1')
conv = self.conv_bn_layer(
input=conv,
num_filters=32,
filter_size=3,
stride=1,
act='relu',
name='conv1_2')
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv1_3')
conv.append(
ConvBNLayer(
in_channels=in_channels,
out_channels=32,
kernel_size=3,
stride=1,
act='relu',
name='conv1_1'))
conv.append(
ConvBNLayer(
in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
act='relu',
name='conv1_2'))
conv.append(
ConvBNLayer(
in_channels=32,
out_channels=64,
kernel_size=3,
stride=1,
act='relu',
name='conv1_3'))
self.conv1 = nn.Sequential(*conv)
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
self.pool = nn.MaxPool2d(
kernel_size=3,
stride=2,
padding=1, )
if layers >= 50:
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152, 200] and block == 2:
block_list = []
in_ch = 64
for block_index in range(len(depth)):
for i in range(depth[block_index]):
if layers >= 50:
if layers in [101, 152, 200] and block_index == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
conv_name = "res" + str(block_index + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
if i == 0 and block != 0:
stride = (2, 1)
else:
stride = (1, 1)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=stride,
if_first=block == i == 0,
name=conv_name)
else:
for block in range(len(depth)):
for i in range(depth[block]):
conv_name = "res" + str(block + 2) + chr(97 + i)
if i == 0 and block != 0:
stride = (2, 1)
conv_name = "res" + str(block_index +
2) + "b" + str(i)
else:
stride = (1, 1)
conv = self.basic_block(
input=conv,
num_filters=num_filters[block],
conv_name = "res" + str(block_index + 2) + chr(97 + i)
else:
conv_name = "res" + str(block_index + 2) + chr(97 + i)
if i == 0 and block_index != 0:
stride = (2, 1)
else:
stride = (1, 1)
block_list.append(
block_class(
in_channels=in_ch,
out_channels=num_filters[block_index],
stride=stride,
if_first=block == i == 0,
name=conv_name)
if_first=block_index == i == 0,
name=conv_name))
in_ch = block_list[-1].out_channels
self.block_list = nn.Sequential(*block_list)
self.add_sublayer(sublayer=self.block_list, name="block_list")
self.pool_out = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
self.out_channels = in_ch
conv = fluid.layers.pool2d(
input=conv,
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='max')
def forward(self, x):
x = self.conv1(x)
x = self.pool(x)
x = self.block_list(x)
x = self.pool_out(x)
return x
return conv
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
groups=1,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self.conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=(filter_size - 1) // 2,
padding=(kernel_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
weight_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
self.bn = nn.BatchNorm(
num_channels=out_channels,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + "_mean",
moving_variance_name=bn_name + "_variance")
def __call__(self, x):
x = self.conv(x)
x = self.bn(x)
return x
def conv_bn_layer_new(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
pool = fluid.layers.pool2d(
input=input,
pool_size=stride,
pool_stride=stride,
pool_padding=0,
pool_type='avg',
ceil_mode=True)
conv = fluid.layers.conv2d(
input=pool,
num_filters=num_filters,
filter_size=filter_size,
class ConvBNLayerNew(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
groups=1,
act=None,
name=None):
super(ConvBNLayerNew, self).__init__()
self.pool = nn.AvgPool2d(
kernel_size=stride, stride=stride, padding=0, ceil_mode=True)
self.conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=1,
padding=(filter_size - 1) // 2,
padding=(kernel_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
weight_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
self.bn = nn.BatchNorm(
num_channels=out_channels,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + "_mean",
moving_variance_name=bn_name + "_variance")
def __call__(self, x):
x = self.pool(x)
x = self.conv(x)
x = self.bn(x)
return x
class ShortCut(nn.Layer):
def __init__(self, in_channels, out_channels, stride, name, if_first=False):
super(ShortCut, self).__init__()
self.use_conv = True
def shortcut(self, input, ch_out, stride, name, if_first=False):
ch_in = input.shape[1]
if ch_in != ch_out or stride[0] != 1:
if in_channels != out_channels or stride[0] != 1:
if if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
self.conv = ConvBNLayer(
in_channels, out_channels, 1, stride, name=name)
else:
return self.conv_bn_layer_new(
input, ch_out, 1, stride, name=name)
self.conv = ConvBNLayerNew(
in_channels, out_channels, 1, stride, name=name)
elif if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
self.conv = ConvBNLayer(
in_channels, out_channels, 1, stride, name=name)
else:
return input
self.use_conv = False
def bottleneck_block(self, input, num_filters, stride, name, if_first):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
def forward(self, x):
if self.use_conv:
x = self.conv(x)
return x
class BottleneckBlock(nn.Layer):
def __init__(self, in_channels, out_channels, stride, name, if_first):
super(BottleneckBlock, self).__init__()
self.conv0 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
act='relu',
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
self.conv1 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 4,
filter_size=1,
self.conv2 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels * 4,
kernel_size=1,
act=None,
name=name + "_branch2c")
short = self.shortcut(
input,
num_filters * 4,
stride,
self.short = ShortCut(
in_channels=in_channels,
out_channels=out_channels * 4,
stride=stride,
if_first=if_first,
name=name + "_branch1")
self.out_channels = out_channels * 4
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def forward(self, x):
y = self.conv0(x)
y = self.conv1(y)
y = self.conv2(y)
y = y + self.short(x)
y = F.relu(y)
return y
# NOTE(review): diff residue — the removed static-graph `basic_block`
# is interleaved with the new dygraph `BasicBlock`; the `conv0/conv1/short =`
# fragments below belong to the old API, not to this class.
def basic_block(self, input, num_filters, stride, name, if_first):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
class BasicBlock(nn.Layer):
# ResNet basic block: two 3x3 convs (first carries the stride) plus a
# ShortCut branch; ReLU after the residual add.
def __init__(self, in_channels, out_channels, stride, name, if_first):
super(BasicBlock, self).__init__()
self.conv0 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
act='relu',
stride=stride,
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
self.conv1 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
act=None,
name=name + "_branch2b")
short = self.shortcut(
input,
num_filters,
stride,
self.short = ShortCut(
in_channels=in_channels,
out_channels=out_channels,
stride=stride,
if_first=if_first,
name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
self.out_channels = out_channels
def forward(self, x):
y = self.conv0(x)
y = self.conv1(y)
y = y + self.short(x)
return F.relu(y)
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
import math
def get_para_bias_attr(l2_decay, k, name):
    """Build the [weight, bias] ParamAttr pair for a layer.

    Both attrs share one L2Decay(l2_decay) regularizer and a
    Uniform(-stdv, stdv) initializer with stdv = 1 / sqrt(k).
    """
    stdv = 1.0 / math.sqrt(k * 1.0)
    shared = dict(
        regularizer=fluid.regularizer.L2Decay(l2_decay),
        initializer=fluid.initializer.Uniform(-stdv, stdv))
    return [
        fluid.ParamAttr(name=name + "_w_attr", **shared),
        fluid.ParamAttr(name=name + "_b_attr", **shared),
    ]
def conv_bn_layer(input,
                  num_filters,
                  filter_size,
                  stride=1,
                  groups=1,
                  act=None,
                  name=None):
    """Bias-free conv2d followed by batch norm; `act` is fused into the BN op."""
    same_pad = (filter_size - 1) // 2  # "same"-style padding for odd kernels
    conv_out = fluid.layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=same_pad,
        groups=groups,
        act=None,  # activation deferred to batch_norm below
        param_attr=ParamAttr(name=name + "_weights"),
        bias_attr=False,
        name=name + '.conv2d')
    bn_prefix = "bn_" + name
    return fluid.layers.batch_norm(
        input=conv_out,
        act=act,
        name=bn_prefix + '.output',
        param_attr=ParamAttr(name=bn_prefix + '_scale'),
        bias_attr=ParamAttr(bn_prefix + '_offset'),
        moving_mean_name=bn_prefix + '_mean',
        moving_variance_name=bn_prefix + '_variance')
def deconv_bn_layer(input,
                    num_filters,
                    filter_size=4,
                    stride=2,
                    act='relu',
                    name=None):
    """Bias-free conv2d_transpose (2x upsample by default) followed by
    batch norm; `act` is fused into the BN op."""
    deconv_out = fluid.layers.conv2d_transpose(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=1,
        act=None,  # activation deferred to batch_norm below
        param_attr=ParamAttr(name=name + "_weights"),
        bias_attr=False,
        name=name + '.deconv2d')
    bn_prefix = "bn_" + name
    return fluid.layers.batch_norm(
        input=deconv_out,
        act=act,
        name=bn_prefix + '.output',
        param_attr=ParamAttr(name=bn_prefix + '_scale'),
        bias_attr=ParamAttr(bn_prefix + '_offset'),
        moving_mean_name=bn_prefix + '_mean',
        moving_variance_name=bn_prefix + '_variance')
def create_tmp_var(program, name, dtype, shape, lod_level=0):
    """Create an intermediate variable inside `program`'s current block."""
    block = program.current_block()
    return block.create_var(
        name=name, dtype=dtype, shape=shape, lod_level=lod_level)
......@@ -11,3 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['build_head']


def build_head(config):
    """Instantiate a head module from a config dict.

    ``config['name']`` selects the head class; the remaining keys are
    forwarded to its constructor. Raises AssertionError for an
    unsupported name.
    """
    # det head
    from .det_db_head import DBHead
    # rec head
    from .rec_ctc_head import CTC

    # Explicit name -> class map instead of eval(): only whitelisted
    # classes can ever be constructed from config input, and the assert
    # message is a plain string (the original passed an Exception
    # *instance* as the message, which was never raised as an Exception).
    support_dict = {'DBHead': DBHead, 'CTC': CTC}
    module_name = config.pop('name')
    assert module_name in support_dict, \
        'head only support {}'.format(list(support_dict))
    module_class = support_dict[module_name](**config)
    return module_class
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr
import paddle.fluid as fluid
def get_bias_attr(k, name):
    """Bias ParamAttr initialized Uniform(-1/sqrt(k), 1/sqrt(k))."""
    bound = 1.0 / math.sqrt(k * 1.0)
    uniform_init = paddle.nn.initializer.Uniform(-bound, bound)
    return ParamAttr(initializer=uniform_init, name=name + "_b_attr")
# NOTE(review): diff residue — removed static-graph DBHead; its `binarize`
# method is cut off mid-call by the interleaved new `Head` class below.
class DBHead(object):
"""
Differentiable Binarization (DB) for text detection:
see https://arxiv.org/abs/1911.08947
args:
params(dict): super parameters for build DB network
"""
def __init__(self, params):
self.k = params['k']
self.inner_channels = params['inner_channels']
self.C, self.H, self.W = params['image_shape']
print(self.C, self.H, self.W)
# binarize below is truncated: only the opening conv2d call survives here.
def binarize(self, x):
conv1 = fluid.layers.conv2d(
input=x,
num_filters=self.inner_channels // 4,
filter_size=3,
# NOTE(review): diff residue — the new dygraph `Head.__init__` is
# interleaved with fragments of the removed fluid `binarize` method
# (the `conv_bn1/conv2/conv_bn2/conv3 = fluid....` assignments and the
# trailing `out/return` lines do not belong to this constructor).
class Head(nn.Layer):
# Dygraph head: conv+BN, 2x deconv+BN, final 2x deconv to 1 channel;
# `name_list` supplies explicit parameter names (presumably to match
# pretrained static-graph weight names — TODO confirm).
def __init__(self, in_channels, name_list):
super(Head, self).__init__()
self.conv1 = nn.Conv2d(
in_channels=in_channels,
out_channels=in_channels // 4,
kernel_size=3,
padding=1,
param_attr=fluid.initializer.MSRAInitializer(uniform=False),
weight_attr=ParamAttr(name=name_list[0] + '.w_0'),
bias_attr=False)
conv_bn1 = fluid.layers.batch_norm(
input=conv1,
param_attr=fluid.initializer.ConstantInitializer(value=1.0),
bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
act="relu")
conv2 = fluid.layers.conv2d_transpose(
input=conv_bn1,
num_filters=self.inner_channels // 4,
filter_size=2,
self.conv_bn1 = nn.BatchNorm(
num_channels=in_channels // 4,
param_attr=ParamAttr(
name=name_list[1] + '.w_0',
initializer=paddle.nn.initializer.Constant(value=1.0)),
bias_attr=ParamAttr(
name=name_list[1] + '.b_0',
initializer=paddle.nn.initializer.Constant(value=1e-4)),
moving_mean_name=name_list[1] + '.w_1',
moving_variance_name=name_list[1] + '.w_2',
act='relu')
self.conv2 = nn.ConvTranspose2d(
in_channels=in_channels // 4,
out_channels=in_channels // 4,
kernel_size=2,
stride=2,
param_attr=fluid.initializer.MSRAInitializer(uniform=False),
bias_attr=self._get_bias_attr(0.0004, conv_bn1.shape[1], "conv2"),
act=None)
conv_bn2 = fluid.layers.batch_norm(
input=conv2,
param_attr=fluid.initializer.ConstantInitializer(value=1.0),
bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
weight_attr=ParamAttr(
name=name_list[2] + '.w_0',
initializer=paddle.nn.initializer.MSRA(uniform=False)),
bias_attr=get_bias_attr(in_channels // 4, name_list[-1] + "conv2"))
self.conv_bn2 = nn.BatchNorm(
num_channels=in_channels // 4,
param_attr=ParamAttr(
name=name_list[3] + '.w_0',
initializer=paddle.nn.initializer.Constant(value=1.0)),
bias_attr=ParamAttr(
name=name_list[3] + '.b_0',
initializer=paddle.nn.initializer.Constant(value=1e-4)),
moving_mean_name=name_list[3] + '.w_1',
moving_variance_name=name_list[3] + '.w_2',
act="relu")
conv3 = fluid.layers.conv2d_transpose(
input=conv_bn2,
num_filters=1,
filter_size=2,
self.conv3 = nn.ConvTranspose2d(
in_channels=in_channels // 4,
out_channels=1,
kernel_size=2,
stride=2,
param_attr=fluid.initializer.MSRAInitializer(uniform=False),
bias_attr=self._get_bias_attr(0.0004, conv_bn2.shape[1], "conv3"),
act=None)
out = fluid.layers.sigmoid(conv3)
return out
weight_attr=ParamAttr(
name=name_list[4] + '.w_0',
initializer=paddle.nn.initializer.MSRA(uniform=False)),
bias_attr=get_bias_attr(in_channels // 4, name_list[-1] + "conv3"),
)
def thresh(self, x):
    """Threshold-map branch (fluid static-graph API): conv+BN,
    2x deconv+BN, then a 1-channel 2x deconv, finished with sigmoid."""
    y = fluid.layers.conv2d(
        input=x,
        num_filters=self.inner_channels // 4,
        filter_size=3,
        padding=1,
        param_attr=fluid.initializer.MSRAInitializer(uniform=False),
        bias_attr=False)
    y = fluid.layers.batch_norm(
        input=y,
        param_attr=fluid.initializer.ConstantInitializer(value=1.0),
        bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
        act="relu")
    # bias stdv keyed on the channel count of the preceding BN output
    y = fluid.layers.conv2d_transpose(
        input=y,
        num_filters=self.inner_channels // 4,
        filter_size=2,
        stride=2,
        param_attr=fluid.initializer.MSRAInitializer(uniform=False),
        bias_attr=self._get_bias_attr(0.0004, y.shape[1], "conv2"),
        act=None)
    y = fluid.layers.batch_norm(
        input=y,
        param_attr=fluid.initializer.ConstantInitializer(value=1.0),
        bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
        act="relu")
    y = fluid.layers.conv2d_transpose(
        input=y,
        num_filters=1,
        filter_size=2,
        stride=2,
        param_attr=fluid.initializer.MSRAInitializer(uniform=False),
        bias_attr=self._get_bias_attr(0.0004, y.shape[1], "conv3"),
        act=None)
    return fluid.layers.sigmoid(y)
def forward(self, x):
    """Run the head stack in order, then squash to (0, 1) with sigmoid."""
    for stage in (self.conv1, self.conv_bn1, self.conv2, self.conv_bn2,
                  self.conv3):
        x = stage(x)
    return F.sigmoid(x)
def _get_bias_attr(self, l2_decay, k, name, gradient_clip=None):
    """Bias ParamAttr with L2Decay(l2_decay) regularization and
    Uniform(-1/sqrt(k), 1/sqrt(k)) initialization.

    `gradient_clip` is accepted but unused (kept for interface
    compatibility).
    """
    bound = 1.0 / math.sqrt(k * 1.0)
    return fluid.ParamAttr(
        regularizer=fluid.regularizer.L2Decay(l2_decay),
        initializer=fluid.initializer.Uniform(-bound, bound),
        name=name + "_b_attr")
def step_function(self, x, y):
    """Differentiable binarization: 1 / (1 + exp(-k * (x - y)))."""
    scaled_gap = -self.k * (x - y)
    return fluid.layers.reciprocal(1 + fluid.layers.exp(scaled_gap))
# NOTE(review): diff residue — the new dygraph `DBHead(nn.Layer)` header is
# interleaved with the removed fluid `__call__` (the in5/in4/in3/in2 FPN
# lateral convs); `__call__` is not part of the new class.
class DBHead(nn.Layer):
"""
Differentiable Binarization (DB) for text detection:
see https://arxiv.org/abs/1911.08947
args:
params(dict): super parameters for build DB network
"""
def __call__(self, conv_features, mode="train"):
# old static-graph path: 1x1 lateral convs on the 4 backbone stages
c2, c3, c4, c5 = conv_features
param_attr = fluid.initializer.MSRAInitializer(uniform=False)
in5 = fluid.layers.conv2d(
input=c5,
num_filters=self.inner_channels,
filter_size=1,
param_attr=param_attr,
bias_attr=False)
in4 = fluid.layers.conv2d(
input=c4,
num_filters=self.inner_channels,
filter_size=1,
param_attr=param_attr,
bias_attr=False)
in3 = fluid.layers.conv2d(
input=c3,
num_filters=self.inner_channels,
filter_size=1,
param_attr=param_attr,
bias_attr=False)
in2 = fluid.layers.conv2d(
input=c2,
num_filters=self.inner_channels,
filter_size=1,
param_attr=param_attr,
bias_attr=False)
def __init__(self, in_channels, k=50, **kwargs):
    """DB head: one `Head` predicting shrink maps, one predicting
    threshold maps; `k` steers the step-function steepness.

    The explicit layer-name lists pin parameter names (presumably to
    stay compatible with released static-graph weights — TODO confirm).
    """
    super(DBHead, self).__init__()
    self.k = k
    self.binarize = Head(in_channels, [
        'conv2d_56', 'batch_norm_47', 'conv2d_transpose_0', 'batch_norm_48',
        'conv2d_transpose_1', 'binarize'
    ])
    self.thresh = Head(in_channels, [
        'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2', 'batch_norm_50',
        'conv2d_transpose_3', 'thresh'
    ])
# NOTE(review): diff residue — removed fluid FPN top-down fusion from the
# old DBHead.__call__ (upsample-by-2 then add the next lateral feature).
out4 = fluid.layers.elementwise_add(
x=fluid.layers.resize_nearest(
input=in5, scale=2), y=in4)  # 1/16
out3 = fluid.layers.elementwise_add(
x=fluid.layers.resize_nearest(
input=out4, scale=2), y=in3)  # 1/8
out2 = fluid.layers.elementwise_add(
x=fluid.layers.resize_nearest(
input=out3, scale=2), y=in2)  # 1/4
def step_function(self, x, y):
    """Differentiable binarization: 1 / (1 + exp(-k * (x - y)))."""
    logits = self.k * (x - y)
    return paddle.reciprocal(1 + paddle.exp(-logits))
# NOTE(review): diff residue — removed fluid code from the old
# DBHead.__call__: 3x3 smoothing convs on each fused level, resized back to
# the 1/4 scale before concatenation.
p5 = fluid.layers.conv2d(
input=in5,
num_filters=self.inner_channels // 4,
filter_size=3,
padding=1,
param_attr=param_attr,
bias_attr=False)
p5 = fluid.layers.resize_nearest(input=p5, scale=8)
p4 = fluid.layers.conv2d(
input=out4,
num_filters=self.inner_channels // 4,
filter_size=3,
padding=1,
param_attr=param_attr,
bias_attr=False)
p4 = fluid.layers.resize_nearest(input=p4, scale=4)
p3 = fluid.layers.conv2d(
input=out3,
num_filters=self.inner_channels // 4,
filter_size=3,
padding=1,
param_attr=param_attr,
bias_attr=False)
p3 = fluid.layers.resize_nearest(input=p3, scale=2)
p2 = fluid.layers.conv2d(
input=out2,
num_filters=self.inner_channels // 4,
filter_size=3,
padding=1,
param_attr=param_attr,
bias_attr=False)
# NOTE(review): diff residue — the new dygraph DBHead.forward is
# interleaved with the tail of the removed fluid __call__ (the `fuse`,
# `mode != "train"` and `predicts` lines belong to the old API).
def forward(self, x):
# inference needs only the shrink maps
shrink_maps = self.binarize(x)
if not self.training:
return shrink_maps
fuse = fluid.layers.concat(input=[p5, p4, p3, p2], axis=1)
shrink_maps = self.binarize(fuse)
if mode != "train":
return {"maps": shrink_maps}
threshold_maps = self.thresh(fuse)
threshold_maps = self.thresh(x)
# binary_maps is the differentiable step of shrink vs. threshold
binary_maps = self.step_function(shrink_maps, threshold_maps)
y = fluid.layers.concat(
input=[shrink_maps, threshold_maps, binary_maps], axis=1)
predicts = {}
predicts['maps'] = y
return predicts
y = paddle.concat([shrink_maps, threshold_maps, binary_maps], axis=1)
return y
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from ..common_functions import conv_bn_layer, deconv_bn_layer
from collections import OrderedDict
class EASTHead(object):
    """
    EAST: An Efficient and Accurate Scene Text Detector
    see arxiv: https://arxiv.org/abs/1704.03155
    args:
        params(dict): the super parameters for network build
    """

    def __init__(self, params):
        # "large" widens the fusion / head channel counts
        self.model_name = params['model_name']

    def unet_fusion(self, inputs):
        """U-Net style top-down fusion of backbone features.

        `inputs` is reversed so the deepest feature comes first; each step
        concatenates the upsampled previous output with the next feature.
        """
        feats = inputs[::-1]
        width = 128 if self.model_name == "large" else 64
        num_outputs = [width, width, width, width]
        g = [None, None, None, None]
        h = [None, None, None, None]
        for i in range(4):
            merged = feats[i] if i == 0 else fluid.layers.concat(
                [g[i - 1], feats[i]], axis=1)
            h[i] = conv_bn_layer(
                input=merged,
                num_filters=num_outputs[i],
                filter_size=3,
                stride=1,
                act='relu',
                name="unet_h_%d" % (i))
            if i <= 2:
                # upsample via transposed conv (can be replaced with unpool)
                g[i] = deconv_bn_layer(
                    input=h[i],
                    num_filters=num_outputs[i],
                    name="unet_g_%d" % (i))
            else:
                g[i] = conv_bn_layer(
                    input=h[i],
                    num_filters=num_outputs[i],
                    filter_size=3,
                    stride=1,
                    act='relu',
                    name="unet_g_%d" % (i))
        return g[3]

    def detector_header(self, f_common):
        """Predict the score map (sigmoid) and 8-channel geometry map."""
        if self.model_name == "large":
            num_outputs = [128, 64, 1, 8]
        else:
            num_outputs = [64, 32, 1, 8]
        f_det = f_common
        # two shared 3x3 relu stages feed both output branches
        for idx, stage_name in ((0, "det_head1"), (1, "det_head2")):
            f_det = conv_bn_layer(
                input=f_det,
                num_filters=num_outputs[idx],
                filter_size=3,
                stride=1,
                act='relu',
                name=stage_name)
        f_score = conv_bn_layer(
            input=f_det,
            num_filters=num_outputs[2],
            filter_size=1,
            stride=1,
            act=None,
            name="f_score")
        f_score = fluid.layers.sigmoid(f_score)
        f_geo = conv_bn_layer(
            input=f_det,
            num_filters=num_outputs[3],
            filter_size=1,
            stride=1,
            act=None,
            name="f_geo")
        # map sigmoid output from (0, 1) to (-800, 800)
        f_geo = (fluid.layers.sigmoid(f_geo) - 0.5) * 2 * 800
        return f_score, f_geo

    def __call__(self, inputs):
        """Fuse backbone features and return {'f_score', 'f_geo'}."""
        f_common = self.unet_fusion(inputs)
        f_score, f_geo = self.detector_header(f_common)
        predicts = OrderedDict()
        predicts['f_score'] = f_score
        predicts['f_geo'] = f_geo
        return predicts
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from ..common_functions import conv_bn_layer, deconv_bn_layer
from collections import OrderedDict
class SASTHead(object):
    """
    SAST:
    see arxiv: https://arxiv.org/abs/1908.05498
    args:
        params(dict): the super parameters for network build
    """

    def __init__(self, params):
        # `with_cab` toggles the cross-attention block applied in __call__
        self.model_name = params['model_name']
        self.with_cab = params['with_cab']

    def FPN_Up_Fusion(self, blocks):
        """
        blocks{}: contain block_2, block_3, block_4, block_5, block_6, block_7 with
        1/4, 1/8, 1/16, 1/32, 1/64, 1/128 resolution.
        """
        # deepest feature first; fused top-down with deconv upsampling
        f = [blocks['block_6'], blocks['block_5'], blocks['block_4'], blocks['block_3'], blocks['block_2']]
        num_outputs = [256, 256, 192, 192, 128]
        g = [None, None, None, None, None]
        h = [None, None, None, None, None]
        # 1x1 lateral convs normalize channel counts
        for i in range(5):
            h[i] = conv_bn_layer(input=f[i], num_filters=num_outputs[i],
                                 filter_size=1, stride=1, act=None, name='fpn_up_h'+str(i))
        for i in range(4):
            if i == 0:
                g[i] = deconv_bn_layer(input=h[i], num_filters=num_outputs[i + 1], act=None, name='fpn_up_g0')
                #print("g[{}] shape: {}".format(i, g[i].shape))
            else:
                # add previous level, relu, refine, then upsample to the next
                g[i] = fluid.layers.elementwise_add(x=g[i - 1], y=h[i])
                g[i] = fluid.layers.relu(g[i])
                #g[i] = conv_bn_layer(input=g[i], num_filters=num_outputs[i],
                #                     filter_size=1, stride=1, act='relu')
                g[i] = conv_bn_layer(input=g[i], num_filters=num_outputs[i],
                                     filter_size=3, stride=1, act='relu', name='fpn_up_g%d_1'%i)
                g[i] = deconv_bn_layer(input=g[i], num_filters=num_outputs[i + 1], act=None, name='fpn_up_g%d_2'%i)
                #print("g[{}] shape: {}".format(i, g[i].shape))
        # final fusion at the shallowest level
        g[4] = fluid.layers.elementwise_add(x=g[3], y=h[4])
        g[4] = fluid.layers.relu(g[4])
        g[4] = conv_bn_layer(input=g[4], num_filters=num_outputs[4],
                             filter_size=3, stride=1, act='relu', name='fpn_up_fusion_1')
        g[4] = conv_bn_layer(input=g[4], num_filters=num_outputs[4],
                             filter_size=1, stride=1, act=None, name='fpn_up_fusion_2')
        return g[4]

    def FPN_Down_Fusion(self, blocks):
        """
        blocks{}: contain block_2, block_3, block_4, block_5, block_6, block_7 with
        1/4, 1/8, 1/16, 1/32, 1/64, 1/128 resolution.
        """
        # shallow features fused bottom-up with stride-2 convs
        f = [blocks['block_0'], blocks['block_1'], blocks['block_2']]
        num_outputs = [32, 64, 128]
        g = [None, None, None]
        h = [None, None, None]
        for i in range(3):
            h[i] = conv_bn_layer(input=f[i], num_filters=num_outputs[i],
                                 filter_size=3, stride=1, act=None, name='fpn_down_h'+str(i))
        for i in range(2):
            if i == 0:
                g[i] = conv_bn_layer(input=h[i], num_filters=num_outputs[i+1], filter_size=3, stride=2, act=None, name='fpn_down_g0')
            else:
                g[i] = fluid.layers.elementwise_add(x=g[i - 1], y=h[i])
                g[i] = fluid.layers.relu(g[i])
                g[i] = conv_bn_layer(input=g[i], num_filters=num_outputs[i], filter_size=3, stride=1, act='relu', name='fpn_down_g%d_1'%i)
                g[i] = conv_bn_layer(input=g[i], num_filters=num_outputs[i+1], filter_size=3, stride=2, act=None, name='fpn_down_g%d_2'%i)
            # print("g[{}] shape: {}".format(i, g[i].shape))
        g[2] = fluid.layers.elementwise_add(x=g[1], y=h[2])
        g[2] = fluid.layers.relu(g[2])
        g[2] = conv_bn_layer(input=g[2], num_filters=num_outputs[2],
                             filter_size=3, stride=1, act='relu', name='fpn_down_fusion_1')
        g[2] = conv_bn_layer(input=g[2], num_filters=num_outputs[2],
                             filter_size=1, stride=1, act=None, name='fpn_down_fusion_2')
        return g[2]

    def SAST_Header1(self, f_common):
        """Detector header."""
        #f_score
        # 1-channel text score map, sigmoid-activated
        f_score = conv_bn_layer(input=f_common, num_filters=64, filter_size=1, stride=1, act='relu', name='f_score1')
        f_score = conv_bn_layer(input=f_score, num_filters=64, filter_size=3, stride=1, act='relu', name='f_score2')
        f_score = conv_bn_layer(input=f_score, num_filters=128, filter_size=1, stride=1, act='relu', name='f_score3')
        f_score = conv_bn_layer(input=f_score, num_filters=1, filter_size=3, stride=1, name='f_score4')
        f_score = fluid.layers.sigmoid(f_score)
        # print("f_score shape: {}".format(f_score.shape))
        #f_boder
        # 4-channel border offset map
        f_border = conv_bn_layer(input=f_common, num_filters=64, filter_size=1, stride=1, act='relu', name='f_border1')
        f_border = conv_bn_layer(input=f_border, num_filters=64, filter_size=3, stride=1, act='relu', name='f_border2')
        f_border = conv_bn_layer(input=f_border, num_filters=128, filter_size=1, stride=1, act='relu', name='f_border3')
        f_border = conv_bn_layer(input=f_border, num_filters=4, filter_size=3, stride=1, name='f_border4')
        # print("f_border shape: {}".format(f_border.shape))
        return f_score, f_border

    def SAST_Header2(self, f_common):
        """Detector header."""
        #f_tvo
        # 8-channel vertex-offset map
        f_tvo = conv_bn_layer(input=f_common, num_filters=64, filter_size=1, stride=1, act='relu', name='f_tvo1')
        f_tvo = conv_bn_layer(input=f_tvo, num_filters=64, filter_size=3, stride=1, act='relu', name='f_tvo2')
        f_tvo = conv_bn_layer(input=f_tvo, num_filters=128, filter_size=1, stride=1, act='relu', name='f_tvo3')
        f_tvo = conv_bn_layer(input=f_tvo, num_filters=8, filter_size=3, stride=1, name='f_tvo4')
        # print("f_tvo shape: {}".format(f_tvo.shape))
        #f_tco
        # 2-channel center-offset map
        f_tco = conv_bn_layer(input=f_common, num_filters=64, filter_size=1, stride=1, act='relu', name='f_tco1')
        f_tco = conv_bn_layer(input=f_tco, num_filters=64, filter_size=3, stride=1, act='relu', name='f_tco2')
        f_tco = conv_bn_layer(input=f_tco, num_filters=128, filter_size=1, stride=1, act='relu', name='f_tco3')
        f_tco = conv_bn_layer(input=f_tco, num_filters=2, filter_size=3, stride=1, name='f_tco4')
        # print("f_tco shape: {}".format(f_tco.shape))
        return f_tvo, f_tco

    def cross_attention(self, f_common):
        """
        """
        # Self-attention run separately along rows (horizontal) and columns
        # (vertical) with shared theta/phi/g projections, then concatenated.
        f_shape = fluid.layers.shape(f_common)
        f_theta = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, act='relu', name='f_theta')
        f_phi = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, act='relu', name='f_phi')
        f_g = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, act='relu', name='f_g')
        ### horizon
        fh_theta = f_theta
        fh_phi = f_phi
        fh_g = f_g
        #flatten
        # NCHW -> NHWC, then each row becomes one attention sequence
        fh_theta = fluid.layers.transpose(fh_theta, [0, 2, 3, 1])
        fh_theta = fluid.layers.reshape(fh_theta, [f_shape[0] * f_shape[2], f_shape[3], 128])
        fh_phi = fluid.layers.transpose(fh_phi, [0, 2, 3, 1])
        fh_phi = fluid.layers.reshape(fh_phi, [f_shape[0] * f_shape[2], f_shape[3], 128])
        fh_g = fluid.layers.transpose(fh_g, [0, 2, 3, 1])
        fh_g = fluid.layers.reshape(fh_g, [f_shape[0] * f_shape[2], f_shape[3], 128])
        #correlation
        fh_attn = fluid.layers.matmul(fh_theta, fluid.layers.transpose(fh_phi, [0, 2, 1]))
        #scale
        fh_attn = fh_attn / (128 ** 0.5)
        fh_attn = fluid.layers.softmax(fh_attn)
        #weighted sum
        fh_weight = fluid.layers.matmul(fh_attn, fh_g)
        fh_weight = fluid.layers.reshape(fh_weight, [f_shape[0], f_shape[2], f_shape[3], 128])
        # print("fh_weight: {}".format(fh_weight.shape))
        fh_weight = fluid.layers.transpose(fh_weight, [0, 3, 1, 2])
        fh_weight = conv_bn_layer(input=fh_weight, num_filters=128, filter_size=1, stride=1, name='fh_weight')
        #short cut
        fh_sc = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, name='fh_sc')
        f_h = fluid.layers.relu(fh_weight + fh_sc)
        ######
        #vertical
        # swap H and W so the same row-attention machinery runs over columns
        fv_theta = fluid.layers.transpose(f_theta, [0, 1, 3, 2])
        fv_phi = fluid.layers.transpose(f_phi, [0, 1, 3, 2])
        fv_g = fluid.layers.transpose(f_g, [0, 1, 3, 2])
        #flatten
        fv_theta = fluid.layers.transpose(fv_theta, [0, 2, 3, 1])
        fv_theta = fluid.layers.reshape(fv_theta, [f_shape[0] * f_shape[3], f_shape[2], 128])
        fv_phi = fluid.layers.transpose(fv_phi, [0, 2, 3, 1])
        fv_phi = fluid.layers.reshape(fv_phi, [f_shape[0] * f_shape[3], f_shape[2], 128])
        fv_g = fluid.layers.transpose(fv_g, [0, 2, 3, 1])
        fv_g = fluid.layers.reshape(fv_g, [f_shape[0] * f_shape[3], f_shape[2], 128])
        #correlation
        fv_attn = fluid.layers.matmul(fv_theta, fluid.layers.transpose(fv_phi, [0, 2, 1]))
        #scale
        fv_attn = fv_attn / (128 ** 0.5)
        fv_attn = fluid.layers.softmax(fv_attn)
        #weighted sum
        fv_weight = fluid.layers.matmul(fv_attn, fv_g)
        fv_weight = fluid.layers.reshape(fv_weight, [f_shape[0], f_shape[3], f_shape[2], 128])
        # print("fv_weight: {}".format(fv_weight.shape))
        fv_weight = fluid.layers.transpose(fv_weight, [0, 3, 2, 1])
        fv_weight = conv_bn_layer(input=fv_weight, num_filters=128, filter_size=1, stride=1, name='fv_weight')
        #short cut
        fv_sc = conv_bn_layer(input=f_common, num_filters=128, filter_size=1, stride=1, name='fv_sc')
        f_v = fluid.layers.relu(fv_weight + fv_sc)
        ######
        f_attn = fluid.layers.concat([f_h, f_v], axis=1)
        f_attn = conv_bn_layer(input=f_attn, num_filters=128, filter_size=1, stride=1, act='relu', name='f_attn')
        return f_attn

    def __call__(self, blocks, with_cab=False):
        # for k, v in blocks.items():
        #     print(k, v.shape)
        # NOTE(review): the `with_cab` parameter is shadowed by self.with_cab
        # below — the argument has no effect; verify intent.
        #down fpn
        f_down = self.FPN_Down_Fusion(blocks)
        # print("f_down shape: {}".format(f_down.shape))
        #up fpn
        f_up = self.FPN_Up_Fusion(blocks)
        # print("f_up shape: {}".format(f_up.shape))
        #fusion
        f_common = fluid.layers.elementwise_add(x=f_down, y=f_up)
        f_common = fluid.layers.relu(f_common)
        # print("f_common: {}".format(f_common.shape))
        if self.with_cab:
            # print('enhence f_common with CAB.')
            f_common = self.cross_attention(f_common)
        f_score, f_border= self.SAST_Header1(f_common)
        f_tvo, f_tco = self.SAST_Header2(f_common)
        predicts = OrderedDict()
        predicts['f_score'] = f_score
        predicts['f_border'] = f_border
        predicts['f_tvo'] = f_tvo
        predicts['f_tco'] = f_tco
        return predicts
\ No newline at end of file
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from .rec_seq_encoder import SequenceEncoder
import numpy as np
class AttentionPredict(object):
def __init__(self, params):
super(AttentionPredict, self).__init__()
self.char_num = params['char_num']
self.encoder = SequenceEncoder(params)
self.decoder_size = params['Attention']['decoder_size']
self.word_vector_dim = params['Attention']['word_vector_dim']
self.encoder_type = params['encoder_type']
self.max_length = params['max_text_length']
def simple_attention(self, encoder_vec, encoder_proj, decoder_state,
decoder_size):
decoder_state_proj = layers.fc(input=decoder_state,
size=decoder_size,
bias_attr=False,
name="decoder_state_proj_fc")
decoder_state_expand = layers.sequence_expand(
x=decoder_state_proj, y=encoder_proj)
concated = layers.elementwise_add(encoder_proj, decoder_state_expand)
concated = layers.tanh(x=concated)
attention_weights = layers.fc(input=concated,
size=1,
act=None,
bias_attr=False,
name="attention_weights_fc")
attention_weights = layers.sequence_softmax(input=attention_weights)
weigths_reshape = layers.reshape(x=attention_weights, shape=[-1])
scaled = layers.elementwise_mul(
x=encoder_vec, y=weigths_reshape, axis=0)
context = layers.sequence_pool(input=scaled, pool_type='sum')
return context
def gru_decoder_with_attention(self, target_embedding, encoder_vec,
encoder_proj, decoder_boot, decoder_size,
char_num):
rnn = layers.DynamicRNN()
with rnn.block():
current_word = rnn.step_input(target_embedding)
encoder_vec = rnn.static_input(encoder_vec)
encoder_proj = rnn.static_input(encoder_proj)
hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
context = self.simple_attention(encoder_vec, encoder_proj,
hidden_mem, decoder_size)
fc_1 = layers.fc(input=context,
size=decoder_size * 3,
bias_attr=False,
name="rnn_fc1")
fc_2 = layers.fc(input=current_word,
size=decoder_size * 3,
bias_attr=False,
name="rnn_fc2")
decoder_inputs = fc_1 + fc_2
h, _, _ = layers.gru_unit(
input=decoder_inputs, hidden=hidden_mem, size=decoder_size * 3)
rnn.update_memory(hidden_mem, h)
out = layers.fc(input=h,
size=char_num,
bias_attr=True,
act='softmax',
name="rnn_out_fc")
rnn.output(out)
return rnn()
def gru_attention_infer(self, decoder_boot, max_length, char_num,
word_vector_dim, encoded_vector, encoded_proj,
decoder_size):
init_state = decoder_boot
beam_size = 1
array_len = layers.fill_constant(
shape=[1], dtype='int64', value=max_length)
counter = layers.zeros(shape=[1], dtype='int64', force_cpu=True)
# fill the first element with init_state
state_array = layers.create_array('float32')
layers.array_write(init_state, array=state_array, i=counter)
# ids, scores as memory
ids_array = layers.create_array('int64')
scores_array = layers.create_array('float32')
rois_shape = layers.shape(init_state)
batch_size = layers.slice(
rois_shape, axes=[0], starts=[0], ends=[1]) + 1
lod_level = layers.range(
start=0, end=batch_size, step=1, dtype=batch_size.dtype)
init_ids = layers.fill_constant_batch_size_like(
input=init_state, shape=[-1, 1], value=0, dtype='int64')
init_ids = layers.lod_reset(init_ids, lod_level)
init_ids = layers.lod_append(init_ids, lod_level)
init_scores = layers.fill_constant_batch_size_like(
input=init_state, shape=[-1, 1], value=1, dtype='float32')
init_scores = layers.lod_reset(init_scores, init_ids)
layers.array_write(init_ids, array=ids_array, i=counter)
layers.array_write(init_scores, array=scores_array, i=counter)
full_ids = fluid.layers.fill_constant_batch_size_like(
input=init_state, shape=[-1, 1], dtype='int64', value=1)
full_scores = fluid.layers.fill_constant_batch_size_like(
input=init_state, shape=[-1, 1], dtype='float32', value=1)
cond = layers.less_than(x=counter, y=array_len)
while_op = layers.While(cond=cond)
with while_op.block():
pre_ids = layers.array_read(array=ids_array, i=counter)
pre_state = layers.array_read(array=state_array, i=counter)
pre_score = layers.array_read(array=scores_array, i=counter)
pre_ids_emb = layers.embedding(
input=pre_ids,
size=[char_num, word_vector_dim],
dtype='float32')
context = self.simple_attention(encoded_vector, encoded_proj,
pre_state, decoder_size)
# expand the recursive_sequence_lengths of pre_state
# to be the same with pre_score
pre_state_expanded = layers.sequence_expand(pre_state, pre_score)
context_expanded = layers.sequence_expand(context, pre_score)
fc_1 = layers.fc(input=context_expanded,
size=decoder_size * 3,
bias_attr=False,
name="rnn_fc1")
fc_2 = layers.fc(input=pre_ids_emb,
size=decoder_size * 3,
bias_attr=False,
name="rnn_fc2")
decoder_inputs = fc_1 + fc_2
current_state, _, _ = layers.gru_unit(
input=decoder_inputs,
hidden=pre_state_expanded,
size=decoder_size * 3)
current_state_with_lod = layers.lod_reset(
x=current_state, y=pre_score)
# use score to do beam search
current_score = layers.fc(input=current_state_with_lod,
size=char_num,
bias_attr=True,
act='softmax',
name="rnn_out_fc")
topk_scores, topk_indices = layers.topk(current_score, k=beam_size)
new_ids = fluid.layers.concat([full_ids, topk_indices], axis=1)
fluid.layers.assign(new_ids, full_ids)
new_scores = fluid.layers.concat([full_scores, topk_scores], axis=1)
fluid.layers.assign(new_scores, full_scores)
layers.increment(x=counter, value=1, in_place=True)
# update the memories
layers.array_write(current_state, array=state_array, i=counter)
layers.array_write(topk_indices, array=ids_array, i=counter)
layers.array_write(topk_scores, array=scores_array, i=counter)
# update the break condition:
# up to the max length or all candidates of
# source sentences have ended.
length_cond = layers.less_than(x=counter, y=array_len)
finish_cond = layers.logical_not(layers.is_empty(x=topk_indices))
layers.logical_and(x=length_cond, y=finish_cond, out=cond)
return full_ids, full_scores
    def __call__(self, inputs, labels=None, mode=None):
        """Build the attention-head graph (legacy static graph).

        Args:
            inputs: feature map fed to the sequence encoder.
            labels: dict with 'label_in'/'label_out' LoD tensors; required
                only when mode == "train".
            mode: "train" uses teacher forcing via gru_decoder_with_attention;
                any other value runs beam-search inference.

        Returns:
            dict with 'predict' (per-step class probabilities) and
            'decoded_out' (decoded character ids).
        """
        encoder_features = self.encoder(inputs)
        char_num = self.char_num
        word_vector_dim = self.word_vector_dim
        decoder_size = self.decoder_size
        if self.encoder_type == "reshape":
            # Reshape-only encoder returns a single sequence tensor.
            encoder_input = encoder_features
            encoded_vector = encoder_features
        else:
            # RNN encoder returns [forward, backward]; attend over the concat
            # but boot the decoder from the backward direction only.
            encoder_input = encoder_features[1]
            encoded_vector = layers.concat(encoder_features, axis=1)
        encoded_proj = layers.fc(input=encoded_vector,
                                 size=decoder_size,
                                 bias_attr=False,
                                 name="encoded_proj_fc")
        # First element of the (reversed) sequence initializes decoder state.
        backward_first = layers.sequence_pool(
            input=encoder_input, pool_type='first')
        decoder_boot = layers.fc(input=backward_first,
                                 size=decoder_size,
                                 bias_attr=False,
                                 act="relu",
                                 name='decoder_boot')
        if mode == "train":
            label_in = labels['label_in']
            label_out = labels['label_out']
            label_in = layers.cast(x=label_in, dtype='int64')
            trg_embedding = layers.embedding(
                input=label_in,
                size=[char_num, word_vector_dim],
                dtype='float32')
            predict = self.gru_decoder_with_attention(
                trg_embedding, encoded_vector, encoded_proj, decoder_boot,
                decoder_size, char_num)
            # Greedy decode (top-1) for monitoring during training.
            _, decoded_out = layers.topk(input=predict, k=1)
            decoded_out = layers.lod_reset(decoded_out, y=label_out)
            predicts = {'predict':predict, 'decoded_out':decoded_out}
        else:
            ids, predict = self.gru_attention_infer(
                decoder_boot, self.max_length, char_num, word_vector_dim,
                encoded_vector, encoded_proj, decoder_size)
            predicts = {'predict':predict, 'decoded_out':ids}
        return predicts
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
......@@ -19,34 +19,33 @@ from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from .rec_seq_encoder import SequenceEncoder
from ..common_functions import get_para_bias_attr
import numpy as np
class CTCPredict(object):
    """CTC prediction head (legacy static graph).

    Encodes the input feature map with a SequenceEncoder, projects to
    ``char_num + 1`` classes (the extra class is the CTC blank) with a single
    FC layer, and greedily decodes the best path.

    Args:
        params (dict): requires 'char_num' and 'encoder_type'; optional
            'fc_decay' — L2 decay of the FC layer (default 0.0004).
    """

    def __init__(self, params):
        super(CTCPredict, self).__init__()
        self.char_num = params['char_num']
        self.encoder = SequenceEncoder(params)
        self.encoder_type = params['encoder_type']
        self.fc_decay = params.get("fc_decay", 0.0004)

    def __call__(self, inputs, labels=None, mode=None):
        """Return {'predict': class probabilities, 'decoded_out': greedy path}."""
        encoder_features = self.encoder(inputs)
        # The RNN encoder returns a list of per-direction features; merge them.
        if self.encoder_type != "reshape":
            encoder_features = fluid.layers.concat(encoder_features, axis=1)
        name = "ctc_fc"
        para_attr, bias_attr = get_para_bias_attr(
            l2_decay=self.fc_decay, k=encoder_features.shape[1], name=name)
        predict = fluid.layers.fc(input=encoder_features,
                                  size=self.char_num + 1,
                                  param_attr=para_attr,
                                  bias_attr=bias_attr,
                                  name=name)
        decoded_out = fluid.layers.ctc_greedy_decoder(
            input=predict, blank=self.char_num)
        predicts = {'predict': predict, 'decoded_out': decoded_out}
        # BUGFIX: the result dict was built but never returned, so callers
        # always received None.
        return predicts
from paddle import ParamAttr, nn
def get_para_bias_attr(l2_decay, k, name):
    """Build the [weight_attr, bias_attr] pair for an FC layer.

    Both attrs share one L2 regularizer and one Uniform(-stdv, stdv)
    initializer with stdv = 1/sqrt(k) (fan-in scaling); only their
    parameter names differ.
    """
    bound = 1.0 / math.sqrt(k * 1.0)
    init = nn.initializer.Uniform(-bound, bound)
    reg = paddle.fluid.regularizer.L2Decay(l2_decay)

    def _attr(suffix):
        return ParamAttr(
            regularizer=reg, initializer=init, name=name + suffix)

    return [_attr("_w_attr"), _attr("_b_attr")]
class CTC(nn.Layer):
    """CTC head: one FC layer projecting sequence features to class logits.

    Args:
        in_channels (int): feature dimension of the encoder output.
        out_channels (int): number of classes (including the CTC blank).
        fc_decay (float): L2 decay applied to the FC parameters.
    """

    def __init__(self, in_channels, out_channels, fc_decay=1e-5, **kwargs):
        super(CTC, self).__init__()
        w_attr, b_attr = get_para_bias_attr(
            l2_decay=fc_decay, k=in_channels, name='ctc_fc')
        self.fc = nn.Linear(
            in_channels,
            out_channels,
            weight_attr=w_attr,
            bias_attr=b_attr,
            name='ctc_fc')
        self.out_channels = out_channels

    def forward(self, x, labels=None):
        # labels are unused here; the CTC loss consumes them elsewhere.
        return self.fc(x)
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle.fluid as fluid
import paddle.fluid.layers as layers
class EncoderWithReshape(object):
    """Turn a conv feature map into a sequence by slicing it column-wise
    (im2sequence with a full-height, width-1 window)."""

    def __init__(self, params):
        super(EncoderWithReshape, self).__init__()

    def __call__(self, inputs):
        full_height = inputs.shape[2]
        return layers.im2sequence(
            input=inputs,
            stride=[1, 1],
            filter_size=[full_height, 1],
            name="sliced_feature")
class EncoderWithRNN(object):
    """Bidirectional two-stage LSTM encoder over the sliced feature sequence.

    Builds a forward stack (no == 1) and a backward stack (no == 2); each
    stack is two fc + dynamic_lstm stages. Returns both direction outputs
    as a list.
    """

    def __init__(self, params):
        super(EncoderWithRNN, self).__init__()
        self.rnn_hidden_size = params['SeqRNN']['hidden_size']

    def __call__(self, inputs):
        hidden = self.rnn_hidden_size
        prefix = "lstm"
        outputs = []
        for no in (1, 2):
            reverse = no == 2  # second stack scans the sequence backwards
            feat = inputs
            for stage in (1, 2):
                fc_name = "%s_st%d_fc%d" % (prefix, stage, no)
                fc = layers.fc(input=feat,
                               size=hidden * 4,
                               param_attr=fluid.ParamAttr(name=fc_name + "_w"),
                               bias_attr=fluid.ParamAttr(name=fc_name + "_b"),
                               name=fc_name)
                out_name = "%s_st%d_out%d" % (prefix, stage, no)
                feat, _ = layers.dynamic_lstm(
                    input=fc,
                    size=hidden * 4,
                    is_reverse=reverse,
                    param_attr=fluid.ParamAttr(name=out_name + "_w"),
                    bias_attr=fluid.ParamAttr(name=out_name + "_b"),
                    use_peepholes=False)
            outputs.append(feat)
        return outputs
class SequenceEncoder(object):
    """Dispatch to the reshape-only or reshape+RNN sequence encoder,
    selected by params['encoder_type'] ('reshape' or 'rnn')."""

    def __init__(self, params):
        super(SequenceEncoder, self).__init__()
        self.encoder_type = params['encoder_type']
        self.encoder_reshape = EncoderWithReshape(params)
        if self.encoder_type == "rnn":
            self.encoder_rnn = EncoderWithRNN(params)

    def __call__(self, inputs):
        encoder_type = self.encoder_type
        if encoder_type == "reshape":
            return self.encoder_reshape(inputs)
        if encoder_type == "rnn":
            sliced = self.encoder_reshape(inputs)
            return self.encoder_rnn(sliced)
        assert False, "Unsupport encoder_type:%s" % encoder_type
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
import numpy as np
from .self_attention.model import wrap_encoder
from .self_attention.model import wrap_encoder_forFeature
# Bound used by GradientClipByValue after the PVAM transformer encoder.
gradient_clip = 10
class SRNPredict(object):
    """SRN (Semantic Reasoning Network) recognition head, legacy static graph.

    Pipeline: PVAM (parallel visual attention) -> GSRM (global semantic
    reasoning) -> VSFD (visual-semantic fusion decoder).
    """

    def __init__(self, params):
        super(SRNPredict, self).__init__()
        self.char_num = params['char_num']
        self.max_length = params['max_text_length']
        self.num_heads = params['num_heads']
        self.num_encoder_TUs = params['num_encoder_TUs']
        self.num_decoder_TUs = params['num_decoder_TUs']
        self.hidden_dims = params['hidden_dims']

    def pvam(self, inputs, others):
        """Parallel Visual Attention Module.

        Flattens the conv feature map to a sequence, encodes it with a
        transformer encoder, then attends once per output character position.
        Returns pvam_features with shape [b, max_length, c].
        """
        # inputs assumed NCHW — flatten spatial dims into a t = h*w sequence.
        b, c, h, w = inputs.shape
        conv_features = fluid.layers.reshape(x=inputs, shape=[-1, c, h * w])
        conv_features = fluid.layers.transpose(x=conv_features, perm=[0, 2, 1])
        #===== Transformer encoder =====
        b, t, c = conv_features.shape
        encoder_word_pos = others["encoder_word_pos"]
        gsrm_word_pos = others["gsrm_word_pos"]
        enc_inputs = [conv_features, encoder_word_pos, None]
        # src_vocab_size=-1: features are already dense, no embedding lookup.
        word_features = wrap_encoder_forFeature(
            src_vocab_size=-1,
            max_length=t,
            n_layer=self.num_encoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True,
            enc_inputs=enc_inputs, )
        # Global value clipping for all gradients (module-level constant).
        fluid.clip.set_gradient_clip(
            fluid.clip.GradientClipByValue(gradient_clip))
        #===== Parallel Visual Attention Module =====
        b, t, c = word_features.shape
        word_features = fluid.layers.fc(word_features, c, num_flatten_dims=2)
        # Broadcast features against every output position, and position
        # embeddings against every time step, to score all pairs at once.
        word_features_ = fluid.layers.reshape(word_features, [-1, 1, t, c])
        word_features_ = fluid.layers.expand(word_features_,
                                             [1, self.max_length, 1, 1])
        word_pos_feature = fluid.layers.embedding(gsrm_word_pos,
                                                  [self.max_length, c])
        word_pos_ = fluid.layers.reshape(word_pos_feature,
                                         [-1, self.max_length, 1, c])
        word_pos_ = fluid.layers.expand(word_pos_, [1, 1, t, 1])
        temp = fluid.layers.elementwise_add(
            word_features_, word_pos_, act='tanh')
        attention_weight = fluid.layers.fc(input=temp,
                                           size=1,
                                           num_flatten_dims=3,
                                           bias_attr=False)
        attention_weight = fluid.layers.reshape(
            x=attention_weight, shape=[-1, self.max_length, t])
        attention_weight = fluid.layers.softmax(input=attention_weight, axis=-1)
        pvam_features = fluid.layers.matmul(attention_weight,
                                            word_features) #[b, max_length, c]
        return pvam_features

    def gsrm(self, pvam_features, others):
        """Global Semantic Reasoning Module.

        Decodes provisional characters from pvam_features, then reasons over
        them with a forward and a backward transformer ("bi-transformers").
        Returns (gsrm_features, word_out, gsrm_out).
        """
        #===== GSRM Visual-to-semantic embedding block =====
        b, t, c = pvam_features.shape
        word_out = fluid.layers.fc(
            input=fluid.layers.reshape(pvam_features, [-1, c]),
            size=self.char_num,
            act="softmax")
        #word_out.stop_gradient = True
        # Hard argmax decoding; gradients do not flow through the ids.
        word_ids = fluid.layers.argmax(word_out, axis=1)
        word_ids.stop_gradient = True
        word_ids = fluid.layers.reshape(x=word_ids, shape=[-1, t, 1])
        #===== GSRM Semantic reasoning block =====
        """
        This module is achieved through bi-transformers,
        ngram_feature1 is the froward one, ngram_fetaure2 is the backward one
        """
        pad_idx = self.char_num
        gsrm_word_pos = others["gsrm_word_pos"]
        gsrm_slf_attn_bias1 = others["gsrm_slf_attn_bias1"]
        gsrm_slf_attn_bias2 = others["gsrm_slf_attn_bias2"]

        def prepare_bi(word_ids):
            """
            prepare bi for gsrm
            word1 for forward; word2 for backward
            """
            # Shift right by one (pad with pad_idx) so each position only
            # sees preceding characters in the forward pass.
            word1 = fluid.layers.cast(word_ids, "float32")
            word1 = fluid.layers.pad(word1, [0, 0, 1, 0, 0, 0],
                                     pad_value=1.0 * pad_idx)
            word1 = fluid.layers.cast(word1, "int64")
            word1 = word1[:, :-1, :]
            word2 = word_ids
            return word1, word2

        word1, word2 = prepare_bi(word_ids)
        word1.stop_gradient = True
        word2.stop_gradient = True
        enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1]
        enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2]
        gsrm_feature1 = wrap_encoder(
            src_vocab_size=self.char_num + 1,
            max_length=self.max_length,
            n_layer=self.num_decoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True,
            enc_inputs=enc_inputs_1, )
        gsrm_feature2 = wrap_encoder(
            src_vocab_size=self.char_num + 1,
            max_length=self.max_length,
            n_layer=self.num_decoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True,
            enc_inputs=enc_inputs_2, )
        # Shift the backward features left by one so both directions align
        # on the same target position before fusing.
        gsrm_feature2 = fluid.layers.pad(gsrm_feature2, [0, 0, 0, 1, 0, 0],
                                         pad_value=0.)
        gsrm_feature2 = gsrm_feature2[:, 1:, ]
        gsrm_features = gsrm_feature1 + gsrm_feature2
        b, t, c = gsrm_features.shape
        # Tie the output projection to the shared word embedding table.
        gsrm_out = fluid.layers.matmul(
            x=gsrm_features,
            y=fluid.default_main_program().global_block().var(
                "src_word_emb_table"),
            transpose_y=True)
        b, t, c = gsrm_out.shape
        gsrm_out = fluid.layers.softmax(input=fluid.layers.reshape(gsrm_out,
                                                                   [-1, c]))
        return gsrm_features, word_out, gsrm_out

    def vsfd(self, pvam_features, gsrm_features):
        """Visual-Semantic Fusion Decoder: learned sigmoid gate mixing the
        visual (PVAM) and semantic (GSRM) features, then a softmax FC."""
        #===== Visual-Semantic Fusion Decoder Module =====
        b, t, c1 = pvam_features.shape
        b, t, c2 = gsrm_features.shape
        combine_features_ = fluid.layers.concat(
            [pvam_features, gsrm_features], axis=2)
        img_comb_features_ = fluid.layers.reshape(
            x=combine_features_, shape=[-1, c1 + c2])
        img_comb_features_map = fluid.layers.fc(input=img_comb_features_,
                                                size=c1,
                                                act="sigmoid")
        img_comb_features_map = fluid.layers.reshape(
            x=img_comb_features_map, shape=[-1, t, c1])
        combine_features = img_comb_features_map * pvam_features + (
            1.0 - img_comb_features_map) * gsrm_features
        img_comb_features = fluid.layers.reshape(
            x=combine_features, shape=[-1, c1])
        fc_out = fluid.layers.fc(input=img_comb_features,
                                 size=self.char_num,
                                 act="softmax")
        return fc_out

    def __call__(self, inputs, others, mode=None):
        """Run PVAM -> GSRM -> VSFD; return all intermediate predictions
        (the training loss combines 'predict', 'word_out' and 'gsrm_out')."""
        pvam_features = self.pvam(inputs, others)
        gsrm_features, word_out, gsrm_out = self.gsrm(pvam_features, others)
        final_out = self.vsfd(pvam_features, gsrm_features)
        _, decoded_out = fluid.layers.topk(input=final_out, k=1)
        predicts = {
            'predict': final_out,
            'decoded_out': decoded_out,
            'word_out': word_out,
            'gsrm_out': gsrm_out
        }
        return predicts
from functools import partial
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
# Names of the feed fields the transformer encoder consumes:
# word ids, position ids, and the self-attention bias (mask).
encoder_data_input_fields = (
    "src_word",
    "src_pos",
    "src_slf_attn_bias", )
def wrap_layer_with_block(layer, block_idx):
    """Return a wrapper that builds ``layer``'s ops and vars inside the
    program block at ``block_idx`` rather than the current block.

    This makes it easy to register cache variables in an outer (parent)
    block while constructing ops inside a while loop.
    """

    class _BlockSwitch(object):
        """Context manager that temporarily changes the program's current
        block index and restores it on exit."""

        def __init__(self, target_idx=None, main_program=None):
            if main_program is None:
                main_program = fluid.default_main_program()
            self.main_program = main_program
            self.old_block_idx = self.main_program.current_block().idx
            self.new_block_idx = target_idx

        def __enter__(self):
            self.main_program.current_block_idx = self.new_block_idx

        def __exit__(self, exc_type, exc_val, exc_tb):
            # Always restore the previous block; propagate any exception.
            self.main_program.current_block_idx = self.old_block_idx
            return exc_type is None

    def layer_wrapper(*args, **kwargs):
        with _BlockSwitch(block_idx):
            return layer(*args, **kwargs)

    return layer_wrapper
def multi_head_attention(queries,
                         keys,
                         values,
                         attn_bias,
                         d_key,
                         d_value,
                         d_model,
                         n_head=1,
                         dropout_rate=0.,
                         cache=None,
                         gather_idx=None,
                         static_kv=False):
    """
    Multi-Head Attention. Note that attn_bias is added to the logits before
    the softmax activation to mask selected positions so that they are not
    considered in the attention weights.

    ``cache``/``gather_idx``/``static_kv`` support incremental decoding with
    beam search: cached K/V live in the parent (global) block and are
    gathered/updated per step.
    """
    # Self-attention when keys/values are omitted.
    keys = queries if keys is None else keys
    values = keys if values is None else values
    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs: quries, keys and values should all be 3-D tensors.")

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
        q = layers.fc(input=queries,
                      size=d_key * n_head,
                      bias_attr=False,
                      num_flatten_dims=2)
        # For encoder-decoder attention in inference, insert the ops and vars
        # into global block to use as cache among beam search.
        fc_layer = wrap_layer_with_block(
            layers.fc, fluid.default_main_program().current_block()
            .parent_idx) if cache is not None and static_kv else layers.fc
        k = fc_layer(
            input=keys,
            size=d_key * n_head,
            bias_attr=False,
            num_flatten_dims=2)
        v = fc_layer(
            input=values,
            size=d_value * n_head,
            bias_attr=False,
            num_flatten_dims=2)
        return q, k, v

    def __split_heads_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Reshape input tensors at the last dimension to split multi-heads
        and then transpose. Specifically, transform the input tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] to the output tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        reshaped_q = layers.reshape(
            x=queries, shape=[0, 0, n_head, d_key], inplace=True)
        # permuate the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        q = layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
        # For encoder-decoder attention in inference, insert the ops and vars
        # into global block to use as cache among beam search.
        reshape_layer = wrap_layer_with_block(
            layers.reshape,
            fluid.default_main_program().current_block()
            .parent_idx) if cache is not None and static_kv else layers.reshape
        transpose_layer = wrap_layer_with_block(
            layers.transpose,
            fluid.default_main_program().current_block().
            parent_idx) if cache is not None and static_kv else layers.transpose
        reshaped_k = reshape_layer(
            x=keys, shape=[0, 0, n_head, d_key], inplace=True)
        k = transpose_layer(x=reshaped_k, perm=[0, 2, 1, 3])
        reshaped_v = reshape_layer(
            x=values, shape=[0, 0, n_head, d_value], inplace=True)
        v = transpose_layer(x=reshaped_v, perm=[0, 2, 1, 3])
        if cache is not None:  # only for faster inference
            if static_kv:  # For encoder-decoder attention in inference
                cache_k, cache_v = cache["static_k"], cache["static_v"]
                # To init the static_k and static_v in cache.
                # Maybe we can use condition_op(if_else) to do these at the first
                # step in while loop to replace these, however it might be less
                # efficient.
                static_cache_init = wrap_layer_with_block(
                    layers.assign,
                    fluid.default_main_program().current_block().parent_idx)
                static_cache_init(k, cache_k)
                static_cache_init(v, cache_v)
            else:  # For decoder self-attention in inference
                cache_k, cache_v = cache["k"], cache["v"]
            # gather cell states corresponding to selected parent
            select_k = layers.gather(cache_k, index=gather_idx)
            select_v = layers.gather(cache_v, index=gather_idx)
            if not static_kv:
                # For self attention in inference, use cache and concat time steps.
                select_k = layers.concat([select_k, k], axis=2)
                select_v = layers.concat([select_v, v], axis=2)
            # update cell states(caches) cached in global block
            layers.assign(select_k, cache_k)
            layers.assign(select_v, cache_v)
            return q, select_k, select_v
        return q, k, v

    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of inpunt tensor x
        so that it becomes one dimension, which is reverse to __split_heads.
        """
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")
        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
            inplace=True)

    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        # print(q)
        # print(k)
        product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_key**-0.5)
        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(
                weights, dropout_prob=dropout_rate, seed=None, is_test=False)
        out = layers.matmul(weights, v)
        return out

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
    q, k, v = __split_heads_qkv(q, k, v, n_head, d_key, d_value)
    # NOTE(review): d_model is passed into the d_key slot here, so the logits
    # are scaled by d_model**-0.5 rather than d_key**-0.5. This matches the
    # upstream transformer reference code — confirm before "fixing".
    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model,
                                                  dropout_rate)
    out = __combine_heads(ctx_multiheads)
    # Project back to the model size.
    proj_out = layers.fc(input=out,
                         size=d_model,
                         bias_attr=False,
                         num_flatten_dims=2)
    return proj_out
def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate):
    """
    Position-wise feed-forward network: FC(relu) -> optional dropout -> FC,
    applied identically and independently at every sequence position.
    """
    inner = layers.fc(input=x,
                      size=d_inner_hid,
                      num_flatten_dims=2,
                      act="relu")
    if dropout_rate:
        inner = layers.dropout(
            inner, dropout_prob=dropout_rate, seed=None, is_test=False)
    return layers.fc(input=inner, size=d_hid, num_flatten_dims=2)
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.):
    """
    Apply residual add ('a'), layer normalization ('n') and/or dropout ('d')
    to ``out``, in the order given by ``process_cmd``. Used both before and
    after the attention and feed-forward sub-layers.

    Args:
        prev_out: residual input; ``None`` when no residual is wanted.
        out: tensor to process.
        process_cmd (str): any combination of the characters 'a', 'n', 'd'.
        dropout_rate (float): dropout probability; 0 disables dropout.
    """
    for cmd in process_cmd:
        if cmd == "a":  # add residual connection
            # BUGFIX: compare against None explicitly instead of relying on
            # the truthiness of a graph Variable (which defaults to True and
            # would also skip the residual for falsy numeric inputs).
            out = out + prev_out if prev_out is not None else out
        elif cmd == "n":  # add layer normalization
            out = layers.layer_norm(
                out,
                begin_norm_axis=len(out.shape) - 1,
                param_attr=fluid.initializer.Constant(1.),
                bias_attr=fluid.initializer.Constant(0.))
        elif cmd == "d":  # add dropout
            if dropout_rate:
                out = layers.dropout(
                    out, dropout_prob=dropout_rate, seed=None, is_test=False)
    return out
# Pre-processing runs the cmd chain with no residual input; post-processing
# reuses the same function with the residual tensor supplied.
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
def prepare_encoder(
        src_word,  # [b,t,c]
        src_pos,
        src_vocab_size,
        src_emb_dim,
        src_max_len,
        dropout_rate=0.,
        bos_idx=0,
        word_emb_param_name=None,
        pos_enc_param_name=None):
    """Scale already-dense source features and add frozen position
    encodings; optionally apply dropout.

    Unlike prepare_decoder, ``src_word`` is used directly as the embedding
    (no vocabulary lookup), so src_vocab_size / bos_idx /
    word_emb_param_name are unused here.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    """
    scaled_emb = layers.scale(
        x=layers.cast(src_word, 'float32'), scale=src_emb_dim**0.5)
    pos_enc = layers.embedding(
        src_pos,
        size=[src_max_len, src_emb_dim],
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_name, trainable=False))
    pos_enc.stop_gradient = True  # the position table is never trained
    enc_input = scaled_emb + pos_enc
    if not dropout_rate:
        return enc_input
    return layers.dropout(
        enc_input, dropout_prob=dropout_rate, seed=None, is_test=False)
def prepare_decoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_emb_dim,
                    src_max_len,
                    dropout_rate=0.,
                    bos_idx=0,
                    word_emb_param_name=None,
                    pos_enc_param_name=None):
    """Look up word embeddings, scale them by sqrt(d_model), and add frozen
    position encodings; optionally apply dropout.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    """
    word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
        padding_idx=bos_idx,  # set embedding of bos to 0
        param_attr=fluid.ParamAttr(
            name=word_emb_param_name,
            initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
    word_emb = layers.scale(x=word_emb, scale=src_emb_dim**0.5)
    pos_enc = layers.embedding(
        src_pos,
        size=[src_max_len, src_emb_dim],
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_name, trainable=False))
    pos_enc.stop_gradient = True  # the position table is never trained
    dec_input = word_emb + pos_enc
    if not dropout_rate:
        return dec_input
    return layers.dropout(
        dec_input, dropout_prob=dropout_rate, seed=None, is_test=False)
def encoder_layer(enc_input,
                  attn_bias,
                  n_head,
                  d_key,
                  d_value,
                  d_model,
                  d_inner_hid,
                  prepostprocess_dropout,
                  attention_dropout,
                  relu_dropout,
                  preprocess_cmd="n",
                  postprocess_cmd="da"):
    """One transformer encoder layer: pre-processed multi-head self-attention
    followed by a position-wise feed-forward network, each wrapped with
    post-processing (residual / layer-norm / dropout)."""
    normed_input = pre_process_layer(enc_input, preprocess_cmd,
                                     prepostprocess_dropout)
    attn_output = multi_head_attention(normed_input, None, None, attn_bias,
                                       d_key, d_value, d_model, n_head,
                                       attention_dropout)
    attn_output = post_process_layer(enc_input, attn_output, postprocess_cmd,
                                     prepostprocess_dropout)
    normed_attn = pre_process_layer(attn_output, preprocess_cmd,
                                    prepostprocess_dropout)
    ffd_output = positionwise_feed_forward(normed_attn, d_inner_hid, d_model,
                                           relu_dropout)
    return post_process_layer(attn_output, ffd_output, postprocess_cmd,
                              prepostprocess_dropout)
def encoder(enc_input,
            attn_bias,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            preprocess_cmd="n",
            postprocess_cmd="da"):
    """
    Compose a deep encoder from ``n_layer`` identical layers built by
    encoder_layer, then apply a final pre_process_layer (normalization).

    BUGFIX: the previous version referenced an uninitialized ``enc_output``
    (NameError) when n_layer == 0; threading the output through enc_input
    handles that degenerate case cleanly.
    """
    for _ in range(n_layer):
        enc_input = encoder_layer(
            enc_input,
            attn_bias,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            preprocess_cmd,
            postprocess_cmd, )
    return pre_process_layer(enc_input, preprocess_cmd, prepostprocess_dropout)
def wrap_encoder_forFeature(src_vocab_size,
                            max_length,
                            n_layer,
                            n_head,
                            d_key,
                            d_value,
                            d_model,
                            d_inner_hid,
                            prepostprocess_dropout,
                            attention_dropout,
                            relu_dropout,
                            preprocess_cmd,
                            postprocess_cmd,
                            weight_sharing,
                            enc_inputs=None,
                            bos_idx=0):
    """Assemble input preparation plus the encoder stack for conv-feature
    inputs.

    ``enc_inputs`` unpacks to (conv_features, src_pos, src_slf_attn_bias);
    conv_features are used directly as dense embeddings (prepare_encoder —
    no vocabulary lookup).
    """
    conv_features, src_pos, src_slf_attn_bias = enc_inputs  #
    b, t, c = conv_features.shape
    prepared = prepare_encoder(
        conv_features,
        src_pos,
        src_vocab_size,
        d_model,
        max_length,
        prepostprocess_dropout,
        bos_idx=bos_idx,
        word_emb_param_name="src_word_emb_table")
    return encoder(
        prepared,
        src_slf_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd, )
def wrap_encoder(src_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd,
                 postprocess_cmd,
                 weight_sharing,
                 enc_inputs=None,
                 bos_idx=0):
    """Assemble word embedding plus the encoder stack for word-id inputs.

    ``enc_inputs`` unpacks to (src_word, src_pos, src_slf_attn_bias); word
    ids are embedded via prepare_decoder using the shared
    "src_word_emb_table" parameter.
    """
    src_word, src_pos, src_slf_attn_bias = enc_inputs  #
    prepared = prepare_decoder(
        src_word,
        src_pos,
        src_vocab_size,
        d_model,
        max_length,
        prepostprocess_dropout,
        bos_idx=bos_idx,
        word_emb_param_name="src_word_emb_table")
    return encoder(
        prepared,
        src_slf_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        prepostprocess_dropout,
        attention_dropout,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd, )
......@@ -11,3 +11,22 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
def build_loss(config):
    """Instantiate a loss module from a config dict.

    Args:
        config (dict): must contain 'name' (one of the supported class
            names); the remaining keys are passed to the loss constructor.
            The input dict is deep-copied, so the caller's config is not
            mutated by the pop.

    Returns:
        The constructed loss instance.
    """
    # det loss
    from .det_db_loss import DBLoss
    # rec loss
    from .rec_ctc_loss import CTCLoss
    support_dict = ['DBLoss', 'CTCLoss']

    config = copy.deepcopy(config)
    module_name = config.pop('name')
    # BUGFIX: the message was wrapped in Exception(...), which made the
    # AssertionError display an Exception repr; use a plain string.
    assert module_name in support_dict, 'loss only support {}'.format(
        support_dict)
    # eval only resolves names from the whitelist above, never user input.
    module_class = eval(module_name)(**config)
    return module_class
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
......@@ -18,99 +18,189 @@ from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
# NOTE(review): legacy static-graph BalanceLoss. The body below is truncated
# (the `if not balance_loss:` branch has no suite — the remainder was lost in
# a diff/merge); restore the missing tail or delete this function before use.
def BalanceLoss(pred,
                gt,
                mask,
                balance_loss=True,
                main_loss_type="DiceLoss",
                negative_ratio=3,
                return_origin=False,
                eps=1e-6):
    """
    The BalanceLoss for Differentiable Binarization text detection
    args:
        pred (variable): predicted feature maps.
        gt (variable): ground truth feature maps.
        mask (variable): masked maps.
        balance_loss (bool): whether balance loss or not, default is True
        main_loss_type (str): can only be one of ['CrossEntropy','DiceLoss',
            'Euclidean','BCELoss', 'MaskL1Loss'], default is 'DiceLoss'.
        negative_ratio (int|float): float, default is 3.
        return_origin (bool): whether return unbalanced loss or not, default is False.
        eps (float): default is 1e-6.
    return: (variable) balanced loss
    """
    positive = gt * mask
    negative = (1 - gt) * mask
    # Cap the number of negatives at negative_ratio * positives (OHEM).
    positive_count = fluid.layers.reduce_sum(positive)
    positive_count_int = fluid.layers.cast(positive_count, dtype=np.int32)
    negative_count = min(
        fluid.layers.reduce_sum(negative), positive_count * negative_ratio)
    negative_count_int = fluid.layers.cast(negative_count, dtype=np.int32)
    if main_loss_type == "CrossEntropy":
        loss = fluid.layers.cross_entropy(input=pred, label=gt, soft_label=True)
        loss = fluid.layers.reduce_mean(loss)
    elif main_loss_type == "Euclidean":
        loss = fluid.layers.square(pred - gt)
        loss = fluid.layers.reduce_mean(loss)
    elif main_loss_type == "DiceLoss":
        loss = DiceLoss(pred, gt, mask)
    elif main_loss_type == "BCELoss":
        loss = fluid.layers.sigmoid_cross_entropy_with_logits(pred, label=gt)
    elif main_loss_type == "MaskL1Loss":
        loss = MaskL1Loss(pred, gt, mask)
    else:
        loss_type = [
            'CrossEntropy', 'DiceLoss', 'Euclidean', 'BCELoss', 'MaskL1Loss'
        ]
        raise Exception("main_loss_type in BalanceLoss() can only be one of {}".
                        format(loss_type))
    if not balance_loss:  # NOTE(review): truncated here in the source
import paddle
from paddle import nn
import paddle.nn.functional as F
class BalanceLoss(nn.Layer):
    def __init__(self,
                 balance_loss=True,
                 main_loss_type='DiceLoss',
                 negative_ratio=3,
                 return_origin=False,
                 eps=1e-6,
                 **kwargs):
        """
        The BalanceLoss for Differentiable Binarization text detection
        args:
            balance_loss (bool): whether balance loss or not, default is True
            main_loss_type (str): can only be one of ['CrossEntropy','DiceLoss',
                'Euclidean','BCELoss', 'MaskL1Loss'], default is 'DiceLoss'.
            negative_ratio (int|float): float, default is 3.
            return_origin (bool): whether return unbalanced loss or not, default is False.
            eps (float): default is 1e-6.
        """
        super(BalanceLoss, self).__init__()
        self.balance_loss = balance_loss
        # BUGFIX: main_loss_type was assigned twice; keep one assignment.
        self.main_loss_type = main_loss_type
        self.negative_ratio = negative_ratio
        self.return_origin = return_origin
        self.eps = eps
        if self.main_loss_type == "CrossEntropy":
            self.loss = nn.CrossEntropyLoss()
        elif self.main_loss_type == "Euclidean":
            self.loss = nn.MSELoss()
        elif self.main_loss_type == "DiceLoss":
            self.loss = DiceLoss(self.eps)
        elif self.main_loss_type == "BCELoss":
            self.loss = BCELoss(reduction='none')
        elif self.main_loss_type == "MaskL1Loss":
            self.loss = MaskL1Loss(self.eps)
        else:
            loss_type = [
                'CrossEntropy', 'DiceLoss', 'Euclidean', 'BCELoss', 'MaskL1Loss'
            ]
            raise Exception(
                "main_loss_type in BalanceLoss() can only be one of {}".format(
                    loss_type))

    def forward(self, pred, gt, mask=None):
        """
        Balance the main loss with online hard example mining (OHEM):
        keep all positive-pixel losses but only the hardest
        ``negative_ratio * positive_count`` negative-pixel losses.
        args:
            pred (variable): predicted feature maps.
            gt (variable): ground truth feature maps.
            mask (variable): masked maps.
        return: (variable) balanced loss
        """
        positive = gt * mask
        negative = (1 - gt) * mask
        positive_count = int(positive.sum())
        negative_count = int(
            min(negative.sum(), positive_count * self.negative_ratio))
        loss = self.loss(pred, gt, mask=mask)
        if not self.balance_loss:
            return loss

        positive_loss = positive * loss
        negative_loss = negative * loss
        negative_loss = paddle.reshape(negative_loss, shape=[-1])
        if negative_count > 0:
            # Keep only the largest (hardest) negative losses.
            sort_loss = negative_loss.sort(descending=True)
            negative_loss = sort_loss[:negative_count]
            balance_loss = (positive_loss.sum() + negative_loss.sum()) / (
                positive_count + negative_count + self.eps)
        else:
            balance_loss = positive_loss.sum() / (positive_count + self.eps)
        if self.return_origin:
            return balance_loss, loss
        return balance_loss
class DiceLoss(nn.Layer):
    """Dice loss over a masked region: 1 - 2*|P∩G| / (|P| + |G| + eps)."""

    def __init__(self, eps=1e-6):
        super(DiceLoss, self).__init__()
        self.eps = eps

    def forward(self, pred, gt, mask, weights=None):
        """
        Compute the dice loss between pred and gt, restricted to mask.
        An optional per-pixel weights map is folded into the mask.
        """
        assert pred.shape == gt.shape
        assert pred.shape == mask.shape
        if weights is not None:
            assert weights.shape == mask.shape
            mask = weights * mask
        overlap = paddle.sum(pred * gt * mask)
        total = paddle.sum(pred * mask) + paddle.sum(gt * mask) + self.eps
        loss = 1 - 2.0 * overlap / total
        assert loss <= 1
        return loss
positive_loss = positive * loss
negative_loss = negative * loss
negative_loss = fluid.layers.reshape(negative_loss, shape=[-1])
negative_loss, _ = fluid.layers.topk(negative_loss, k=negative_count_int)
balance_loss = (fluid.layers.reduce_sum(positive_loss) +
fluid.layers.reduce_sum(negative_loss)) / (
positive_count + negative_count + eps)
if return_origin:
return balance_loss, loss
return balance_loss
def DiceLoss(pred, gt, mask, weights=None, eps=1e-6):
    """
    Dice loss between pred and gt inside the masked region.
    An optional per-pixel weights map is folded into the mask.
    """
    assert pred.shape == gt.shape
    assert pred.shape == mask.shape
    if weights is not None:
        assert weights.shape == mask.shape
        mask = weights * mask
    overlap = fluid.layers.reduce_sum(pred * gt * mask)
    total = fluid.layers.reduce_sum(pred * mask) + fluid.layers.reduce_sum(
        gt * mask) + eps
    loss = 1 - 2.0 * overlap / total
    assert loss <= 1
    return loss
def MaskL1Loss(pred, gt, mask, eps=1e-6):
    """
    Masked L1 loss: sum of |pred - gt| inside mask, normalized by the
    mask area (plus eps to avoid division by zero).
    """
    masked_abs_diff = fluid.layers.abs(pred - gt) * mask
    loss = fluid.layers.reduce_sum(masked_abs_diff) / (
        fluid.layers.reduce_sum(mask) + eps)
    loss = fluid.layers.reduce_mean(loss)
    return loss
class MaskL1Loss(nn.Layer):
    """Masked L1 loss (used for the threshold-map branch of DB)."""

    def __init__(self, eps=1e-6):
        super(MaskL1Loss, self).__init__()
        self.eps = eps

    def forward(self, pred, gt, mask):
        """Average |pred - gt| over the masked region."""
        abs_diff = paddle.abs(pred - gt)
        loss = (abs_diff * mask).sum() / (mask.sum() + self.eps)
        loss = paddle.mean(loss)
        return loss
class BCELoss(nn.Layer):
    # Thin wrapper around F.binary_cross_entropy with a fixed reduction mode.
    def __init__(self, reduction='mean'):
        super(BCELoss, self).__init__()
        self.reduction = reduction
    def forward(self, input, label, mask=None, weight=None, name=None):
        # NOTE(review): mask, weight and name are accepted for call-site
        # compatibility with the other losses but are currently ignored.
        loss = F.binary_cross_entropy(input, label, reduction=self.reduction)
        return loss
def ohem_single(score, gt_text, training_mask, ohem_ratio):
    """
    Online hard example mining for one sample.

    Keeps all positive pixels plus at most ohem_ratio * pos_num of the
    highest-scoring negatives; the selection is returned as a float32
    mask of shape (1, H, W). If there are no positives (or no negatives
    to select), the training mask itself is returned.
    """

    def _as_batched_mask(m):
        # (H, W) -> (1, H, W), float32
        return m.reshape(1, m.shape[0], m.shape[1]).astype('float32')

    # positives are gt pixels that are not suppressed by the training mask
    pos_num = int(np.sum(gt_text > 0.5)) - int(
        np.sum((gt_text > 0.5) & (training_mask <= 0.5)))
    if pos_num == 0:
        return _as_batched_mask(training_mask)

    neg_num = int(np.sum(gt_text <= 0.5))
    neg_num = int(min(pos_num * ohem_ratio, neg_num))
    if neg_num == 0:
        return _as_batched_mask(training_mask)

    # sort negative scores from high to low and take the cut-off score
    neg_score = score[gt_text <= 0.5]
    neg_score_sorted = np.sort(-neg_score)
    threshold = -neg_score_sorted[neg_num - 1]
    # select high-scoring negatives and all positives inside the mask
    selected_mask = ((score >= threshold) |
                     (gt_text > 0.5)) & (training_mask > 0.5)
    return _as_batched_mask(selected_mask)
def ohem_batch(scores, gt_texts, training_masks, ohem_ratio):
    """
    Apply ohem_single to every sample of the batch and stack the
    resulting (1, H, W) masks into one paddle tensor of shape (N, H, W).
    """
    scores = scores.numpy()
    gt_texts = gt_texts.numpy()
    training_masks = training_masks.numpy()

    selected_masks = [
        ohem_single(scores[i, :, :], gt_texts[i, :, :],
                    training_masks[i, :, :], ohem_ratio)
        for i in range(scores.shape[0])
    ]
    selected_masks = np.concatenate(selected_masks, 0)
    return paddle.to_variable(selected_masks)
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import nn
from .det_basic_loss import BalanceLoss, MaskL1Loss, DiceLoss
class DBLoss(nn.Layer):
    """
    Differentiable Binarization (DB) Loss Function.

    Combines three terms computed from the three predicted channels:
      - a balanced loss (OHEM) on the shrink map, weighted by alpha,
      - a masked L1 loss on the threshold map, weighted by beta,
      - a dice loss on the binary map.

    args:
        balance_loss (bool): whether to balance the shrink-map loss via OHEM.
        main_loss_type (str): loss used for the shrink map inside BalanceLoss.
        alpha (int|float): weight of the shrink-map loss, default 5.
        beta (int|float): weight of the threshold-map loss, default 10.
        ohem_ratio (int|float): negative/positive ratio for OHEM, default 3.
        eps (float): numerical-stability constant for the sub-losses.
    """

    def __init__(self,
                 balance_loss=True,
                 main_loss_type='DiceLoss',
                 alpha=5,
                 beta=10,
                 ohem_ratio=3,
                 eps=1e-6,
                 **kwargs):
        super(DBLoss, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.dice_loss = DiceLoss(eps=eps)
        self.l1_loss = MaskL1Loss(eps=eps)
        self.bce_loss = BalanceLoss(
            balance_loss=balance_loss,
            main_loss_type=main_loss_type,
            negative_ratio=ohem_ratio)

    def forward(self, predicts, labels):
        """
        Compute the combined DB loss.

        args:
            predicts: predicted maps, channel 0 = shrink map,
                channel 1 = threshold map, channel 2 = binary map.
            labels: sequence whose elements 1..4 are the ground-truth
                threshold map/mask and shrink map/mask, in that order.
        return: dict with 'loss' (total) and the three weighted terms.
        """
        label_threshold_map, label_threshold_mask, label_shrink_map, label_shrink_mask = labels[
            1:]
        shrink_maps = predicts[:, 0, :, :]
        threshold_maps = predicts[:, 1, :, :]
        binary_maps = predicts[:, 2, :, :]

        loss_shrink_maps = self.bce_loss(shrink_maps, label_shrink_map,
                                         label_shrink_mask)
        loss_threshold_maps = self.l1_loss(threshold_maps, label_threshold_map,
                                           label_threshold_mask)
        loss_binary_maps = self.dice_loss(binary_maps, label_shrink_map,
                                          label_shrink_mask)
        loss_shrink_maps = self.alpha * loss_shrink_maps
        loss_threshold_maps = self.beta * loss_threshold_maps
        loss_all = loss_shrink_maps + loss_threshold_maps \
                   + loss_binary_maps
        losses = {'loss': loss_all, \
                  "loss_shrink_maps": loss_shrink_maps, \
                  "loss_threshold_maps": loss_threshold_maps, \
                  "loss_binary_maps": loss_binary_maps}
        return losses
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
class EASTLoss(object):
    """
    EAST Loss function: dice loss on the score map plus a weighted
    smooth-L1-style loss on the 8-channel geometry map.
    """
    def __init__(self, params=None):
        # params is accepted for config-driven construction but unused here.
        super(EASTLoss, self).__init__()
    def __call__(self, predicts, labels):
        # predicts: dict with predicted 'f_score' and 'f_geo' maps;
        # labels: dict with ground-truth 'score', 'geo' and valid 'mask'.
        f_score = predicts['f_score']
        f_geo = predicts['f_geo']
        l_score = labels['score']
        l_geo = labels['geo']
        l_mask = labels['mask']
        # dice loss on the score map, restricted to the valid mask
        intersection = fluid.layers.reduce_sum(f_score * l_score * l_mask)
        union = fluid.layers.reduce_sum(f_score * l_mask)\
            + fluid.layers.reduce_sum(l_score * l_mask)
        dice_loss = 1 - 2 * intersection / (union + 1e-5)
        # smooth-L1-style loss on the geometry channels; the label has
        # channels + 1 channels, the last one being a per-pixel weight.
        channels = 8
        l_geo_split = fluid.layers.split(
            l_geo, num_or_sections=channels + 1, dim=1)
        f_geo_split = fluid.layers.split(f_geo, num_or_sections=channels, dim=1)
        smooth_l1 = 0
        for i in range(0, channels):
            geo_diff = l_geo_split[i] - f_geo_split[i]
            abs_geo_diff = fluid.layers.abs(geo_diff)
            # NOTE(review): the quadratic/linear switch compares |diff|
            # against l_score (a score map) rather than a 1.0 constant —
            # presumably intentional, confirm against the reference impl.
            smooth_l1_sign = fluid.layers.less_than(abs_geo_diff, l_score)
            smooth_l1_sign = fluid.layers.cast(smooth_l1_sign, dtype='float32')
            in_loss = abs_geo_diff * abs_geo_diff * smooth_l1_sign + \
                (abs_geo_diff - 0.5) * (1.0 - smooth_l1_sign)
            # weight by the extra label channel (split evenly over channels)
            # and restrict to text pixels
            out_loss = l_geo_split[-1] / channels * in_loss * l_score
            smooth_l1 += out_loss
        smooth_l1_loss = fluid.layers.reduce_mean(smooth_l1 * l_score)
        # down-weight the dice term relative to the geometry term
        dice_loss = dice_loss * 0.01
        total_loss = dice_loss + smooth_l1_loss
        losses = {'total_loss':total_loss, "dice_loss":dice_loss,\
                  "smooth_l1_loss":smooth_l1_loss}
        return losses
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
class SASTLoss(object):
    """
    SAST Loss function: dice loss on the text-center-line (TCL) score map
    plus weighted smooth-L1 losses on the border, TVO (vertex offset) and
    TCO (center offset) regression maps.
    """
    def __init__(self, params=None):
        # params is accepted for config-driven construction but unused here.
        super(SASTLoss, self).__init__()
    def __call__(self, predicts, labels):
        """
        tcl_pos: N x 128 x 3
        tcl_mask: N x 128 x 1
        tcl_label: N x X list or LoDTensor
        """
        f_score = predicts['f_score']
        f_border = predicts['f_border']
        f_tvo = predicts['f_tvo']
        f_tco = predicts['f_tco']
        l_score = labels['input_score']
        l_border = labels['input_border']
        l_mask = labels['input_mask']
        l_tvo = labels['input_tvo']
        l_tco = labels['input_tco']
        # score loss: dice loss on the TCL score map, inside the valid mask
        intersection = fluid.layers.reduce_sum(f_score * l_score * l_mask)
        union = fluid.layers.reduce_sum(f_score * l_mask) + fluid.layers.reduce_sum(l_score * l_mask)
        score_loss = 1.0 - 2 * intersection / (union + 1e-5)
        # border loss: smooth-L1 on the 4 border channels; the 5th label
        # channel is a per-pixel normalization weight
        l_border_split, l_border_norm = fluid.layers.split(l_border, num_or_sections=[4, 1], dim=1)
        f_border_split = f_border
        l_border_norm_split = fluid.layers.expand(x=l_border_norm, expand_times=[1, 4, 1, 1])
        l_border_score = fluid.layers.expand(x=l_score, expand_times=[1, 4, 1, 1])
        l_border_mask = fluid.layers.expand(x=l_mask, expand_times=[1, 4, 1, 1])
        border_diff = l_border_split - f_border_split
        abs_border_diff = fluid.layers.abs(border_diff)
        # sign selects the quadratic branch (|diff| < 1) of smooth-L1
        border_sign = abs_border_diff < 1.0
        border_sign = fluid.layers.cast(border_sign, dtype='float32')
        border_sign.stop_gradient = True
        border_in_loss = 0.5 * abs_border_diff * abs_border_diff * border_sign + \
                         (abs_border_diff - 0.5) * (1.0 - border_sign)
        border_out_loss = l_border_norm_split * border_in_loss
        border_loss = fluid.layers.reduce_sum(border_out_loss * l_border_score * l_border_mask) / \
                      (fluid.layers.reduce_sum(l_border_score * l_border_mask) + 1e-5)
        # tvo loss: same smooth-L1 scheme on the 8 vertex-offset channels,
        # with the 9th label channel as normalization weight
        l_tvo_split, l_tvo_norm = fluid.layers.split(l_tvo, num_or_sections=[8, 1], dim=1)
        f_tvo_split = f_tvo
        l_tvo_norm_split = fluid.layers.expand(x=l_tvo_norm, expand_times=[1, 8, 1, 1])
        l_tvo_score = fluid.layers.expand(x=l_score, expand_times=[1, 8, 1, 1])
        l_tvo_mask = fluid.layers.expand(x=l_mask, expand_times=[1, 8, 1, 1])
        #
        tvo_geo_diff = l_tvo_split - f_tvo_split
        abs_tvo_geo_diff = fluid.layers.abs(tvo_geo_diff)
        tvo_sign = abs_tvo_geo_diff < 1.0
        tvo_sign = fluid.layers.cast(tvo_sign, dtype='float32')
        tvo_sign.stop_gradient = True
        tvo_in_loss = 0.5 * abs_tvo_geo_diff * abs_tvo_geo_diff * tvo_sign + \
                      (abs_tvo_geo_diff - 0.5) * (1.0 - tvo_sign)
        tvo_out_loss = l_tvo_norm_split * tvo_in_loss
        tvo_loss = fluid.layers.reduce_sum(tvo_out_loss * l_tvo_score * l_tvo_mask) / \
                   (fluid.layers.reduce_sum(l_tvo_score * l_tvo_mask) + 1e-5)
        # tco loss: same smooth-L1 scheme on the 2 center-offset channels,
        # with the 3rd label channel as normalization weight
        l_tco_split, l_tco_norm = fluid.layers.split(l_tco, num_or_sections=[2, 1], dim=1)
        f_tco_split = f_tco
        l_tco_norm_split = fluid.layers.expand(x=l_tco_norm, expand_times=[1, 2, 1, 1])
        l_tco_score = fluid.layers.expand(x=l_score, expand_times=[1, 2, 1, 1])
        l_tco_mask = fluid.layers.expand(x=l_mask, expand_times=[1, 2, 1, 1])
        #
        tco_geo_diff = l_tco_split - f_tco_split
        abs_tco_geo_diff = fluid.layers.abs(tco_geo_diff)
        tco_sign = abs_tco_geo_diff < 1.0
        tco_sign = fluid.layers.cast(tco_sign, dtype='float32')
        tco_sign.stop_gradient = True
        tco_in_loss = 0.5 * abs_tco_geo_diff * abs_tco_geo_diff * tco_sign + \
                      (abs_tco_geo_diff - 0.5) * (1.0 - tco_sign)
        tco_out_loss = l_tco_norm_split * tco_in_loss
        tco_loss = fluid.layers.reduce_sum(tco_out_loss * l_tco_score * l_tco_mask) / \
                   (fluid.layers.reduce_sum(l_tco_score * l_tco_mask) + 1e-5)
        # total loss: fixed per-term weights
        tvo_lw, tco_lw = 1.5, 1.5
        score_lw, border_lw = 1.0, 1.0
        total_loss = score_loss * score_lw + border_loss * border_lw + \
                     tvo_loss * tvo_lw + tco_loss * tco_lw
        losses = {'total_loss':total_loss, "score_loss":score_loss,\
                  "border_loss":border_loss, 'tvo_loss':tvo_loss, 'tco_loss':tco_loss}
        return losses
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment