Merge remote-tracking branch 'origin/dygraph' into dygraph

e40fd431 · Leif · 6e0cbbe1 · 0da240d0 · e40fd431 · e40fd431
Commit e40fd431 authored Sep 23, 2021 by Leif
20 changed files
--- a/ppocr/data/imaug/__init__.py
+++ b/ppocr/data/imaug/__init__.py
@@ -19,11 +19,13 @@ from __future__ import unicode_literals
 from .iaa_augment import IaaAugment
 from .make_border_map import MakeBorderMap
 from .make_shrink_map import MakeShrinkMap
-from .random_crop_data import EastRandomCropData, PSERandomCrop
+from .random_crop_data import EastRandomCropData, RandomCropImgMask
+from .make_pse_gt import MakePseGt

 from .rec_img_aug import RecAug, RecResizeImg, ClsResizeImg, SRNRecResizeImg, NRTRRecResizeImg, SARRecResizeImg
 from .randaugment import RandAugment
 from .copy_paste import CopyPaste
+from .ColorJitter import ColorJitter
 from .operators import *
 from .label_ops import *


--- a/ppocr/data/imaug/label_ops.py
+++ b/ppocr/data/imaug/label_ops.py
@@ -174,21 +174,26 @@ class NRTRLabelEncode(BaseRecLabelEncode):
        super(NRTRLabelEncode,
              self).__init__(max_text_length, character_dict_path,
                             character_type, use_space_char)
+
    def __call__(self, data):
        text = data['label']
        text = self.encode(text)
        if text is None:
            return None
+        if len(text) >= self.max_text_len - 1:
+            return None
        data['length'] = np.array(len(text))
        text.insert(0, 2)
        text.append(3)
        text = text + [0] * (self.max_text_len - len(text))
        data['label'] = np.array(text)
        return data
+
    def add_special_char(self, dict_character):
-        dict_character = ['blank','<unk>','<s>','</s>'] + dict_character
+        dict_character = ['blank', '<unk>', '<s>', '</s>'] + dict_character
        return dict_character

+
 class CTCLabelEncode(BaseRecLabelEncode):
    """ Convert between text-label and text-index """

@@ -588,7 +593,7 @@ class SARLabelEncode(BaseRecLabelEncode):
        data['length'] = np.array(len(text))
        target = [self.start_idx] + text + [self.end_idx]
        padded_text = [self.padding_idx for _ in range(self.max_text_len)]
-        
+
        padded_text[:len(target)] = target
        data['label'] = np.array(padded_text)
        return data

--- a/ppocr/data/imaug/make_pse_gt.py
+++ b/ppocr/data/imaug/make_pse_gt.py
+# -*- coding:utf-8 -*- 
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import cv2
+import numpy as np
+import pyclipper
+from shapely.geometry import Polygon
+
+__all__ = ['MakePseGt']
+
+class MakePseGt(object):
+    r'''
+    Making binary mask from detection data with ICDAR format.
+    Typically following the process of class `MakeICDARData`.
+
+    Reads ``data['image']``, ``data['polys']`` and ``data['ignore_tags']``
+    and writes ``image``, ``polys``, ``gt_kernels``, ``gt_text`` and ``mask``
+    back into the same dict.
+    '''
+
+    def __init__(self, kernel_num=7, size=640, min_shrink_ratio=0.4, **kwargs):
+        # kernel_num: number of kernel maps generated per sample.
+        # size: minimum length enforced on the short edge of the image.
+        # min_shrink_ratio: lower end of the shrink-rate schedule used in
+        # __call__ (see the rate formula there).
+        self.kernel_num = kernel_num
+        self.min_shrink_ratio = min_shrink_ratio
+        self.size = size
+
+    def __call__(self, data):
+        """Generate the kernel maps and training mask for one sample.
+
+        Args:
+            data: dict holding 'image' (H x W x C array), 'polys' (array of
+                polygons, indexed as polys[i] -> points) and 'ignore_tags'
+                (mutable per-polygon flags).
+
+        Returns:
+            The same dict, updated with 'image', 'polys', 'gt_kernels',
+            'gt_text' (the first kernel map) and 'mask' (float32).
+        """
+        image = data['image']
+        text_polys = data['polys']
+        ignore_tags = data['ignore_tags']
+
+        h, w, _ = image.shape
+        short_edge = min(h, w)
+        if short_edge < self.size:
+            # keep short_size >= self.size
+            scale = self.size / short_edge
+            image = cv2.resize(image, dsize=None, fx=scale, fy=scale)
+            # Out-of-place multiply: `*=` would modify the caller's array and
+            # raises a casting error when the polygons have an integer dtype.
+            text_polys = text_polys * scale
+
+        gt_kernels = []
+        for i in range(1, self.kernel_num + 1):
+            # s1->sn, from big to small
+            rate = 1.0 - (1.0 - self.min_shrink_ratio) / (self.kernel_num - 1) * i
+            text_kernel, ignore_tags = self.generate_kernel(
+                image.shape[0:2], rate, text_polys, ignore_tags)
+            gt_kernels.append(text_kernel)
+
+        # Pixels inside ignored polygons are zeroed so they can be excluded
+        # downstream; everything else stays 1.
+        training_mask = np.ones(image.shape[0:2], dtype='uint8')
+        for i in range(text_polys.shape[0]):
+            if ignore_tags[i]:
+                cv2.fillPoly(training_mask,
+                             text_polys[i].astype(np.int32)[np.newaxis, :, :],
+                             0)
+
+        # generate_kernel paints each polygon with its index + 1; collapse
+        # those instance ids into a binary map.
+        gt_kernels = np.array(gt_kernels)
+        gt_kernels[gt_kernels > 0] = 1
+
+        data['image'] = image
+        data['polys'] = text_polys
+        data['gt_kernels'] = gt_kernels[0:]
+        data['gt_text'] = gt_kernels[0]
+        data['mask'] = training_mask.astype('float32')
+        return data
+
+    def generate_kernel(self, img_size, shrink_ratio, text_polys, ignore_tags=None):
+        """Rasterise every polygon, shrunk by ``shrink_ratio``, into one map.
+
+        Args:
+            img_size: (height, width) of the output map.
+            shrink_ratio: ratio fed into the offset-distance formula below.
+            text_polys: iterable of polygons, each a sequence of (x, y) points.
+            ignore_tags: optional mutable flags; entry ``i`` is set to True
+                when polygon ``i`` cannot be shrunk.
+
+        Returns:
+            (text_kernel, ignore_tags): a float32 map where polygon ``i`` is
+            filled with the value ``i + 1``, and the (possibly updated) flags.
+        """
+        h, w = img_size
+        text_kernel = np.zeros((h, w), dtype=np.float32)
+        for i, poly in enumerate(text_polys):
+            polygon = Polygon(poly)
+            distance = polygon.area * (1 - shrink_ratio * shrink_ratio) / (polygon.length + 1e-6)
+            subject = [tuple(pt) for pt in poly]
+            pco = pyclipper.PyclipperOffset()
+            pco.AddPath(subject, pyclipper.JT_ROUND,
+                        pyclipper.ET_CLOSEDPOLYGON)
+            # Keep the raw list of paths: wrapping it in np.array() builds a
+            # ragged array when several paths of different length come back,
+            # which raises on current NumPy releases.
+            shrinked = pco.Execute(-distance)
+
+            if len(shrinked) == 0 or len(shrinked[0]) == 0:
+                # Polygon vanished after shrinking: exclude it from training.
+                if ignore_tags is not None:
+                    ignore_tags[i] = True
+                continue
+            try:
+                # Only the first resulting path is rasterised.
+                shrinked = np.array(shrinked[0]).reshape(-1, 2)
+            except Exception:
+                # Best effort: an unusable path marks the polygon as ignored
+                # instead of aborting the whole sample.
+                if ignore_tags is not None:
+                    ignore_tags[i] = True
+                continue
+            cv2.fillPoly(text_kernel, [shrinked.astype(np.int32)], i + 1)
+        return text_kernel, ignore_tags
--- a/ppocr/data/imaug/random_crop_data.py
+++ b/ppocr/data/imaug/random_crop_data.py
@@ -164,47 +164,55 @@ class EastRandomCropData(object):
        return data


-class PSERandomCrop(object):
-    def __init__(self, size, **kwargs):
+class RandomCropImgMask(object):
+    def __init__(self, size, main_key, crop_keys, p=3 / 8, **kwargs):
        self.size = size
+        self.main_key = main_key
+        self.crop_keys = crop_keys
+        self.p = p

    def __call__(self, data):
-        imgs = data['imgs']
+        image = data['image']

-        h, w = imgs[0].shape[0:2]
+        h, w = image.shape[0:2]
        th, tw = self.size
        if w == tw and h == th:
-            return imgs
+            return data

-        # label中存在文本实例，并且按照概率进行裁剪，使用threshold_label_map控制
-        if np.max(imgs[2]) > 0 and random.random() > 3 / 8:
-            # 文本实例的左上角点
-            tl = np.min(np.where(imgs[2] > 0), axis=1) - self.size
+        mask = data[self.main_key]
+        if np.max(mask) > 0 and random.random() > self.p:
+            # make sure to crop the text region
+            tl = np.min(np.where(mask > 0), axis=1) - (th, tw)
            tl[tl < 0] = 0
-            # 文本实例的右下角点
-            br = np.max(np.where(imgs[2] > 0), axis=1) - self.size
+            br = np.max(np.where(mask > 0), axis=1) - (th, tw)
            br[br < 0] = 0
-            # 保证选到右下角点时，有足够的距离进行crop
+
            br[0] = min(br[0], h - th)
            br[1] = min(br[1], w - tw)

-            for _ in range(50000):
-                i = random.randint(tl[0], br[0])
-                j = random.randint(tl[1], br[1])
-                # 保证shrink_label_map有文本
-                if imgs[1][i:i + th, j:j + tw].sum() <= 0:
-                    continue
-                else:
-                    break
+            i = random.randint(tl[0], br[0]) if tl[0] < br[0] else 0
+            j = random.randint(tl[1], br[1]) if tl[1] < br[1] else 0
        else:
-            i = random.randint(0, h - th)
-            j = random.randint(0, w - tw)
+            i = random.randint(0, h - th) if h - th > 0 else 0
+            j = random.randint(0, w - tw) if w - tw > 0 else 0

        # return i, j, th, tw
-        for idx in range(len(imgs)):
-            if len(imgs[idx].shape) == 3:
-                imgs[idx] = imgs[idx][i:i + th, j:j + tw, :]
-            else:
-                imgs[idx] = imgs[idx][i:i + th, j:j + tw]
-        data['imgs'] = imgs
+        for k in data:
+            if k in self.crop_keys:
+                if len(data[k].shape) == 3:
+                    if np.argmin(data[k].shape) == 0:
+                        img = data[k][:, i:i + th, j:j + tw]
+                        if img.shape[1] != img.shape[2]:
+                            a = 1
+                    elif np.argmin(data[k].shape) == 2:
+                        img = data[k][i:i + th, j:j + tw, :]
+                        if img.shape[1] != img.shape[0]:
+                            a = 1
+                    else:
+                        img = data[k]
+                else:
+                    img = data[k][i:i + th, j:j + tw]
+                    if img.shape[0] != img.shape[1]:
+                        a = 1
+                data[k] = img
        return data
--- a/ppocr/data/imaug/rec_img_aug.py
+++ b/ppocr/data/imaug/rec_img_aug.py
@@ -44,12 +44,33 @@ class ClsResizeImg(object):


 class NRTRRecResizeImg(object):
-    def __init__(self, image_shape, resize_type, **kwargs):
+    def __init__(self, image_shape, resize_type, padding=False, **kwargs):
        self.image_shape = image_shape
        self.resize_type = resize_type
+        self.padding = padding

    def __call__(self, data):
        img = data['image']
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        image_shape = self.image_shape
+        if self.padding:
+            imgC, imgH, imgW = image_shape
+            # todo: change to 0 and modified image shape
+            h = img.shape[0]
+            w = img.shape[1]
+            ratio = w / float(h)
+            if math.ceil(imgH * ratio) > imgW:
+                resized_w = imgW
+            else:
+                resized_w = int(math.ceil(imgH * ratio))
+            resized_image = cv2.resize(img, (resized_w, imgH))
+            norm_img = np.expand_dims(resized_image, -1)
+            norm_img = norm_img.transpose((2, 0, 1))
+            resized_image = norm_img.astype(np.float32) / 128. - 1.
+            padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
+            padding_im[:, :, 0:resized_w] = resized_image
+            data['image'] = padding_im
+            return data
        if self.resize_type == 'PIL':
            image_pil = Image.fromarray(np.uint8(img))
            img = image_pil.resize(self.image_shape, Image.ANTIALIAS)
@@ -109,7 +130,8 @@ class SARRecResizeImg(object):

    def __call__(self, data):
        img = data['image']
-        norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar(img, self.image_shape, self.width_downsample_ratio)
+        norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar(
+            img, self.image_shape, self.width_downsample_ratio)
        data['image'] = norm_img
        data['resized_shape'] = resize_shape
        data['pad_shape'] = pad_shape

--- a/ppocr/data/simple_dataset.py
+++ b/ppocr/data/simple_dataset.py
@@ -15,7 +15,6 @@ import numpy as np
 import os
 import random
 from paddle.io import Dataset
-
 from .imaug import transform, create_operators



--- a/ppocr/losses/__init__.py
+++ b/ppocr/losses/__init__.py
@@ -20,6 +20,7 @@ import paddle.nn as nn
 from .det_db_loss import DBLoss
 from .det_east_loss import EASTLoss
 from .det_sast_loss import SASTLoss
+from .det_pse_loss import PSELoss

 # rec loss
 from .rec_ctc_loss import CTCLoss
@@ -42,10 +43,12 @@ from .combined_loss import CombinedLoss
 # table loss
 from .table_att_loss import TableAttentionLoss

+
 def build_loss(config):
    support_dict = [
-        'DBLoss', 'EASTLoss', 'SASTLoss', 'CTCLoss', 'ClsLoss', 'AttentionLoss',
-        'SRNLoss', 'PGLoss', 'CombinedLoss', 'NRTRLoss', 'TableAttentionLoss', 'SARLoss'
+        'DBLoss', 'PSELoss', 'EASTLoss', 'SASTLoss', 'CTCLoss', 'ClsLoss',
+        'AttentionLoss', 'SRNLoss', 'PGLoss', 'CombinedLoss', 'NRTRLoss',
+        'TableAttentionLoss', 'SARLoss'
    ]

    config = copy.deepcopy(config)

--- a/ppocr/losses/basic_loss.py
+++ b/ppocr/losses/basic_loss.py
@@ -56,31 +56,34 @@ class CELoss(nn.Layer):

 class KLJSLoss(object):
    def __init__(self, mode='kl'):
-        assert mode in ['kl', 'js', 'KL', 'JS'], "mode can only be one of ['kl', 'js', 'KL', 'JS']"
+        assert mode in ['kl', 'js', 'KL', 'JS'
+                        ], "mode can only be one of ['kl', 'js', 'KL', 'JS']"
        self.mode = mode

    def __call__(self, p1, p2, reduction="mean"):

-        loss = paddle.multiply(p2, paddle.log( (p2+1e-5)/(p1+1e-5) + 1e-5))
+        loss = paddle.multiply(p2, paddle.log((p2 + 1e-5) / (p1 + 1e-5) + 1e-5))

        if self.mode.lower() == "js":
-            loss += paddle.multiply(p1, paddle.log((p1+1e-5)/(p2+1e-5) + 1e-5))
+            loss += paddle.multiply(
+                p1, paddle.log((p1 + 1e-5) / (p2 + 1e-5) + 1e-5))
            loss *= 0.5
        if reduction == "mean":
-            loss = paddle.mean(loss, axis=[1,2])
-        elif reduction=="none" or reduction is None:
-            return loss 
+            loss = paddle.mean(loss, axis=[1, 2])
+        elif reduction == "none" or reduction is None:
+            return loss
        else:
-            loss = paddle.sum(loss, axis=[1,2])
+            loss = paddle.sum(loss, axis=[1, 2])
+
+        return loss

-        return loss 

 class DMLLoss(nn.Layer):
    """
    DMLLoss
    """

-    def __init__(self, act=None):
+    def __init__(self, act=None, use_log=False):
        super().__init__()
        if act is not None:
            assert act in ["softmax", "sigmoid"]
@@ -90,20 +93,24 @@ class DMLLoss(nn.Layer):
            self.act = nn.Sigmoid()
        else:
            self.act = None
-        
+
+        self.use_log = use_log
+
        self.jskl_loss = KLJSLoss(mode="js")

    def forward(self, out1, out2):
        if self.act is not None:
            out1 = self.act(out1)
            out2 = self.act(out2)
-        if len(out1.shape) < 2:
+        if self.use_log:
+            # for recognition distillation, log is needed for feature map
            log_out1 = paddle.log(out1)
            log_out2 = paddle.log(out2)
            loss = (F.kl_div(
                log_out1, out2, reduction='batchmean') + F.kl_div(
                    log_out2, out1, reduction='batchmean')) / 2.0
        else:
+            # for detection distillation log is not needed
            loss = self.jskl_loss(out1, out2)
        return loss


--- a/ppocr/losses/combined_loss.py
+++ b/ppocr/losses/combined_loss.py
@@ -49,11 +49,15 @@ class CombinedLoss(nn.Layer):
            loss = loss_func(input, batch, **kargs)
            if isinstance(loss, paddle.Tensor):
                loss = {"loss_{}_{}".format(str(loss), idx): loss}
+
            weight = self.loss_weight[idx]
-            for key in loss.keys():
-                if key == "loss":
-                    loss_all += loss[key] * weight
-                else:
-                    loss_dict["{}_{}".format(key, idx)] = loss[key]
+
+            loss = {key: loss[key] * weight for key in loss}
+
+            if "loss" in loss:
+                loss_all += loss["loss"]
+            else:
+                loss_all += paddle.add_n(list(loss.values()))
+            loss_dict.update(loss)
        loss_dict["loss"] = loss_all
        return loss_dict
--- a/ppocr/losses/det_basic_loss.py
+++ b/ppocr/losses/det_basic_loss.py
@@ -75,12 +75,6 @@ class BalanceLoss(nn.Layer):
            mask (variable): masked maps.
        return: (variable) balanced loss
        """
-        # if self.main_loss_type in ['DiceLoss']:
-        #     # For the loss that returns to scalar value, perform ohem on the mask
-        #     mask = ohem_batch(pred, gt, mask, self.negative_ratio)
-        #     loss = self.loss(pred, gt, mask)
-        #     return loss
-
        positive = gt * mask
        negative = (1 - gt) * mask

@@ -153,53 +147,4 @@ class BCELoss(nn.Layer):

    def forward(self, input, label, mask=None, weight=None, name=None):
        loss = F.binary_cross_entropy(input, label, reduction=self.reduction)
-        return loss
-
-
-def ohem_single(score, gt_text, training_mask, ohem_ratio):
-    pos_num = (int)(np.sum(gt_text > 0.5)) - (
-        int)(np.sum((gt_text > 0.5) & (training_mask <= 0.5)))
-
-    if pos_num == 0:
-        # selected_mask = gt_text.copy() * 0 # may be not good
-        selected_mask = training_mask
-        selected_mask = selected_mask.reshape(
-            1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32')
-        return selected_mask
-
-    neg_num = (int)(np.sum(gt_text <= 0.5))
-    neg_num = (int)(min(pos_num * ohem_ratio, neg_num))
-
-    if neg_num == 0:
-        selected_mask = training_mask
-        selected_mask = selected_mask.reshape(
-            1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32')
-        return selected_mask
-
-    neg_score = score[gt_text <= 0.5]
-    # 将负样本得分从高到低排序
-    neg_score_sorted = np.sort(-neg_score)
-    threshold = -neg_score_sorted[neg_num - 1]
-    # 选出 得分高的 负样本 和正样本 的 mask
-    selected_mask = ((score >= threshold) |
-                     (gt_text > 0.5)) & (training_mask > 0.5)
-    selected_mask = selected_mask.reshape(
-        1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32')
-    return selected_mask
-
-
-def ohem_batch(scores, gt_texts, training_masks, ohem_ratio):
-    scores = scores.numpy()
-    gt_texts = gt_texts.numpy()
-    training_masks = training_masks.numpy()
-
-    selected_masks = []
-    for i in range(scores.shape[0]):
-        selected_masks.append(
-            ohem_single(scores[i, :, :], gt_texts[i, :, :], training_masks[
-                i, :, :], ohem_ratio))
-
-    selected_masks = np.concatenate(selected_masks, 0)
-    selected_masks = paddle.to_tensor(selected_masks)
-
-    return selected_masks
+        return loss
\ No newline at end of file
--- a/ppocr/losses/det_pse_loss.py
+++ b/ppocr/losses/det_pse_loss.py
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+import numpy as np
+from ppocr.utils.iou import iou
+
+
+class PSELoss(nn.Layer):
+    def __init__(self,
+                 alpha,
+                 ohem_ratio=3,
+                 kernel_sample_mask='pred',
+                 reduction='sum',
+                 eps=1e-6,
+                 **kwargs):
+        """Implement PSE Loss.
+
+        Args:
+            alpha: weight of the text loss; the kernel loss gets 1 - alpha.
+            ohem_ratio: upper bound on selected negative pixels per positive
+                pixel during hard-example mining.
+            kernel_sample_mask: 'gt' or 'pred'; selects which text map masks
+                the kernel loss. Any other value leaves the OHEM mask computed
+                for the text loss in place.
+            reduction: 'sum', 'mean' or 'none'; applied to every entry of the
+                returned dict.
+            eps: constant added to the dice-loss denominator terms.
+        """
+        super(PSELoss, self).__init__()
+        assert reduction in ['sum', 'mean', 'none']
+        self.alpha = alpha
+        self.ohem_ratio = ohem_ratio
+        self.kernel_sample_mask = kernel_sample_mask
+        self.reduction = reduction
+        self.eps = eps
+
+    def forward(self, outputs, labels):
+        """Compute the text and kernel dice losses.
+
+        Args:
+            outputs: dict whose 'maps' entry holds the raw predictions;
+                channel 0 is the text map, the remaining channels are kernels.
+            labels: sequence whose elements from index 1 on are
+                (gt_texts, gt_kernels, training_masks).
+
+        Returns:
+            dict with 'loss_text', 'iou_text', 'loss_kernels', 'iou_kernel'
+            and the combined 'loss', reduced according to ``self.reduction``.
+        """
+        predicts = outputs['maps']
+        # Upsample the prediction by 4x before comparing with the labels.
+        predicts = F.interpolate(predicts, scale_factor=4)
+
+        texts = predicts[:, 0, :, :]
+        kernels = predicts[:, 1:, :, :]
+        gt_texts, gt_kernels, training_masks = labels[1:]
+
+        # text loss
+        # Pass the configured ratio explicitly; otherwise the default of 3
+        # in ohem_batch silently overrides ``self.ohem_ratio``.
+        selected_masks = self.ohem_batch(texts, gt_texts, training_masks,
+                                         self.ohem_ratio)
+
+        loss_text = self.dice_loss(texts, gt_texts, selected_masks)
+        iou_text = iou((texts > 0).astype('int64'),
+                       gt_texts,
+                       training_masks,
+                       reduce=False)
+        losses = dict(loss_text=loss_text, iou_text=iou_text)
+
+        # kernel loss
+        loss_kernels = []
+        if self.kernel_sample_mask == 'gt':
+            selected_masks = gt_texts * training_masks
+        elif self.kernel_sample_mask == 'pred':
+            selected_masks = (
+                F.sigmoid(texts) > 0.5).astype('float32') * training_masks
+
+        for i in range(kernels.shape[1]):
+            kernel_i = kernels[:, i, :, :]
+            gt_kernel_i = gt_kernels[:, i, :, :]
+            loss_kernel_i = self.dice_loss(kernel_i, gt_kernel_i,
+                                           selected_masks)
+            loss_kernels.append(loss_kernel_i)
+        # Average the per-kernel losses for every sample.
+        loss_kernels = paddle.mean(paddle.stack(loss_kernels, axis=1), axis=1)
+        iou_kernel = iou((kernels[:, -1, :, :] > 0).astype('int64'),
+                         gt_kernels[:, -1, :, :],
+                         training_masks * gt_texts,
+                         reduce=False)
+        losses.update(dict(loss_kernels=loss_kernels, iou_kernel=iou_kernel))
+        loss = self.alpha * loss_text + (1 - self.alpha) * loss_kernels
+        losses['loss'] = loss
+        if self.reduction == 'sum':
+            losses = {x: paddle.sum(v) for x, v in losses.items()}
+        elif self.reduction == 'mean':
+            losses = {x: paddle.mean(v) for x, v in losses.items()}
+        return losses
+
+    def dice_loss(self, input, target, mask):
+        """Return the per-sample dice loss of sigmoid(input) against target.
+
+        Both maps are flattened per sample and multiplied by ``mask`` before
+        the overlap is computed.
+        """
+        input = F.sigmoid(input)
+
+        input = input.reshape([input.shape[0], -1])
+        target = target.reshape([target.shape[0], -1])
+        mask = mask.reshape([mask.shape[0], -1])
+
+        input = input * mask
+        target = target * mask
+
+        a = paddle.sum(input * target, 1)
+        b = paddle.sum(input * input, 1) + self.eps
+        c = paddle.sum(target * target, 1) + self.eps
+        d = (2 * a) / (b + c)
+        return 1 - d
+
+    def ohem_single(self, score, gt_text, training_mask, ohem_ratio=3):
+        """Select the pixels of one sample that contribute to the text loss.
+
+        Keeps every positive pixel plus the highest-scoring negatives, at most
+        ``ohem_ratio`` negatives per positive, restricted to the training
+        mask. Returns a float32 mask of shape [1, H, W].
+        """
+        # Positives that are not masked out by the training mask.
+        pos_num = int(paddle.sum((gt_text > 0.5).astype('float32'))) - int(
+            paddle.sum(
+                paddle.logical_and((gt_text > 0.5), (training_mask <= 0.5))
+                .astype('float32')))
+
+        if pos_num == 0:
+            # Nothing to balance against: fall back to the training mask.
+            selected_mask = training_mask
+            selected_mask = selected_mask.reshape(
+                [1, selected_mask.shape[0], selected_mask.shape[1]]).astype(
+                    'float32')
+            return selected_mask
+
+        neg_num = int(paddle.sum((gt_text <= 0.5).astype('float32')))
+        neg_num = int(min(pos_num * ohem_ratio, neg_num))
+
+        if neg_num == 0:
+            # Use reshape with a shape list, like the other branches; the
+            # torch-style ``view(1, h, w)`` call is not a valid Paddle call.
+            selected_mask = training_mask
+            selected_mask = selected_mask.reshape(
+                [1, selected_mask.shape[0], selected_mask.shape[1]]).astype(
+                    'float32')
+            return selected_mask
+
+        # Score of the neg_num-th highest negative is the selection threshold.
+        neg_score = paddle.masked_select(score, gt_text <= 0.5)
+        neg_score_sorted = paddle.sort(-neg_score)
+        threshold = -neg_score_sorted[neg_num - 1]
+
+        selected_mask = paddle.logical_and(
+            paddle.logical_or((score >= threshold), (gt_text > 0.5)),
+            (training_mask > 0.5))
+        selected_mask = selected_mask.reshape(
+            [1, selected_mask.shape[0], selected_mask.shape[1]]).astype(
+                'float32')
+        return selected_mask
+
+    def ohem_batch(self, scores, gt_texts, training_masks, ohem_ratio=3):
+        """Run ohem_single on every sample and concatenate along axis 0."""
+        selected_masks = []
+        for i in range(scores.shape[0]):
+            selected_masks.append(
+                self.ohem_single(scores[i, :, :], gt_texts[i, :, :],
+                                 training_masks[i, :, :], ohem_ratio))
+
+        selected_masks = paddle.concat(selected_masks, 0).astype('float32')
+        return selected_masks
--- a/ppocr/losses/distillation_loss.py
+++ b/ppocr/losses/distillation_loss.py
@@ -44,20 +44,22 @@ class DistillationDMLLoss(DMLLoss):
    def __init__(self,
                 model_name_pairs=[],
                 act=None,
+                 use_log=False,
                 key=None,
                 maps_name=None,
                 name="dml"):
-        super().__init__(act=act)
+        super().__init__(act=act, use_log=use_log)
        assert isinstance(model_name_pairs, list)
        self.key = key
        self.model_name_pairs = self._check_model_name_pairs(model_name_pairs)
        self.name = name
        self.maps_name = self._check_maps_name(maps_name)
-    
+
    def _check_model_name_pairs(self, model_name_pairs):
        if not isinstance(model_name_pairs, list):
            return []
-        elif isinstance(model_name_pairs[0], list) and isinstance(model_name_pairs[0][0], str):
+        elif isinstance(model_name_pairs[0], list) and isinstance(
+                model_name_pairs[0][0], str):
            return model_name_pairs
        else:
            return [model_name_pairs]
@@ -112,9 +114,9 @@ class DistillationDMLLoss(DMLLoss):
                            loss_dict["{}_{}_{}_{}_{}".format(key, pair[
                                0], pair[1], map_name, idx)] = loss[key]
                    else:
-                        loss_dict["{}_{}_{}".format(self.name, self.maps_name[_c],
-                                                    idx)] = loss
-        
+                        loss_dict["{}_{}_{}".format(self.name, self.maps_name[
+                            _c], idx)] = loss
+
        loss_dict = _sum_loss(loss_dict)

        return loss_dict

--- a/ppocr/metrics/eval_det_iou.py
+++ b/ppocr/metrics/eval_det_iou.py
@@ -169,21 +169,10 @@ class DetectionIoUEvaluator(object):
        numGlobalCareDet += numDetCare

        perSampleMetrics = {
-            'precision': precision,
-            'recall': recall,
-            'hmean': hmean,
-            'pairs': pairs,
-            'iouMat': [] if len(detPols) > 100 else iouMat.tolist(),
-            'gtPolPoints': gtPolPoints,
-            'detPolPoints': detPolPoints,
            'gtCare': numGtCare,
            'detCare': numDetCare,
-            'gtDontCare': gtDontCarePolsNum,
-            'detDontCare': detDontCarePolsNum,
            'detMatched': detMatched,
-            'evaluationLog': evaluationLog
        }
-
        return perSampleMetrics

    def combine_results(self, results):

--- a/ppocr/modeling/backbones/rec_nrtr_mtb.py
+++ b/ppocr/modeling/backbones/rec_nrtr_mtb.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 from paddle import nn
+import paddle


 class MTB(nn.Layer):
@@ -40,7 +41,8 @@ class MTB(nn.Layer):
        x = self.block(images)
        if self.cnn_num == 2:
            # (b, w, h, c)
-            x = x.transpose([0, 3, 2, 1])
-            x_shape = x.shape
-            x = x.reshape([x_shape[0], x_shape[1], x_shape[2] * x_shape[3]])
+            x = paddle.transpose(x, [0, 3, 2, 1])
+            x_shape = paddle.shape(x)
+            x = paddle.reshape(
+                x, [x_shape[0], x_shape[1], x_shape[2] * x_shape[3]])
        return x
--- a/ppocr/modeling/heads/__init__.py
+++ b/ppocr/modeling/heads/__init__.py
@@ -20,6 +20,7 @@ def build_head(config):
    from .det_db_head import DBHead
    from .det_east_head import EASTHead
    from .det_sast_head import SASTHead
+    from .det_pse_head import PSEHead
    from .e2e_pg_head import PGHead

    # rec head
@@ -32,8 +33,9 @@ def build_head(config):
    # cls head
    from .cls_head import ClsHead
    support_dict = [
-        'DBHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead', 'AttentionHead',
-        'SRNHead', 'PGHead', 'Transformer', 'TableAttentionHead', 'SARHead'
+        'DBHead', 'PSEHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead',
+        'AttentionHead', 'SRNHead', 'PGHead', 'Transformer',
+        'TableAttentionHead', 'SARHead'
    ]

    #table head

--- a/ppocr/modeling/heads/det_pse_head.py
+++ b/ppocr/modeling/heads/det_pse_head.py
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle import nn
+
+
+class PSEHead(nn.Layer):
+    """Detection head: 3x3 conv -> batch norm -> ReLU -> 1x1 conv.
+
+    Args:
+        in_channels: channel count of the incoming feature map.
+        hidden_dim: channel count produced by the 3x3 convolution.
+        out_channels: channel count of the returned ``maps`` tensor.
+    """
+
+    def __init__(self, in_channels, hidden_dim=256, out_channels=7, **kwargs):
+        super(PSEHead, self).__init__()
+        self.conv1 = nn.Conv2D(
+            in_channels, hidden_dim, kernel_size=3, stride=1, padding=1)
+        self.bn1 = nn.BatchNorm2D(hidden_dim)
+        self.relu1 = nn.ReLU()
+
+        self.conv2 = nn.Conv2D(
+            hidden_dim, out_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, x, **kwargs):
+        # Extra keyword arguments are accepted and ignored.
+        hidden = self.relu1(self.bn1(self.conv1(x)))
+        return {'maps': self.conv2(hidden)}
--- a/ppocr/modeling/heads/multiheadAttention.py
+++ b/ppocr/modeling/heads/multiheadAttention.py
@@ -71,8 +71,6 @@ class MultiheadAttention(nn.Layer):
                value,
                key_padding_mask=None,
                incremental_state=None,
-                need_weights=True,
-                static_kv=False,
                attn_mask=None):
        """
        Inputs of forward function
@@ -88,46 +86,42 @@ class MultiheadAttention(nn.Layer):
            attn_output: [target length, batch size, embed dim]
            attn_output_weights: [batch size, target length, sequence length]
        """
-        tgt_len, bsz, embed_dim = query.shape
-        assert embed_dim == self.embed_dim
-        assert list(query.shape) == [tgt_len, bsz, embed_dim]
-        assert key.shape == value.shape
-
+        q_shape = paddle.shape(query)
+        src_shape = paddle.shape(key)
        q = self._in_proj_q(query)
        k = self._in_proj_k(key)
        v = self._in_proj_v(value)
        q *= self.scaling
-
-        q = q.reshape([tgt_len, bsz * self.num_heads, self.head_dim]).transpose(
-            [1, 0, 2])
-        k = k.reshape([-1, bsz * self.num_heads, self.head_dim]).transpose(
-            [1, 0, 2])
-        v = v.reshape([-1, bsz * self.num_heads, self.head_dim]).transpose(
-            [1, 0, 2])
-
-        src_len = k.shape[1]
-
+        q = paddle.transpose(
+            paddle.reshape(
+                q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]),
+            [1, 2, 0, 3])
+        k = paddle.transpose(
+            paddle.reshape(
+                k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
+            [1, 2, 0, 3])
+        v = paddle.transpose(
+            paddle.reshape(
+                v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
+            [1, 2, 0, 3])
        if key_padding_mask is not None:
-            assert key_padding_mask.shape[0] == bsz
-            assert key_padding_mask.shape[1] == src_len
-
-        attn_output_weights = paddle.bmm(q, k.transpose([0, 2, 1]))
-        assert list(attn_output_weights.
-                    shape) == [bsz * self.num_heads, tgt_len, src_len]
-
+            assert key_padding_mask.shape[0] == q_shape[1]
+            assert key_padding_mask.shape[1] == src_shape[0]
+        attn_output_weights = paddle.matmul(q,
+                                            paddle.transpose(k, [0, 1, 3, 2]))
        if attn_mask is not None:
-            attn_mask = attn_mask.unsqueeze(0)
+            attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0)
            attn_output_weights += attn_mask
        if key_padding_mask is not None:
-            attn_output_weights = attn_output_weights.reshape(
-                [bsz, self.num_heads, tgt_len, src_len])
-            key = key_padding_mask.unsqueeze(1).unsqueeze(2).astype('float32')
-            y = paddle.full(shape=key.shape, dtype='float32', fill_value='-inf')
+            attn_output_weights = paddle.reshape(
+                attn_output_weights,
+                [q_shape[1], self.num_heads, q_shape[0], src_shape[0]])
+            key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2)
+            key = paddle.cast(key, 'float32')
+            y = paddle.full(
+                shape=paddle.shape(key), dtype='float32', fill_value='-inf')
            y = paddle.where(key == 0., key, y)
            attn_output_weights += y
-            attn_output_weights = attn_output_weights.reshape(
-                [bsz * self.num_heads, tgt_len, src_len])
-
        attn_output_weights = F.softmax(
            attn_output_weights.astype('float32'),
            axis=-1,
@@ -136,43 +130,34 @@ class MultiheadAttention(nn.Layer):
        attn_output_weights = F.dropout(
            attn_output_weights, p=self.dropout, training=self.training)

-        attn_output = paddle.bmm(attn_output_weights, v)
-        assert list(attn_output.
-                    shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
-        attn_output = attn_output.transpose([1, 0, 2]).reshape(
-            [tgt_len, bsz, embed_dim])
+        attn_output = paddle.matmul(attn_output_weights, v)
+        attn_output = paddle.reshape(
+            paddle.transpose(attn_output, [2, 0, 1, 3]),
+            [q_shape[0], q_shape[1], self.embed_dim])
        attn_output = self.out_proj(attn_output)

-        if need_weights:
-            # average attention weights over heads
-            attn_output_weights = attn_output_weights.reshape(
-                [bsz, self.num_heads, tgt_len, src_len])
-            attn_output_weights = attn_output_weights.sum(
-                axis=1) / self.num_heads
-        else:
-            attn_output_weights = None
-        return attn_output, attn_output_weights
+        return attn_output

    def _in_proj_q(self, query):
-        query = query.transpose([1, 2, 0])
+        query = paddle.transpose(query, [1, 2, 0])
        query = paddle.unsqueeze(query, axis=2)
        res = self.conv1(query)
        res = paddle.squeeze(res, axis=2)
-        res = res.transpose([2, 0, 1])
+        res = paddle.transpose(res, [2, 0, 1])
        return res

    def _in_proj_k(self, key):
-        key = key.transpose([1, 2, 0])
+        key = paddle.transpose(key, [1, 2, 0])
        key = paddle.unsqueeze(key, axis=2)
        res = self.conv2(key)
        res = paddle.squeeze(res, axis=2)
-        res = res.transpose([2, 0, 1])
+        res = paddle.transpose(res, [2, 0, 1])
        return res

    def _in_proj_v(self, value):
-        value = value.transpose([1, 2, 0])  #(1, 2, 0)
+        value = paddle.transpose(value, [1, 2, 0])  #(1, 2, 0)
        value = paddle.unsqueeze(value, axis=2)
        res = self.conv3(value)
        res = paddle.squeeze(res, axis=2)
-        res = res.transpose([2, 0, 1])
+        res = paddle.transpose(res, [2, 0, 1])
        return res
--- a/ppocr/modeling/heads/rec_nrtr_head.py
+++ b/ppocr/modeling/heads/rec_nrtr_head.py
@@ -61,12 +61,12 @@ class Transformer(nn.Layer):
                 custom_decoder=None,
                 in_channels=0,
                 out_channels=0,
-                 dst_vocab_size=99,
                 scale_embedding=True):
        super(Transformer, self).__init__()
+        self.out_channels = out_channels + 1
        self.embedding = Embeddings(
            d_model=d_model,
-            vocab=dst_vocab_size,
+            vocab=self.out_channels,
            padding_idx=0,
            scale_embedding=scale_embedding)
        self.positional_encoding = PositionalEncoding(
@@ -96,9 +96,10 @@ class Transformer(nn.Layer):
        self.beam_size = beam_size
        self.d_model = d_model
        self.nhead = nhead
-        self.tgt_word_prj = nn.Linear(d_model, dst_vocab_size, bias_attr=False)
+        self.tgt_word_prj = nn.Linear(
+            d_model, self.out_channels, bias_attr=False)
        w0 = np.random.normal(0.0, d_model**-0.5,
-                              (d_model, dst_vocab_size)).astype(np.float32)
+                              (d_model, self.out_channels)).astype(np.float32)
        self.tgt_word_prj.weight.set_value(w0)
        self.apply(self._init_weights)

@@ -156,46 +157,41 @@ class Transformer(nn.Layer):
                return self.forward_test(src)

    def forward_test(self, src):
-        bs = src.shape[0]
+        bs = paddle.shape(src)[0]
        if self.encoder is not None:
-            src = self.positional_encoding(src.transpose([1, 0, 2]))
+            src = self.positional_encoding(paddle.transpose(src, [1, 0, 2]))
            memory = self.encoder(src)
        else:
-            memory = src.squeeze(2).transpose([2, 0, 1])
+            memory = paddle.transpose(paddle.squeeze(src, 2), [2, 0, 1])
        dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64)
+        dec_prob = paddle.full((bs, 1), 1., dtype=paddle.float32)
        for len_dec_seq in range(1, 25):
-            src_enc = memory.clone()
-            tgt_key_padding_mask = self.generate_padding_mask(dec_seq)
-            dec_seq_embed = self.embedding(dec_seq).transpose([1, 0, 2])
+            dec_seq_embed = paddle.transpose(self.embedding(dec_seq), [1, 0, 2])
            dec_seq_embed = self.positional_encoding(dec_seq_embed)
-            tgt_mask = self.generate_square_subsequent_mask(dec_seq_embed.shape[
-                0])
+            tgt_mask = self.generate_square_subsequent_mask(
+                paddle.shape(dec_seq_embed)[0])
            output = self.decoder(
                dec_seq_embed,
-                src_enc,
+                memory,
                tgt_mask=tgt_mask,
                memory_mask=None,
-                tgt_key_padding_mask=tgt_key_padding_mask,
+                tgt_key_padding_mask=None,
                memory_key_padding_mask=None)
-            dec_output = output.transpose([1, 0, 2])
-
-            dec_output = dec_output[:,
-                                    -1, :]  # Pick the last step: (bh * bm) * d_h
-            word_prob = F.log_softmax(self.tgt_word_prj(dec_output), axis=1)
-            word_prob = word_prob.reshape([1, bs, -1])
-            preds_idx = word_prob.argmax(axis=2)
-
+            dec_output = paddle.transpose(output, [1, 0, 2])
+            dec_output = dec_output[:, -1, :]
+            word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1)
+            preds_idx = paddle.argmax(word_prob, axis=1)
            if paddle.equal_all(
-                    preds_idx[-1],
+                    preds_idx,
                    paddle.full(
-                        preds_idx[-1].shape, 3, dtype='int64')):
+                        paddle.shape(preds_idx), 3, dtype='int64')):
                break
-
-            preds_prob = word_prob.max(axis=2)
+            preds_prob = paddle.max(word_prob, axis=1)
            dec_seq = paddle.concat(
-                [dec_seq, preds_idx.reshape([-1, 1])], axis=1)
-
-        return dec_seq
+                [dec_seq, paddle.reshape(preds_idx, [-1, 1])], axis=1)
+            dec_prob = paddle.concat(
+                [dec_prob, paddle.reshape(preds_prob, [-1, 1])], axis=1)
+        return [dec_seq, dec_prob]

    def forward_beam(self, images):
        ''' Translation work in one batch '''
@@ -211,14 +207,15 @@ class Transformer(nn.Layer):
                                n_prev_active_inst, n_bm):
            ''' Collect tensor parts associated to active instances. '''

-            _, *d_hs = beamed_tensor.shape
+            beamed_tensor_shape = paddle.shape(beamed_tensor)
            n_curr_active_inst = len(curr_active_inst_idx)
-            new_shape = (n_curr_active_inst * n_bm, *d_hs)
+            new_shape = (n_curr_active_inst * n_bm, beamed_tensor_shape[1],
+                         beamed_tensor_shape[2])

            beamed_tensor = beamed_tensor.reshape([n_prev_active_inst, -1])
            beamed_tensor = beamed_tensor.index_select(
-                paddle.to_tensor(curr_active_inst_idx), axis=0)
-            beamed_tensor = beamed_tensor.reshape([*new_shape])
+                curr_active_inst_idx, axis=0)
+            beamed_tensor = beamed_tensor.reshape(new_shape)

            return beamed_tensor

@@ -249,44 +246,26 @@ class Transformer(nn.Layer):
                    b.get_current_state() for b in inst_dec_beams if not b.done
                ]
                dec_partial_seq = paddle.stack(dec_partial_seq)
-
                dec_partial_seq = dec_partial_seq.reshape([-1, len_dec_seq])
                return dec_partial_seq

-            def prepare_beam_memory_key_padding_mask(
-                    inst_dec_beams, memory_key_padding_mask, n_bm):
-                keep = []
-                for idx in (memory_key_padding_mask):
-                    if not inst_dec_beams[idx].done:
-                        keep.append(idx)
-                memory_key_padding_mask = memory_key_padding_mask[
-                    paddle.to_tensor(keep)]
-                len_s = memory_key_padding_mask.shape[-1]
-                n_inst = memory_key_padding_mask.shape[0]
-                memory_key_padding_mask = paddle.concat(
-                    [memory_key_padding_mask for i in range(n_bm)], axis=1)
-                memory_key_padding_mask = memory_key_padding_mask.reshape(
-                    [n_inst * n_bm, len_s])  #repeat(1, n_bm)
-                return memory_key_padding_mask
-
            def predict_word(dec_seq, enc_output, n_active_inst, n_bm,
                             memory_key_padding_mask):
-                tgt_key_padding_mask = self.generate_padding_mask(dec_seq)
-                dec_seq = self.embedding(dec_seq).transpose([1, 0, 2])
+                dec_seq = paddle.transpose(self.embedding(dec_seq), [1, 0, 2])
                dec_seq = self.positional_encoding(dec_seq)
-                tgt_mask = self.generate_square_subsequent_mask(dec_seq.shape[
-                    0])
+                tgt_mask = self.generate_square_subsequent_mask(
+                    paddle.shape(dec_seq)[0])
                dec_output = self.decoder(
                    dec_seq,
                    enc_output,
                    tgt_mask=tgt_mask,
-                    tgt_key_padding_mask=tgt_key_padding_mask,
-                    memory_key_padding_mask=memory_key_padding_mask,
-                ).transpose([1, 0, 2])
+                    tgt_key_padding_mask=None,
+                    memory_key_padding_mask=memory_key_padding_mask, )
+                dec_output = paddle.transpose(dec_output, [1, 0, 2])
                dec_output = dec_output[:,
                                        -1, :]  # Pick the last step: (bh * bm) * d_h
-                word_prob = F.log_softmax(self.tgt_word_prj(dec_output), axis=1)
-                word_prob = word_prob.reshape([n_active_inst, n_bm, -1])
+                word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1)
+                word_prob = paddle.reshape(word_prob, [n_active_inst, n_bm, -1])
                return word_prob

            def collect_active_inst_idx_list(inst_beams, word_prob,
@@ -302,9 +281,8 @@ class Transformer(nn.Layer):

            n_active_inst = len(inst_idx_to_position_map)
            dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
-            memory_key_padding_mask = None
            word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm,
-                                     memory_key_padding_mask)
+                                     None)
            # Update the beam with predicted word prob information and collect incomplete instances
            active_inst_idx_list = collect_active_inst_idx_list(
                inst_dec_beams, word_prob, inst_idx_to_position_map)
@@ -324,27 +302,21 @@ class Transformer(nn.Layer):

        with paddle.no_grad():
            #-- Encode
-
            if self.encoder is not None:
                src = self.positional_encoding(images.transpose([1, 0, 2]))
-                src_enc = self.encoder(src).transpose([1, 0, 2])
+                src_enc = self.encoder(src)
            else:
                src_enc = images.squeeze(2).transpose([0, 2, 1])

-            #-- Repeat data for beam search
            n_bm = self.beam_size
-            n_inst, len_s, d_h = src_enc.shape
-            src_enc = paddle.concat([src_enc for i in range(n_bm)], axis=1)
-            src_enc = src_enc.reshape([n_inst * n_bm, len_s, d_h]).transpose(
-                [1, 0, 2])
-            #-- Prepare beams
-            inst_dec_beams = [Beam(n_bm) for _ in range(n_inst)]
-
-            #-- Bookkeeping for active or not
-            active_inst_idx_list = list(range(n_inst))
+            src_shape = paddle.shape(src_enc)
+            inst_dec_beams = [Beam(n_bm) for _ in range(1)]
+            active_inst_idx_list = list(range(1))
+            # Repeat data for beam search
+            src_enc = paddle.tile(src_enc, [1, n_bm, 1])
            inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(
                active_inst_idx_list)
-            #-- Decode
+            # Decode
            for len_dec_seq in range(1, 25):
                src_enc_copy = src_enc.clone()
                active_inst_idx_list = beam_decode_step(
@@ -358,10 +330,19 @@ class Transformer(nn.Layer):
        batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams,
                                                                1)
        result_hyp = []
-        for bs_hyp in batch_hyp:
-            bs_hyp_pad = bs_hyp[0] + [3] * (25 - len(bs_hyp[0]))
+        hyp_scores = []
+        for bs_hyp, score in zip(batch_hyp, batch_scores):
+            l = len(bs_hyp[0])
+            bs_hyp_pad = bs_hyp[0] + [3] * (25 - l)
            result_hyp.append(bs_hyp_pad)
-        return paddle.to_tensor(np.array(result_hyp), dtype=paddle.int64)
+            score = float(score) / l
+            hyp_score = [score for _ in range(25)]
+            hyp_scores.append(hyp_score)
+        return [
+            paddle.to_tensor(
+                np.array(result_hyp), dtype=paddle.int64),
+            paddle.to_tensor(hyp_scores)
+        ]

    def generate_square_subsequent_mask(self, sz):
        """Generate a square mask for the sequence. The masked positions are filled with float('-inf').
@@ -376,7 +357,7 @@ class Transformer(nn.Layer):
        return mask

    def generate_padding_mask(self, x):
-        padding_mask = x.equal(paddle.to_tensor(0, dtype=x.dtype))
+        padding_mask = paddle.equal(x, paddle.to_tensor(0, dtype=x.dtype))
        return padding_mask

    def _reset_parameters(self):
@@ -514,17 +495,17 @@ class TransformerEncoderLayer(nn.Layer):
            src,
            src,
            attn_mask=src_mask,
-            key_padding_mask=src_key_padding_mask)[0]
+            key_padding_mask=src_key_padding_mask)
        src = src + self.dropout1(src2)
        src = self.norm1(src)

-        src = src.transpose([1, 2, 0])
+        src = paddle.transpose(src, [1, 2, 0])
        src = paddle.unsqueeze(src, 2)
        src2 = self.conv2(F.relu(self.conv1(src)))
        src2 = paddle.squeeze(src2, 2)
-        src2 = src2.transpose([2, 0, 1])
+        src2 = paddle.transpose(src2, [2, 0, 1])
        src = paddle.squeeze(src, 2)
-        src = src.transpose([2, 0, 1])
+        src = paddle.transpose(src, [2, 0, 1])

        src = src + self.dropout2(src2)
        src = self.norm2(src)
@@ -598,7 +579,7 @@ class TransformerDecoderLayer(nn.Layer):
            tgt,
            tgt,
            attn_mask=tgt_mask,
-            key_padding_mask=tgt_key_padding_mask)[0]
+            key_padding_mask=tgt_key_padding_mask)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        tgt2 = self.multihead_attn(
@@ -606,18 +587,18 @@ class TransformerDecoderLayer(nn.Layer):
            memory,
            memory,
            attn_mask=memory_mask,
-            key_padding_mask=memory_key_padding_mask)[0]
+            key_padding_mask=memory_key_padding_mask)
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)

        # default
-        tgt = tgt.transpose([1, 2, 0])
+        tgt = paddle.transpose(tgt, [1, 2, 0])
        tgt = paddle.unsqueeze(tgt, 2)
        tgt2 = self.conv2(F.relu(self.conv1(tgt)))
        tgt2 = paddle.squeeze(tgt2, 2)
-        tgt2 = tgt2.transpose([2, 0, 1])
+        tgt2 = paddle.transpose(tgt2, [2, 0, 1])
        tgt = paddle.squeeze(tgt, 2)
-        tgt = tgt.transpose([2, 0, 1])
+        tgt = paddle.transpose(tgt, [2, 0, 1])

        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
@@ -656,8 +637,8 @@ class PositionalEncoding(nn.Layer):
            (-math.log(10000.0) / dim))
        pe[:, 0::2] = paddle.sin(position * div_term)
        pe[:, 1::2] = paddle.cos(position * div_term)
-        pe = pe.unsqueeze(0)
-        pe = pe.transpose([1, 0, 2])
+        pe = paddle.unsqueeze(pe, 0)
+        pe = paddle.transpose(pe, [1, 0, 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
@@ -670,7 +651,7 @@ class PositionalEncoding(nn.Layer):
        Examples:
            >>> output = pos_encoder(x)
        """
-        x = x + self.pe[:x.shape[0], :]
+        x = x + self.pe[:paddle.shape(x)[0], :]
        return self.dropout(x)


@@ -702,7 +683,7 @@ class PositionalEncoding_2d(nn.Layer):
            (-math.log(10000.0) / dim))
        pe[:, 0::2] = paddle.sin(position * div_term)
        pe[:, 1::2] = paddle.cos(position * div_term)
-        pe = pe.unsqueeze(0).transpose([1, 0, 2])
+        pe = paddle.transpose(paddle.unsqueeze(pe, 0), [1, 0, 2])
        self.register_buffer('pe', pe)

        self.avg_pool_1 = nn.AdaptiveAvgPool2D((1, 1))
@@ -722,22 +703,23 @@ class PositionalEncoding_2d(nn.Layer):
        Examples:
            >>> output = pos_encoder(x)
        """
-        w_pe = self.pe[:x.shape[-1], :]
+        w_pe = self.pe[:paddle.shape(x)[-1], :]
        w1 = self.linear1(self.avg_pool_1(x).squeeze()).unsqueeze(0)
        w_pe = w_pe * w1
-        w_pe = w_pe.transpose([1, 2, 0])
-        w_pe = w_pe.unsqueeze(2)
+        w_pe = paddle.transpose(w_pe, [1, 2, 0])
+        w_pe = paddle.unsqueeze(w_pe, 2)

-        h_pe = self.pe[:x.shape[-2], :]
+        h_pe = self.pe[:paddle.shape(x)[-2], :]
        w2 = self.linear2(self.avg_pool_2(x).squeeze()).unsqueeze(0)
        h_pe = h_pe * w2
-        h_pe = h_pe.transpose([1, 2, 0])
-        h_pe = h_pe.unsqueeze(3)
+        h_pe = paddle.transpose(h_pe, [1, 2, 0])
+        h_pe = paddle.unsqueeze(h_pe, 3)

        x = x + w_pe + h_pe
-        x = x.reshape(
-            [x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]).transpose(
-                [2, 0, 1])
+        x = paddle.transpose(
+            paddle.reshape(x,
+                           [x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]),
+            [2, 0, 1])

        return self.dropout(x)

@@ -817,7 +799,7 @@ class Beam():
    def sort_scores(self):
        "Sort the scores."
        return self.scores, paddle.to_tensor(
-            [i for i in range(self.scores.shape[0])], dtype='int32')
+            [i for i in range(int(self.scores.shape[0]))], dtype='int32')

    def get_the_best_score_and_idx(self):
        "Get the score of the best in the beam."

--- a/ppocr/modeling/heads/rec_sar_head.py
+++ b/ppocr/modeling/heads/rec_sar_head.py
@@ -235,7 +235,8 @@ class ParallelSARDecoder(BaseDecoder):
            # cal mask of attention weight
            for i, valid_ratio in enumerate(valid_ratios):
                valid_width = min(w, math.ceil(w * valid_ratio))
-                attn_weight[i, :, :, valid_width:, :] = float('-inf')
+                if valid_width < w:
+                    attn_weight[i, :, :, valid_width:, :] = float('-inf')

        attn_weight = paddle.reshape(attn_weight, [bsz, T, -1])
        attn_weight = F.softmax(attn_weight, axis=-1)

--- a/ppocr/modeling/necks/__init__.py
+++ b/ppocr/modeling/necks/__init__.py
@@ -22,7 +22,8 @@ def build_neck(config):
    from .rnn import SequenceEncoder
    from .pg_fpn import PGFPN
    from .table_fpn import TableFPN
-    support_dict = ['DBFPN', 'EASTFPN', 'SASTFPN', 'SequenceEncoder', 'PGFPN', 'TableFPN']
+    from .fpn import FPN
+    support_dict = ['FPN','DBFPN', 'EASTFPN', 'SASTFPN', 'SequenceEncoder', 'PGFPN', 'TableFPN']

    module_name = config.pop('name')
    assert module_name in support_dict, Exception('neck only support {}'.format(