refactor(ocr): remove unused code and simplify model architecture

- Remove unused imports and code
- Simplify model architecture by removing unnecessary components
- Update initialization and forward pass logic
- Rename variables for consistency

refactor(ocr): remove unused code and simplify model architecture
- Remove unused imports and code
- Simplify model architecture by removing unnecessary components
- Update initialization and forward pass logic
- Rename variables for consistency
b3d6785d · myhloli · 3cb156f5 · 3cb156f5 · 3cb156f5 · 3cb156f5
Commit b3d6785d authored Apr 01, 2025 by myhloli
20 changed files
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/stn.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/stn.py
-"""
-This code is refer from:
-https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/stn_head.py
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import torch
-from torch import nn
-from torch.nn import functional as F
-import numpy as np
-
-from .tps_spatial_transformer import TPSSpatialTransformer
-
-
-def conv3x3_block(in_channels, out_channels, stride=1):
-    n = 3 * 3 * out_channels
-    w = math.sqrt(2. / n)
-    conv_layer = nn.Conv2d(
-        in_channels,
-        out_channels,
-        kernel_size=3,
-        stride=stride,
-        padding=1,
-        bias=True)
-    block = nn.Sequential(conv_layer, nn.BatchNorm2d(out_channels), nn.ReLU())
-    return block
-
-
-class STN(nn.Module):
-    def __init__(self, in_channels, num_ctrlpoints, activation='none'):
-        super(STN, self).__init__()
-        self.in_channels = in_channels
-        self.num_ctrlpoints = num_ctrlpoints
-        self.activation = activation
-        self.stn_convnet = nn.Sequential(
-            conv3x3_block(in_channels, 32),  #32x64
-            nn.MaxPool2d(
-                kernel_size=2, stride=2),
-            conv3x3_block(32, 64),  #16x32
-            nn.MaxPool2d(
-                kernel_size=2, stride=2),
-            conv3x3_block(64, 128),  # 8*16
-            nn.MaxPool2d(
-                kernel_size=2, stride=2),
-            conv3x3_block(128, 256),  # 4*8
-            nn.MaxPool2d(
-                kernel_size=2, stride=2),
-            conv3x3_block(256, 256),  # 2*4,
-            nn.MaxPool2d(
-                kernel_size=2, stride=2),
-            conv3x3_block(256, 256))  # 1*2
-        self.stn_fc1 = nn.Sequential(
-            nn.Linear(
-                2 * 256,
-                512,
-                bias=True),
-            nn.BatchNorm1d(512),
-            nn.ReLU(inplace=True))
-        fc2_bias = self.init_stn()
-        self.stn_fc2 = nn.Linear(
-            512,
-            num_ctrlpoints * 2,
-            bias=True)
-
-    def init_stn(self):
-        margin = 0.01
-        sampling_num_per_side = int(self.num_ctrlpoints / 2)
-        ctrl_pts_x = np.linspace(margin, 1. - margin, sampling_num_per_side)
-        ctrl_pts_y_top = np.ones(sampling_num_per_side) * margin
-        ctrl_pts_y_bottom = np.ones(sampling_num_per_side) * (1 - margin)
-        ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
-        ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
-        ctrl_points = np.concatenate(
-            [ctrl_pts_top, ctrl_pts_bottom], axis=0).astype(np.float32)
-        if self.activation == 'none':
-            pass
-        elif self.activation == 'sigmoid':
-            ctrl_points = -np.log(1. / ctrl_points - 1.)
-        ctrl_points = torch.Tensor(ctrl_points)
-        # fc2_bias = ctrl_points.view(-1)
-        fc2_bias = torch.reshape(
-            ctrl_points, shape=[ctrl_points.shape[0] * ctrl_points.shape[1]])
-        return fc2_bias
-
-    def forward(self, x):
-        x = self.stn_convnet(x)
-        batch_size, _, h, w = x.shape
-        # x = x.view(batch_size, -1)
-        x = torch.reshape(x, shape=(batch_size, -1))
-        img_feat = self.stn_fc1(x)
-        x = self.stn_fc2(0.1 * img_feat)
-        if self.activation == 'sigmoid':
-            x = F.sigmoid(x)
-        # x = x.view(-1, self.num_ctrlpoints, 2)
-        x = torch.reshape(x, shape=[-1, self.num_ctrlpoints, 2])
-        return img_feat, x
-
-
-class STN_ON(nn.Module):
-    def __init__(self, in_channels, tps_inputsize, tps_outputsize,
-                 num_control_points, tps_margins, stn_activation):
-        super(STN_ON, self).__init__()
-        self.tps = TPSSpatialTransformer(
-            output_image_size=tuple(tps_outputsize),
-            num_control_points=num_control_points,
-            margins=tuple(tps_margins))
-        self.stn_head = STN(in_channels=in_channels,
-                            num_ctrlpoints=num_control_points,
-                            activation=stn_activation)
-        self.tps_inputsize = tps_inputsize
-        self.out_channels = in_channels
-
-    def forward(self, image):
-        stn_input = torch.nn.functional.interpolate(
-            image, self.tps_inputsize, mode="bilinear", align_corners=True)
-        stn_img_feat, ctrl_points = self.stn_head(stn_input)
-        x, _ = self.tps(image, ctrl_points)
-        return x
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tbsrn.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tbsrn.py
-# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This code is refer from:
-https://github.com/FudanVI/FudanOCR/blob/main/scene-text-telescope/model/tbsrn.py
-"""
-
-import math
-import warnings
-import numpy as np
-import torch
-from torch import nn
-import string
-
-warnings.filterwarnings("ignore")
-
-from .tps_spatial_transformer import TPSSpatialTransformer
-from .stn import STN as STNHead
-from .tsrn import GruBlock, mish, UpsampleBLock
-from pytorchocr.modeling.heads.sr_rensnet_transformer import Transformer, LayerNorm, \
-    PositionwiseFeedForward, MultiHeadedAttention
-
-
-def positionalencoding2d(d_model, height, width):
-    """
-    :param d_model: dimension of the model
-    :param height: height of the positions
-    :param width: width of the positions
-    :return: d_model*height*width position matrix
-    """
-    if d_model % 4 != 0:
-        raise ValueError("Cannot use sin/cos positional encoding with "
-                         "odd dimension (got dim={:d})".format(d_model))
-    pe = torch.zeros([d_model, height, width])
-    # Each dimension use half of d_model
-    d_model = int(d_model / 2)
-    div_term = torch.exp(torch.arange(0., d_model, 2) *
-                          -(math.log(10000.0) / d_model))
-    pos_w = torch.arange(0., width, dtype=torch.float32).unsqueeze(1)
-    pos_h = torch.arange(0., height, dtype=torch.float32).unsqueeze(1)
-
-    pe[0:d_model:2, :, :] = torch.sin(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
-    pe[1:d_model:2, :, :] = torch.cos(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
-    pe[d_model::2, :, :] = torch.sin(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)
-    pe[d_model + 1::2, :, :] = torch.cos(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)
-
-    return pe
-
-
-class FeatureEnhancer(nn.Module):
-
-    def __init__(self):
-        super(FeatureEnhancer, self).__init__()
-
-        self.multihead = MultiHeadedAttention(h=4, d_model=128, dropout=0.1)
-        self.mul_layernorm1 = LayerNorm(features=128)
-
-        self.pff = PositionwiseFeedForward(128, 128)
-        self.mul_layernorm3 = LayerNorm(features=128)
-
-        self.linear = nn.Linear(128, 64)
-
-    def forward(self, conv_feature):
-        '''
-        text : (batch, seq_len, embedding_size)
-        global_info: (batch, embedding_size, 1, 1)
-        conv_feature: (batch, channel, H, W)
-        '''
-        batch = conv_feature.shape[0]
-        if torch.cuda.is_available():
-            position2d = positionalencoding2d(64, 16, 64).float().cuda().unsqueeze(0).reshape([1, 64, 1024])
-        else:
-            position2d = positionalencoding2d(64, 16, 64).float().unsqueeze(0).reshape([1, 64, 1024])
-        position2d = position2d.repeat(batch, 1, 1)
-        conv_feature = torch.cat([conv_feature, position2d], 1)  # batch, 128(64+64), 32, 128
-        result = conv_feature.permute(0, 2, 1).contiguous()
-        origin_result = result
-        result = self.mul_layernorm1(origin_result + self.multihead(result, result, result, mask=None)[0])
-        origin_result = result
-        result = self.mul_layernorm3(origin_result + self.pff(result))
-        result = self.linear(result)
-        return result.permute(0, 2, 1).contiguous()
-
-
-def str_filt(str_, voc_type):
-    alpha_dict = {
-        'digit': string.digits,
-        'lower': string.digits + string.ascii_lowercase,
-        'upper': string.digits + string.ascii_letters,
-        'all': string.digits + string.ascii_letters + string.punctuation
-    }
-    if voc_type == 'lower':
-        str_ = str_.lower()
-    for char in str_:
-        if char not in alpha_dict[voc_type]:
-            str_ = str_.replace(char, '')
-    str_ = str_.lower()
-    return str_
-
-
-class TBSRN(nn.Module):
-    def __init__(self,
-                 in_channels=3,
-                 scale_factor=2,
-                 width=128,
-                 height=32,
-                 STN=True,
-                 srb_nums=5,
-                 mask=False,
-                 hidden_units=32,
-                 infer_mode=False):
-        super(TBSRN, self).__init__()
-        in_planes = 3
-        if mask:
-            in_planes = 4
-        assert math.log(scale_factor, 2) % 1 == 0
-        upsample_block_num = int(math.log(scale_factor, 2))
-        self.block1 = nn.Sequential(
-            nn.Conv2d(in_planes, 2 * hidden_units, kernel_size=9, padding=4),
-            nn.PReLU()
-            # nn.ReLU()
-        )
-        self.srb_nums = srb_nums
-        for i in range(srb_nums):
-            setattr(self, 'block%d' % (i + 2), RecurrentResidualBlock(2 * hidden_units))
-
-        setattr(self, 'block%d' % (srb_nums + 2),
-                nn.Sequential(
-                    nn.Conv2d(2 * hidden_units, 2 * hidden_units, kernel_size=3, padding=1),
-                    nn.BatchNorm2d(2 * hidden_units)
-                ))
-
-        # self.non_local = NonLocalBlock2D(64, 64)
-        block_ = [UpsampleBLock(2 * hidden_units, 2) for _ in range(upsample_block_num)]
-        block_.append(nn.Conv2d(2 * hidden_units, in_planes, kernel_size=9, padding=4))
-        setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(*block_))
-        self.tps_inputsize = [height // scale_factor, width // scale_factor]
-        tps_outputsize = [height // scale_factor, width // scale_factor]
-        num_control_points = 20
-        tps_margins = [0.05, 0.05]
-        self.stn = STN
-        self.out_channels = in_channels
-        if self.stn:
-            self.tps = TPSSpatialTransformer(
-                output_image_size=tuple(tps_outputsize),
-                num_control_points=num_control_points,
-                margins=tuple(tps_margins))
-
-            self.stn_head = STNHead(
-                in_channels=in_planes,
-                num_ctrlpoints=num_control_points,
-                activation='none')
-        self.infer_mode = infer_mode
-
-        self.english_alphabet = '-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
-        self.english_dict = {}
-        for index in range(len(self.english_alphabet)):
-            self.english_dict[self.english_alphabet[index]] = index
-        transformer = Transformer(alphabet='-0123456789abcdefghijklmnopqrstuvwxyz')
-        self.transformer = transformer
-        for param in self.transformer.parameters():
-            param.trainable = False
-
-    def label_encoder(self, label):
-        batch = len(label)
-
-        length = [len(i) for i in label]
-        length_tensor = torch.Tensor(length).type(torch.int64)
-
-        max_length = max(length)
-        input_tensor = np.zeros((batch, max_length))
-        for i in range(batch):
-            for j in range(length[i] - 1):
-                input_tensor[i][j + 1] = self.english_dict[label[i][j]]
-
-        text_gt = []
-        for i in label:
-            for j in i:
-                text_gt.append(self.english_dict[j])
-        text_gt = torch.Tensor(text_gt).type(torch.int64)
-
-        input_tensor = torch.Tensor(input_tensor).type(torch.int64)
-        return length_tensor, input_tensor, text_gt
-
-    def forward(self, x):
-        output = {}
-        if self.infer_mode:
-            output["lr_img"] = x
-            y = x
-        else:
-            output["lr_img"] = x[0]
-            output["hr_img"] = x[1]
-            y = x[0]
-        if self.stn and self.training:
-            _, ctrl_points_x = self.stn_head(y)
-            y, _ = self.tps(y, ctrl_points_x)
-        block = {'1': self.block1(y)}
-        for i in range(self.srb_nums + 1):
-            block[str(i + 2)] = getattr(self,
-                                        'block%d' % (i + 2))(block[str(i + 1)])
-
-        block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3)) \
-            ((block['1'] + block[str(self.srb_nums + 2)]))
-
-        sr_img = torch.tanh(block[str(self.srb_nums + 3)])
-        output["sr_img"] = sr_img
-
-        if self.training:
-            hr_img = x[1]
-
-            # add transformer
-            label = [str_filt(i, 'lower') + '-' for i in x[2]]
-            length_tensor, input_tensor, text_gt = self.label_encoder(label)
-            hr_pred, word_attention_map_gt, hr_correct_list = self.transformer(hr_img, length_tensor,
-                                                                               input_tensor)
-            sr_pred, word_attention_map_pred, sr_correct_list = self.transformer(sr_img, length_tensor,
-                                                                                 input_tensor)
-            output["hr_img"] = hr_img
-            output["hr_pred"] = hr_pred
-            output["text_gt"] = text_gt
-            output["word_attention_map_gt"] = word_attention_map_gt
-            output["sr_pred"] = sr_pred
-            output["word_attention_map_pred"] = word_attention_map_pred
-
-        return output
-
-
-class RecurrentResidualBlock(nn.Module):
-    def __init__(self, channels):
-        super(RecurrentResidualBlock, self).__init__()
-        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
-        self.bn1 = nn.BatchNorm2d(channels)
-        self.gru1 = GruBlock(channels, channels)
-        # self.prelu = nn.ReLU()
-        self.prelu = mish()
-        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
-        self.bn2 = nn.BatchNorm2d(channels)
-        self.gru2 = GruBlock(channels, channels)
-        self.feature_enhancer = FeatureEnhancer()
-
-        for p in self.parameters():
-            if p.dim() > 1:
-                nn.init.xavier_uniform_(p)
-
-    def forward(self, x):
-        residual = self.conv1(x)
-        residual = self.bn1(residual)
-        residual = self.prelu(residual)
-        residual = self.conv2(residual)
-        residual = self.bn2(residual)
-
-        size = residual.shape
-        residual = residual.reshape([size[0], size[1], -1])
-        residual = self.feature_enhancer(residual)
-        residual = residual.reshape([size[0], size[1], size[2], size[3]])
-        return x + residual
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tps.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tps.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import os, sys
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from pytorchocr.modeling.common import Activation
-# import paddle
-# from paddle import nn, ParamAttr
-# from paddle.nn import functional as F
-import numpy as np
-
-
-class ConvBNLayer(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 groups=1,
-                 act=None,
-                 name=None):
-        super(ConvBNLayer, self).__init__()
-        self.conv = nn.Conv2d(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=(kernel_size - 1) // 2,
-            groups=groups,
-            bias=False,
-        )
-        bn_name = "bn_" + name
-        self.bn = nn.BatchNorm2d(
-            out_channels, )
-        self.act = act
-        if act is not None:
-            self._act = Activation(act)
-
-    def forward(self, x):
-        x = self.conv(x)
-        x = self.bn(x)
-        if self.act is not None:
-            x = self._act(x)
-        return x
-
-
-class LocalizationNetwork(nn.Module):
-    def __init__(self, in_channels, num_fiducial, loc_lr, model_name):
-        super(LocalizationNetwork, self).__init__()
-        self.F = num_fiducial
-        F = num_fiducial
-        if model_name == "large":
-            num_filters_list = [64, 128, 256, 512]
-            fc_dim = 256
-        else:
-            num_filters_list = [16, 32, 64, 128]
-            fc_dim = 64
-
-        # self.block_list = []
-        self.block_list = nn.Sequential()
-        for fno in range(0, len(num_filters_list)):
-            num_filters = num_filters_list[fno]
-            name = "loc_conv%d" % fno
-            # conv = self.add_sublayer(
-            #     name,
-            #     ConvBNLayer(
-            #         in_channels=in_channels,
-            #         out_channels=num_filters,
-            #         kernel_size=3,
-            #         act='relu',
-            #         name=name))
-            conv = ConvBNLayer(
-                    in_channels=in_channels,
-                    out_channels=num_filters,
-                    kernel_size=3,
-                    act='relu',
-                    name=name)
-            # self.block_list.append(conv)
-            self.block_list.add_module(name, conv)
-            if fno == len(num_filters_list) - 1:
-                pool = nn.AdaptiveAvgPool2d(1)
-            else:
-                # pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
-                pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
-            in_channels = num_filters
-            # self.block_list.append(pool)
-            self.block_list.add_module('{}_pool'.format(name), pool)
-        name = "loc_fc1"
-        stdv = 1.0 / math.sqrt(num_filters_list[-1] * 1.0)
-        self.fc1 = nn.Linear(
-            in_channels,
-            fc_dim,
-            bias=True,
-        )
-
-
-        # Init fc2 in LocalizationNetwork
-        initial_bias = self.get_initial_fiducials()
-        initial_bias = initial_bias.reshape(-1)
-        name = "loc_fc2"
-        self.fc2 = nn.Linear(
-            fc_dim,
-            F * 2,
-            bias=True
-        )
-        self.out_channels = F * 2
-
-    def forward(self, x):
-        """
-           Estimating parameters of geometric transformation
-           Args:
-               image: input
-           Return:
-               batch_C_prime: the matrix of the geometric transformation
-        """
-        B = x.shape[0]
-        i = 0
-        for block in self.block_list:
-            x = block(x)
-        x = x.squeeze(dim=2).squeeze(dim=2)
-        x = self.fc1(x)
-
-        x = F.relu(x)
-        x = self.fc2(x)
-        x = x.reshape(shape=[-1, self.F, 2])
-        return x
-
-    def get_initial_fiducials(self):
-        """ see RARE paper Fig. 6 (a) """
-        F = self.F
-        ctrl_pts_x = np.linspace(-1.0, 1.0, int(F / 2))
-        ctrl_pts_y_top = np.linspace(0.0, -1.0, num=int(F / 2))
-        ctrl_pts_y_bottom = np.linspace(1.0, 0.0, num=int(F / 2))
-        ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
-        ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
-        initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
-        return initial_bias
-
-
-class GridGenerator(nn.Module):
-    def __init__(self, in_channels, num_fiducial):
-        super(GridGenerator, self).__init__()
-        self.eps = 1e-6
-        self.F = num_fiducial
-
-        name = "ex_fc"
-        self.fc = nn.Linear(
-            in_channels,
-            6,
-            bias=True
-        )
-
-    def forward(self, batch_C_prime, I_r_size):
-        """
-        Generate the grid for the grid_sampler.
-        Args:
-            batch_C_prime: the matrix of the geometric transformation
-            I_r_size: the shape of the input image
-        Return:
-            batch_P_prime: the grid for the grid_sampler
-        """
-        C = self.build_C_paddle()
-        P = self.build_P_paddle(I_r_size)
-
-        inv_delta_C_tensor = self.build_inv_delta_C_paddle(C).type(torch.float32)
-        P_hat_tensor = self.build_P_hat_paddle(
-            C, torch.as_tensor(P)).type(torch.float32)
-
-        inv_delta_C_tensor.stop_gradient = True
-        P_hat_tensor.stop_gradient = True
-
-        batch_C_ex_part_tensor = self.get_expand_tensor(batch_C_prime)
-
-        batch_C_ex_part_tensor.stop_gradient = True
-
-        batch_C_prime_with_zeros = torch.cat(
-            [batch_C_prime, batch_C_ex_part_tensor], dim=1)
-        inv_delta_C_tensor = inv_delta_C_tensor.to(batch_C_prime_with_zeros.device)
-        batch_T = torch.matmul(inv_delta_C_tensor, batch_C_prime_with_zeros)
-        P_hat_tensor = P_hat_tensor.to(batch_T.device)
-        batch_P_prime = torch.matmul(P_hat_tensor, batch_T)
-        return batch_P_prime
-
-    def build_C_paddle(self):
-        """ Return coordinates of fiducial points in I_r; C """
-        F = self.F
-        ctrl_pts_x = torch.linspace(-1.0, 1.0, int(F / 2), dtype=torch.float64)
-        ctrl_pts_y_top = -1 * torch.ones([int(F / 2)], dtype=torch.float64)
-        ctrl_pts_y_bottom = torch.ones([int(F / 2)], dtype=torch.float64)
-        ctrl_pts_top = torch.stack([ctrl_pts_x, ctrl_pts_y_top], dim=1)
-        ctrl_pts_bottom = torch.stack([ctrl_pts_x, ctrl_pts_y_bottom], dim=1)
-        C = torch.cat([ctrl_pts_top, ctrl_pts_bottom], dim=0)
-        return C  # F x 2
-
-    def build_P_paddle(self, I_r_size):
-        I_r_height, I_r_width = I_r_size
-        I_r_grid_x = (torch.arange(
-            -I_r_width, I_r_width, 2, dtype=torch.float64) + 1.0
-                      ) / torch.as_tensor(np.array([I_r_width]).astype(np.float64))
-
-        I_r_grid_y = (torch.arange(
-            -I_r_height, I_r_height, 2, dtype=torch.float64) + 1.0
-                      ) / torch.as_tensor(np.array([I_r_height]).astype(np.float64))
-
-        # P: self.I_r_width x self.I_r_height x 2
-        P = torch.stack(torch.meshgrid([I_r_grid_x, I_r_grid_y]), dim=2)
-        # P = paddle.transpose(P, perm=[1, 0, 2])
-        P = P.permute(1, 0, 2)
-        # n (= self.I_r_width x self.I_r_height) x 2
-        return P.reshape([-1, 2])
-
-    def build_inv_delta_C_paddle(self, C):
-        """ Return inv_delta_C which is needed to calculate T """
-        F = self.F
-        hat_C = torch.zeros((F, F), dtype=torch.float64)  # F x F
-        for i in range(0, F):
-            for j in range(i, F):
-                if i == j:
-                    hat_C[i, j] = 1
-                else:
-                    r = torch.norm(C[i] - C[j])
-                    hat_C[i, j] = r
-                    hat_C[j, i] = r
-        hat_C = (hat_C**2) * torch.log(hat_C)
-        delta_C = torch.cat(  # F+3 x F+3
-            [
-                torch.cat(
-                    [torch.ones(
-                        (F, 1), dtype=torch.float64), C, hat_C], dim=1),  # F x F+3
-                torch.cat(
-                    [
-                        torch.zeros(
-                            (2, 3), dtype=torch.float64), C.permute(1,0)
-                    ],
-                    dim=1),  # 2 x F+3
-                torch.cat(
-                    [
-                        torch.zeros(
-                            (1, 3), dtype=torch.float64), torch.ones(
-                                (1, F), dtype=torch.float64)
-                    ],
-                    dim=1)  # 1 x F+3
-            ],
-            dim=0)
-        inv_delta_C = torch.inverse(delta_C)
-        return inv_delta_C  # F+3 x F+3
-
-    def build_P_hat_paddle(self, C, P):
-        F = self.F
-        eps = self.eps
-        n = P.shape[0]  # n (= self.I_r_width x self.I_r_height)
-        # P_tile: n x 2 -> n x 1 x 2 -> n x F x 2
-        # P_tile = paddle.tile(paddle.unsqueeze(P, axis=1), (1, F, 1))
-        P_tile = torch.unsqueeze(P, dim=1).repeat(1, F, 1)
-        C_tile = torch.unsqueeze(C, dim=0)  # 1 x F x 2
-        P_diff = P_tile - C_tile  # n x F x 2
-        # rbf_norm: n x F
-        rbf_norm = torch.norm(P_diff, p=2, dim=2, keepdim=False)
-
-        # rbf: n x F
-        # rbf = torch.mul(
-        #     torch.square(rbf_norm), torch.log(rbf_norm + eps))
-        rbf = torch.mul(
-            rbf_norm**2, torch.log(rbf_norm + eps))
-        P_hat = torch.cat(
-            [torch.ones(
-                (n, 1), dtype=torch.float64), P, rbf], dim=1)
-        return P_hat  # n x F+3
-
-    def get_expand_tensor(self, batch_C_prime):
-        B, H, C = batch_C_prime.shape
-        batch_C_prime = batch_C_prime.reshape([B, H * C])
-        batch_C_ex_part_tensor = self.fc(batch_C_prime)
-        batch_C_ex_part_tensor = batch_C_ex_part_tensor.reshape([-1, 3, 2])
-        return batch_C_ex_part_tensor
-
-
-class TPS(nn.Module):
-    def __init__(self, in_channels, num_fiducial, loc_lr, model_name):
-        super(TPS, self).__init__()
-        self.loc_net = LocalizationNetwork(in_channels, num_fiducial, loc_lr,
-                                           model_name)
-        self.grid_generator = GridGenerator(self.loc_net.out_channels,
-                                            num_fiducial)
-        self.out_channels = in_channels
-
-    def forward(self, image):
-        image.stop_gradient = False
-        batch_C_prime = self.loc_net(image)
-        batch_P_prime = self.grid_generator(batch_C_prime, image.shape[2:])
-        batch_P_prime = batch_P_prime.reshape(
-            [-1, image.shape[2], image.shape[3], 2])
-        if torch.__version__ < '1.3.0':
-            batch_I_r = F.grid_sample(image, grid=batch_P_prime)
-        else:
-            batch_I_r = F.grid_sample(image, grid=batch_P_prime, align_corners=True)
-        return batch_I_r
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tps_spatial_transformer.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tps_spatial_transformer.py
-"""
-This code is refer from:
-https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/tps_spatial_transformer.py
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import torch
-from torch import nn
-from torch.nn import functional as F
-import numpy as np
-import itertools
-
-
-def grid_sample(input, grid, canvas=None):
-    input.stop_gradient = False
-    output = F.grid_sample(input, grid, align_corners=True) if torch.__version__ >= '1.3.0' else F.grid_sample(input, grid)
-    if canvas is None:
-        return output
-    else:
-        # input_mask = paddle.ones(shape=input.shape)
-        input_mask = input.data.new(input.size()).fill_(1)
-        output_mask = F.grid_sample(input_mask, grid)
-        padded_output = output * output_mask + canvas * (1 - output_mask)
-        return padded_output
-
-
-# phi(x1, x2) = r^2 * log(r), where r = ||x1 - x2||_2
-def compute_partial_repr(input_points, control_points):
-    N = input_points.shape[0]
-    M = control_points.shape[0]
-    # pairwise_diff = input_points.view(N, 1, 2) - control_points.view(1, M, 2)
-    pairwise_diff = torch.reshape(
-        input_points, shape=[N, 1, 2]) - torch.reshape(
-            control_points, shape=[1, M, 2])
-    # original implementation, very slow
-    # pairwise_dist = torch.sum(pairwise_diff ** 2, dim = 2) # square of distance
-    pairwise_diff_square = pairwise_diff * pairwise_diff
-    pairwise_dist = pairwise_diff_square[:, :, 0] + pairwise_diff_square[:, :, 1]
-    repr_matrix = 0.5 * pairwise_dist * torch.log(pairwise_dist)
-    # fix numerical error for 0 * log(0), substitute all nan with 0
-    # mask = np.array(repr_matrix != repr_matrix)
-    # repr_matrix[mask] = 0
-    mask = repr_matrix != repr_matrix
-    repr_matrix.masked_fill_(mask, 0)
-    return repr_matrix
-
-
-# output_ctrl_pts are specified, according to our task.
-def build_output_control_points(num_control_points, margins):
-    margin_x, margin_y = margins
-    num_ctrl_pts_per_side = num_control_points // 2
-    ctrl_pts_x = np.linspace(margin_x, 1.0 - margin_x, num_ctrl_pts_per_side)
-    ctrl_pts_y_top = np.ones(num_ctrl_pts_per_side) * margin_y
-    ctrl_pts_y_bottom = np.ones(num_ctrl_pts_per_side) * (1.0 - margin_y)
-    ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
-    ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
-    output_ctrl_pts_arr = np.concatenate(
-        [ctrl_pts_top, ctrl_pts_bottom], axis=0)
-    output_ctrl_pts = torch.Tensor(output_ctrl_pts_arr)
-    return output_ctrl_pts
-
-
-class TPSSpatialTransformer(nn.Module):
-    def __init__(self,
-                 output_image_size=None,
-                 num_control_points=None,
-                 margins=None):
-        super(TPSSpatialTransformer, self).__init__()
-        self.output_image_size = output_image_size
-        self.num_control_points = num_control_points
-        self.margins = margins
-
-        self.target_height, self.target_width = output_image_size
-        target_control_points = build_output_control_points(num_control_points,
-                                                            margins)
-        N = num_control_points
-
-        # create padded kernel matrix
-        forward_kernel = torch.zeros(N + 3, N + 3)
-        target_control_partial_repr = compute_partial_repr(target_control_points, target_control_points)
-        forward_kernel[:N, :N].copy_(target_control_partial_repr)
-        forward_kernel[:N, -3].fill_(1)
-        forward_kernel[-3, :N].fill_(1)
-        forward_kernel[:N, -2:].copy_(target_control_points)
-        forward_kernel[-2:, :N].copy_(target_control_points.transpose(0, 1))
-        # compute inverse matrix
-        inverse_kernel = torch.inverse(forward_kernel)
-
-        # create target cordinate matrix
-        HW = self.target_height * self.target_width
-        target_coordinate = list(
-            itertools.product(
-                range(self.target_height), range(self.target_width)))
-        target_coordinate = torch.Tensor(target_coordinate)  # HW x 2
-        Y, X = target_coordinate.split(1, dim = 1)
-        Y = Y / (self.target_height - 1)
-        X = X / (self.target_width - 1)
-        target_coordinate = torch.cat([X, Y], dim = 1) # convert from (y, x) to (x, y)
-        target_coordinate_partial_repr = compute_partial_repr(
-            target_coordinate, target_control_points)
-        target_coordinate_repr = torch.cat(
-            [
-                target_coordinate_partial_repr,
-                torch.ones(HW, 1),
-                target_coordinate
-            ],
-            dim=1)
-
-        # register precomputed matrices
-        self.inverse_kernel = inverse_kernel
-        self.padding_matrix = torch.zeros(3, 2)
-        self.target_coordinate_repr = target_coordinate_repr
-        self.target_control_points = target_control_points
-
-    def forward(self, input, source_control_points):
-        assert source_control_points.ndimension() == 3
-        assert source_control_points.shape[1] == self.num_control_points
-        assert source_control_points.shape[2] == 2
-        batch_size = source_control_points.size(0)
-
-        Y = torch.cat([source_control_points, self.padding_matrix.expand(batch_size, 3, 2)], 1)
-        mapping_matrix = torch.matmul(self.inverse_kernel, Y)
-        source_coordinate = torch.matmul(self.target_coordinate_repr, mapping_matrix)
-
-        # grid = source_coordinate.view(-1, self.target_height, self.target_width, 2)
-        grid = torch.reshape(
-            source_coordinate,
-            shape=[-1, self.target_height, self.target_width, 2])
-        grid = torch.clamp(grid, 0, 1)  # the source_control_points may be out of [0, 1].
-        # the input to grid_sample is normalized [-1, 1], but what we get is [0, 1]
-        grid = 2.0 * grid - 1.0
-        output_maps = grid_sample(input, grid, canvas=None)
-        return output_maps, source_coordinate
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tsrn.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tsrn.py
-# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This code is refer from:
-https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/model/tsrn.py
-"""
-
-import math
-import torch
-import torch.nn.functional as F
-from torch import nn
-from collections import OrderedDict
-import sys
-import numpy as np
-import warnings
-import math, copy
-import cv2
-
-warnings.filterwarnings("ignore")
-
-from .tps_spatial_transformer import TPSSpatialTransformer
-from .stn import STN as STN_model
-from pytorchocr.modeling.heads.sr_rensnet_transformer import Transformer
-
-
-class TSRN(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 scale_factor=2,
-                 width=128,
-                 height=32,
-                 STN=False,
-                 srb_nums=5,
-                 mask=False,
-                 hidden_units=32,
-                 infer_mode=False,
-                 **kwargs):
-        super(TSRN, self).__init__()
-        in_planes = 3
-        if mask:
-            in_planes = 4
-        assert math.log(scale_factor, 2) % 1 == 0
-        upsample_block_num = int(math.log(scale_factor, 2))
-        self.block1 = nn.Sequential(
-            nn.Conv2d(
-                in_planes, 2 * hidden_units, kernel_size=9, padding=4),
-            nn.PReLU())
-        self.srb_nums = srb_nums
-        for i in range(srb_nums):
-            setattr(self, 'block%d' % (i + 2),
-                    RecurrentResidualBlock(2 * hidden_units))
-
-        setattr(
-            self,
-            'block%d' % (srb_nums + 2),
-            nn.Sequential(
-                nn.Conv2d(
-                    2 * hidden_units,
-                    2 * hidden_units,
-                    kernel_size=3,
-                    padding=1),
-                nn.BatchNorm2d(2 * hidden_units)))
-
-        block_ = [
-            UpsampleBLock(2 * hidden_units, 2)
-            for _ in range(upsample_block_num)
-        ]
-        block_.append(
-            nn.Conv2d(2 * hidden_units, in_planes, kernel_size=9, padding=4)
-        )
-        setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(*block_))
-        self.tps_inputsize = [height // scale_factor, width // scale_factor]
-        tps_outputsize = [height // scale_factor, width // scale_factor]
-        num_control_points = 20
-        tps_margins = [0.05, 0.05]
-        self.stn = STN
-        if self.stn:
-            self.tps = TPSSpatialTransformer(
-                output_image_size=tuple(tps_outputsize),
-                num_control_points=num_control_points,
-                margins=tuple(tps_margins))
-
-            self.stn_head = STN_model(
-                in_channels=in_planes,
-                num_ctrlpoints=num_control_points,
-                activation='none')
-        self.out_channels = in_channels
-
-        self.r34_transformer = Transformer()
-        for param in self.r34_transformer.parameters():
-            param.trainable = False
-        self.infer_mode = infer_mode
-
-    def forward(self, x):
-        output = {}
-        if self.infer_mode:
-            output["lr_img"] = x
-            y = x
-        else:
-            output["lr_img"] = x[0]
-            output["hr_img"] = x[1]
-            y = x[0]
-        if self.stn and self.training:
-            _, ctrl_points_x = self.stn_head(y)
-            y, _ = self.tps(y, ctrl_points_x)
-        block = {'1': self.block1(y)}
-        for i in range(self.srb_nums + 1):
-            block[str(i + 2)] = getattr(self,
-                                        'block%d' % (i + 2))(block[str(i + 1)])
-
-        block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3)) \
-            ((block['1'] + block[str(self.srb_nums + 2)]))
-
-        sr_img = torch.tanh(block[str(self.srb_nums + 3)])
-
-        output["sr_img"] = sr_img
-
-        if self.training:
-            hr_img = x[1]
-            length = x[2]
-            input_tensor = x[3]
-
-            # add transformer 
-            sr_pred, word_attention_map_pred, _ = self.r34_transformer(
-                sr_img, length, input_tensor)
-
-            hr_pred, word_attention_map_gt, _ = self.r34_transformer(
-                hr_img, length, input_tensor)
-
-            output["hr_img"] = hr_img
-            output["hr_pred"] = hr_pred
-            output["word_attention_map_gt"] = word_attention_map_gt
-            output["sr_pred"] = sr_pred
-            output["word_attention_map_pred"] = word_attention_map_pred
-
-        return output
-
-
-class RecurrentResidualBlock(nn.Module):
-    def __init__(self, channels):
-        super(RecurrentResidualBlock, self).__init__()
-        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
-        self.bn1 = nn.BatchNorm2d(channels)
-        self.gru1 = GruBlock(channels, channels)
-        self.prelu = mish()
-        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
-        self.bn2 = nn.BatchNorm2d(channels)
-        self.gru2 = GruBlock(channels, channels)
-
-    def forward(self, x):
-        residual = self.conv1(x)
-        residual = self.bn1(residual)
-        residual = self.prelu(residual)
-        residual = self.conv2(residual)
-        residual = self.bn2(residual)
-        residual = self.gru1(residual.permute(0, 1, 3, 2).contiguous()).permute(0, 1, 3, 2).contiguous()
-
-        return self.gru2(x + residual).contiguous()
-
-
-class UpsampleBLock(nn.Module):
-    def __init__(self, in_channels, up_scale):
-        super(UpsampleBLock, self).__init__()
-        self.conv = nn.Conv2d(
-            in_channels, in_channels * up_scale**2, kernel_size=3, padding=1)
-
-        self.pixel_shuffle = nn.PixelShuffle(up_scale)
-        self.prelu = mish()
-
-    def forward(self, x):
-        x = self.conv(x)
-        x = self.pixel_shuffle(x)
-        x = self.prelu(x)
-        return x
-
-
-class mish(nn.Module):
-    def __init__(self, ):
-        super(mish, self).__init__()
-        self.activated = True
-
-    def forward(self, x):
-        if self.activated:
-            x = x * (torch.tanh(F.softplus(x)))
-        return x
-
-
-class GruBlock(nn.Module):
-    def __init__(self, in_channels, out_channels):
-        super(GruBlock, self).__init__()
-        assert out_channels % 2 == 0
-        self.conv1 = nn.Conv2d(
-            in_channels, out_channels, kernel_size=1, padding=0)
-        self.gru = nn.GRU(out_channels,
-                          out_channels // 2,
-                          bidirectional=True,
-                          batch_first=True,
-                          )
-
-    def forward(self, x):
-        # x: b, c, w, h
-        x = self.conv1(x)
-        x = x.permute(0, 2, 3, 1).contiguous()  # b, w, h, c
-        batch_size, w, h, c = x.size()
-        x = x.view(batch_size * w, h, c)  # b*w, h, c
-        x, _ = self.gru(x)
-        x = x.view(batch_size, w, h, c)
-        x = x.permute(0, 3, 1, 2).contiguous()
-        return x
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py
@@ -11,26 +11,18 @@ __all__ = ['build_post_process']

 def build_post_process(config, global_config=None):
    from .db_postprocess import DBPostProcess
-    from .east_postprocess import EASTPostProcess
-    from .sast_postprocess import SASTPostProcess
-    from .fce_postprocess import FCEPostProcess
    from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, TableLabelDecode, \
        NRTRLabelDecode, SARLabelDecode, ViTSTRLabelDecode, RFLLabelDecode
    from .cls_postprocess import ClsPostProcess
-    from .pg_postprocess import PGPostProcess
    from .rec_postprocess import CANLabelDecode

    support_dict = [
-        'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode',
-        'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode', 'PGPostProcess',
-        'TableLabelDecode', 'NRTRLabelDecode', 'SARLabelDecode', 'FCEPostProcess',
+        'DBPostProcess', 'CTCLabelDecode',
+        'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode',
+        'TableLabelDecode', 'NRTRLabelDecode', 'SARLabelDecode',
        'ViTSTRLabelDecode','CANLabelDecode', 'RFLLabelDecode'
    ]

-    if config['name'] == 'PSEPostProcess':
-        from .pse_postprocess import PSEPostProcess
-        support_dict.append('PSEPostProcess')
-
    config = copy.deepcopy(config)
    module_name = config.pop('name')
    if global_config is not None:

--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/east_postprocess.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/east_postprocess.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from .locality_aware_nms import nms_locality
-import cv2
-# import paddle
-import torch
-
-import os
-import sys
-
-
-class EASTPostProcess(object):
-    """
-    The post process for EAST.
-    """
-
-    def __init__(self,
-                 score_thresh=0.8,
-                 cover_thresh=0.1,
-                 nms_thresh=0.2,
-                 **kwargs):
-
-        self.score_thresh = score_thresh
-        self.cover_thresh = cover_thresh
-        self.nms_thresh = nms_thresh
-
-        # c++ la-nms is faster, but only support python 3.5
-        self.is_python35 = False
-        if sys.version_info.major == 3 and sys.version_info.minor == 5:
-            self.is_python35 = True
-
-    def restore_rectangle_quad(self, origin, geometry):
-        """
-        Restore rectangle from quadrangle.
-        """
-        # quad
-        origin_concat = np.concatenate(
-            (origin, origin, origin, origin), axis=1)  # (n, 8)
-        pred_quads = origin_concat - geometry
-        pred_quads = pred_quads.reshape((-1, 4, 2))  # (n, 4, 2)
-        return pred_quads
-
-    def detect(self,
-               score_map,
-               geo_map,
-               score_thresh=0.8,
-               cover_thresh=0.1,
-               nms_thresh=0.2):
-        """
-        restore text boxes from score map and geo map
-        """
-        score_map = score_map[0]
-        geo_map = np.swapaxes(geo_map, 1, 0)
-        geo_map = np.swapaxes(geo_map, 1, 2)
-        # filter the score map
-        xy_text = np.argwhere(score_map > score_thresh)
-        if len(xy_text) == 0:
-            return []
-        # sort the text boxes via the y axis
-        xy_text = xy_text[np.argsort(xy_text[:, 0])]
-        # restore quad proposals
-        text_box_restored = self.restore_rectangle_quad(
-            xy_text[:, ::-1] * 4, geo_map[xy_text[:, 0], xy_text[:, 1], :])
-        boxes = np.zeros((text_box_restored.shape[0], 9), dtype=np.float32)
-        boxes[:, :8] = text_box_restored.reshape((-1, 8))
-        boxes[:, 8] = score_map[xy_text[:, 0], xy_text[:, 1]]
-        if self.is_python35:
-            import lanms
-            boxes = lanms.merge_quadrangle_n9(boxes, nms_thresh)
-        else:
-            boxes = nms_locality(boxes.astype(np.float64), nms_thresh)
-        if boxes.shape[0] == 0:
-            return []
-        # Here we filter some low score boxes by the average score map,
-        #   this is different from the orginal paper.
-        for i, box in enumerate(boxes):
-            mask = np.zeros_like(score_map, dtype=np.uint8)
-            cv2.fillPoly(mask, box[:8].reshape(
-                (-1, 4, 2)).astype(np.int32) // 4, 1)
-            boxes[i, 8] = cv2.mean(score_map, mask)[0]
-        boxes = boxes[boxes[:, 8] > cover_thresh]
-        return boxes
-
-    def sort_poly(self, p):
-        """
-        Sort polygons.
-        """
-        min_axis = np.argmin(np.sum(p, axis=1))
-        p = p[[min_axis, (min_axis + 1) % 4, \
-               (min_axis + 2) % 4, (min_axis + 3) % 4]]
-        if abs(p[0, 0] - p[1, 0]) > abs(p[0, 1] - p[1, 1]):
-            return p
-        else:
-            return p[[0, 3, 2, 1]]
-
-    def __call__(self, outs_dict, shape_list):
-        score_list = outs_dict['f_score']
-        geo_list = outs_dict['f_geo']
-        if isinstance(score_list, torch.Tensor):
-            score_list = score_list.cpu().numpy()
-            geo_list = geo_list.cpu().numpy()
-        img_num = len(shape_list)
-        dt_boxes_list = []
-        for ino in range(img_num):
-            score = score_list[ino]
-            geo = geo_list[ino]
-            boxes = self.detect(
-                score_map=score,
-                geo_map=geo,
-                score_thresh=self.score_thresh,
-                cover_thresh=self.cover_thresh,
-                nms_thresh=self.nms_thresh)
-            boxes_norm = []
-            if len(boxes) > 0:
-                h, w = score.shape[1:]
-                src_h, src_w, ratio_h, ratio_w = shape_list[ino]
-                boxes = boxes[:, :8].reshape((-1, 4, 2))
-                boxes[:, :, 0] /= ratio_w
-                boxes[:, :, 1] /= ratio_h
-                for i_box, box in enumerate(boxes):
-                    box = self.sort_poly(box.astype(np.int32))
-                    if np.linalg.norm(box[0] - box[1]) < 5 \
-                            or np.linalg.norm(box[3] - box[0]) < 5:
-                        continue
-                    boxes_norm.append(box)
-            dt_boxes_list.append({'points': np.array(boxes_norm)})
-        return dt_boxes_list
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/fce_postprocess.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/fce_postprocess.py
-"""
-This code is refer from:
-https://github.com/open-mmlab/mmocr/blob/v0.3.0/mmocr/models/textdet/postprocess/wrapper.py
-"""
-
-import cv2
-import torch
-import numpy as np
-from numpy.fft import ifft
-from pytorchocr.utils.poly_nms import poly_nms, valid_boundary
-
-
-def fill_hole(input_mask):
-    h, w = input_mask.shape
-    canvas = np.zeros((h + 2, w + 2), np.uint8)
-    canvas[1:h + 1, 1:w + 1] = input_mask.copy()
-
-    mask = np.zeros((h + 4, w + 4), np.uint8)
-
-    cv2.floodFill(canvas, mask, (0, 0), 1)
-    canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool)
-
-    return ~canvas | input_mask
-
-
-def fourier2poly(fourier_coeff, num_reconstr_points=50):
-    """ Inverse Fourier transform
-        Args:
-            fourier_coeff (ndarray): Fourier coefficients shaped (n, 2k+1),
-                with n and k being candidates number and Fourier degree
-                respectively.
-            num_reconstr_points (int): Number of reconstructed polygon points.
-        Returns:
-            Polygons (ndarray): The reconstructed polygons shaped (n, n')
-        """
-
-    a = np.zeros((len(fourier_coeff), num_reconstr_points), dtype='complex')
-    k = (len(fourier_coeff[0]) - 1) // 2
-
-    a[:, 0:k + 1] = fourier_coeff[:, k:]
-    a[:, -k:] = fourier_coeff[:, :k]
-
-    poly_complex = ifft(a) * num_reconstr_points
-    polygon = np.zeros((len(fourier_coeff), num_reconstr_points, 2))
-    polygon[:, :, 0] = poly_complex.real
-    polygon[:, :, 1] = poly_complex.imag
-    return polygon.astype('int32').reshape((len(fourier_coeff), -1))
-
-
-class FCEPostProcess(object):
-    """
-    The post process for FCENet.
-    """
-
-    def __init__(self,
-                 scales,
-                 fourier_degree=5,
-                 num_reconstr_points=50,
-                 decoding_type='fcenet',
-                 score_thr=0.3,
-                 nms_thr=0.1,
-                 alpha=1.0,
-                 beta=1.0,
-                 box_type='poly',
-                 **kwargs):
-
-        self.scales = scales
-        self.fourier_degree = fourier_degree
-        self.num_reconstr_points = num_reconstr_points
-        self.decoding_type = decoding_type
-        self.score_thr = score_thr
-        self.nms_thr = nms_thr
-        self.alpha = alpha
-        self.beta = beta
-        self.box_type = box_type
-
-    def __call__(self, preds, shape_list):
-        score_maps = []
-        for key, value in preds.items():
-            if isinstance(value, torch.Tensor):
-                value = value.numpy()
-            cls_res = value[:, :4, :, :]
-            reg_res = value[:, 4:, :, :]
-            score_maps.append([cls_res, reg_res])
-
-        return self.get_boundary(score_maps, shape_list)
-
-    def resize_boundary(self, boundaries, scale_factor):
-        """Rescale boundaries via scale_factor.
-
-        Args:
-            boundaries (list[list[float]]): The boundary list. Each boundary
-            with size 2k+1 with k>=4.
-            scale_factor(ndarray): The scale factor of size (4,).
-
-        Returns:
-            boundaries (list[list[float]]): The scaled boundaries.
-        """
-        boxes = []
-        scores = []
-        for b in boundaries:
-            sz = len(b)
-            valid_boundary(b, True)
-            scores.append(b[-1])
-            b = (np.array(b[:sz - 1]) *
-                 (np.tile(scale_factor[:2], int(
-                     (sz - 1) / 2)).reshape(1, sz - 1))).flatten().tolist()
-            boxes.append(np.array(b).reshape([-1, 2]))
-
-        return np.array(boxes, dtype=np.float32), scores
-
-    def get_boundary(self, score_maps, shape_list):
-        assert len(score_maps) == len(self.scales)
-        boundaries = []
-        for idx, score_map in enumerate(score_maps):
-            scale = self.scales[idx]
-            boundaries = boundaries + self._get_boundary_single(score_map,
-                                                                scale)
-
-        # nms
-        boundaries = poly_nms(boundaries, self.nms_thr)
-        boundaries, scores = self.resize_boundary(
-            boundaries, (1 / shape_list[0, 2:]).tolist()[::-1])
-
-        boxes_batch = [dict(points=boundaries, scores=scores)]
-        return boxes_batch
-
-    def _get_boundary_single(self, score_map, scale):
-        assert len(score_map) == 2
-        assert score_map[1].shape[1] == 4 * self.fourier_degree + 2
-
-        return self.fcenet_decode(
-            preds=score_map,
-            fourier_degree=self.fourier_degree,
-            num_reconstr_points=self.num_reconstr_points,
-            scale=scale,
-            alpha=self.alpha,
-            beta=self.beta,
-            box_type=self.box_type,
-            score_thr=self.score_thr,
-            nms_thr=self.nms_thr)
-
-    def fcenet_decode(self,
-                      preds,
-                      fourier_degree,
-                      num_reconstr_points,
-                      scale,
-                      alpha=1.0,
-                      beta=2.0,
-                      box_type='poly',
-                      score_thr=0.3,
-                      nms_thr=0.1):
-        """Decoding predictions of FCENet to instances.
-
-        Args:
-            preds (list(Tensor)): The head output tensors.
-            fourier_degree (int): The maximum Fourier transform degree k.
-            num_reconstr_points (int): The points number of the polygon
-                reconstructed from predicted Fourier coefficients.
-            scale (int): The down-sample scale of the prediction.
-            alpha (float) : The parameter to calculate final scores. Score_{final}
-                    = (Score_{text region} ^ alpha)
-                    * (Score_{text center region}^ beta)
-            beta (float) : The parameter to calculate final score.
-            box_type (str):  Boundary encoding type 'poly' or 'quad'.
-            score_thr (float) : The threshold used to filter out the final
-                candidates.
-            nms_thr (float) :  The threshold of nms.
-
-        Returns:
-            boundaries (list[list[float]]): The instance boundary and confidence
-                list.
-        """
-        assert isinstance(preds, list)
-        assert len(preds) == 2
-        assert box_type in ['poly', 'quad']
-
-        cls_pred = preds[0][0]
-        tr_pred = cls_pred[0:2]
-        tcl_pred = cls_pred[2:]
-
-        reg_pred = preds[1][0].transpose([1, 2, 0])
-        x_pred = reg_pred[:, :, :2 * fourier_degree + 1]
-        y_pred = reg_pred[:, :, 2 * fourier_degree + 1:]
-
-        score_pred = (tr_pred[1]**alpha) * (tcl_pred[1]**beta)
-        tr_pred_mask = (score_pred) > score_thr
-        tr_mask = fill_hole(tr_pred_mask)
-
-        tr_contours, _ = cv2.findContours(
-            tr_mask.astype(np.uint8), cv2.RETR_TREE,
-            cv2.CHAIN_APPROX_SIMPLE)  # opencv4
-
-        mask = np.zeros_like(tr_mask)
-        boundaries = []
-        for cont in tr_contours:
-            deal_map = mask.copy().astype(np.int8)
-            cv2.drawContours(deal_map, [cont], -1, 1, -1)
-
-            score_map = score_pred * deal_map
-            score_mask = score_map > 0
-            xy_text = np.argwhere(score_mask)
-            dxy = xy_text[:, 1] + xy_text[:, 0] * 1j
-
-            x, y = x_pred[score_mask], y_pred[score_mask]
-            c = x + y * 1j
-            c[:, fourier_degree] = c[:, fourier_degree] + dxy
-            c *= scale
-
-            polygons = fourier2poly(c, num_reconstr_points)
-            score = score_map[score_mask].reshape(-1, 1)
-            polygons = poly_nms(np.hstack((polygons, score)).tolist(), nms_thr)
-
-            boundaries = boundaries + polygons
-
-        boundaries = poly_nms(boundaries, nms_thr)
-
-        if box_type == 'quad':
-            new_boundaries = []
-            for boundary in boundaries:
-                poly = np.array(boundary[:-1]).reshape(-1, 2).astype(np.float32)
-                score = boundary[-1]
-                points = cv2.boxPoints(cv2.minAreaRect(poly))
-                points = np.int0(points)
-                new_boundaries.append(points.reshape(-1).tolist() + [score])
-                boundaries = new_boundaries
-
-        return boundaries
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/locality_aware_nms.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/locality_aware_nms.py
-"""
-Locality aware nms.
-"""
-
-import numpy as np
-from shapely.geometry import Polygon
-
-
-def intersection(g, p):
-    """
-    Intersection.
-    """
-    g = Polygon(g[:8].reshape((4, 2)))
-    p = Polygon(p[:8].reshape((4, 2)))
-    g = g.buffer(0)
-    p = p.buffer(0)
-    if not g.is_valid or not p.is_valid:
-        return 0
-    inter = Polygon(g).intersection(Polygon(p)).area
-    union = g.area + p.area - inter
-    if union == 0:
-        return 0
-    else:
-        return inter / union
-
-
-def intersection_iog(g, p):
-    """
-    Intersection_iog.
-    """
-    g = Polygon(g[:8].reshape((4, 2)))
-    p = Polygon(p[:8].reshape((4, 2)))
-    if not g.is_valid or not p.is_valid:
-        return 0
-    inter = Polygon(g).intersection(Polygon(p)).area
-    #union = g.area + p.area - inter
-    union = p.area
-    if union == 0:
-        print("p_area is very small")
-        return 0
-    else:
-        return inter / union
-
-
-def weighted_merge(g, p):
-    """
-    Weighted merge.
-    """
-    g[:8] = (g[8] * g[:8] + p[8] * p[:8]) / (g[8] + p[8])
-    g[8] = (g[8] + p[8])
-    return g
-
-
-def standard_nms(S, thres):
-    """
-    Standard nms.
-    """
-    order = np.argsort(S[:, 8])[::-1]
-    keep = []
-    while order.size > 0:
-        i = order[0]
-        keep.append(i)
-        ovr = np.array([intersection(S[i], S[t]) for t in order[1:]])
-
-        inds = np.where(ovr <= thres)[0]
-        order = order[inds + 1]
-
-    return S[keep]
-
-
-def standard_nms_inds(S, thres):
-    """
-    Standard nms, retun inds.
-    """
-    order = np.argsort(S[:, 8])[::-1]
-    keep = []
-    while order.size > 0:
-        i = order[0]
-        keep.append(i)
-        ovr = np.array([intersection(S[i], S[t]) for t in order[1:]])
-
-        inds = np.where(ovr <= thres)[0]
-        order = order[inds + 1]
-
-    return keep
-
-
-def nms(S, thres):
-    """
-    nms.
-    """
-    order = np.argsort(S[:, 8])[::-1]
-    keep = []
-    while order.size > 0:
-        i = order[0]
-        keep.append(i)
-        ovr = np.array([intersection(S[i], S[t]) for t in order[1:]])
-
-        inds = np.where(ovr <= thres)[0]
-        order = order[inds + 1]
-
-    return keep
-
-
-def soft_nms(boxes_in, Nt_thres=0.3, threshold=0.8, sigma=0.5, method=2):
-    """
-    soft_nms
-    :para boxes_in, N x 9 (coords + score)
-    :para threshould, eliminate cases min score(0.001)
-    :para Nt_thres, iou_threshi
-    :para sigma, gaussian weght
-    :method, linear or gaussian
-    """
-    boxes = boxes_in.copy()
-    N = boxes.shape[0]
-    if N is None or N < 1:
-        return np.array([])
-    pos, maxpos = 0, 0
-    weight = 0.0
-    inds = np.arange(N)
-    tbox, sbox = boxes[0].copy(), boxes[0].copy()
-    for i in range(N):
-        maxscore = boxes[i, 8]
-        maxpos = i
-        tbox = boxes[i].copy()
-        ti = inds[i]
-        pos = i + 1
-        #get max box
-        while pos < N:
-            if maxscore < boxes[pos, 8]:
-                maxscore = boxes[pos, 8]
-                maxpos = pos
-            pos = pos + 1
-        #add max box as a detection
-        boxes[i, :] = boxes[maxpos, :]
-        inds[i] = inds[maxpos]
-        #swap
-        boxes[maxpos, :] = tbox
-        inds[maxpos] = ti
-        tbox = boxes[i].copy()
-        pos = i + 1
-        #NMS iteration
-        while pos < N:
-            sbox = boxes[pos].copy()
-            ts_iou_val = intersection(tbox, sbox)
-            if ts_iou_val > 0:
-                if method == 1:
-                    if ts_iou_val > Nt_thres:
-                        weight = 1 - ts_iou_val
-                    else:
-                        weight = 1
-                elif method == 2:
-                    weight = np.exp(-1.0 * ts_iou_val**2 / sigma)
-                else:
-                    if ts_iou_val > Nt_thres:
-                        weight = 0
-                    else:
-                        weight = 1
-                boxes[pos, 8] = weight * boxes[pos, 8]
-                #if box score falls below thresold, discard the box by
-                #swaping last box update N
-                if boxes[pos, 8] < threshold:
-                    boxes[pos, :] = boxes[N - 1, :]
-                    inds[pos] = inds[N - 1]
-                    N = N - 1
-                    pos = pos - 1
-            pos = pos + 1
-
-    return boxes[:N]
-
-
-def nms_locality(polys, thres=0.3):
-    """
-    locality aware nms of EAST
-    :param polys: a N*9 numpy array. first 8 coordinates, then prob
-    :return: boxes after nms
-    """
-    S = []
-    p = None
-    for g in polys:
-        if p is not None and intersection(g, p) > thres:
-            p = weighted_merge(g, p)
-        else:
-            if p is not None:
-                S.append(p)
-            p = g
-    if p is not None:
-        S.append(p)
-
-    if len(S) == 0:
-        return np.array([])
-    return standard_nms(np.array(S), thres)
-
-
-if __name__ == '__main__':
-    # 343,350,448,135,474,143,369,359
-    print(
-        Polygon(np.array([[343, 350], [448, 135], [474, 143], [369, 359]]))
-        .area)
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pg_postprocess.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pg_postprocess.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import sys
-
-__dir__ = os.path.dirname(__file__)
-sys.path.append(__dir__)
-sys.path.append(os.path.join(__dir__, '..'))
-from pytorchocr.utils.e2e_utils.pgnet_pp_utils import PGNet_PostProcess
-
-
-class PGPostProcess(object):
-    """
-    The post process for PGNet.
-    """
-
-    def __init__(self, character_dict_path, valid_set, score_thresh, mode,
-                 **kwargs):
-        self.character_dict_path = character_dict_path
-        self.valid_set = valid_set
-        self.score_thresh = score_thresh
-        self.mode = mode
-
-        # c++ la-nms is faster, but only support python 3.5
-        self.is_python35 = False
-        if sys.version_info.major == 3 and sys.version_info.minor == 5:
-            self.is_python35 = True
-
-    def __call__(self, outs_dict, shape_list):
-        post = PGNet_PostProcess(self.character_dict_path, self.valid_set,
-                                 self.score_thresh, outs_dict, shape_list)
-        if self.mode == 'fast':
-            data = post.pg_postprocess_fast()
-        else:
-            data = post.pg_postprocess_slow()
-        return data
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/__init__.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/__init__.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .pse_postprocess import PSEPostProcess
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/README.md
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/README.md
-## 编译
-This code is refer from:
-https://github.com/whai362/PSENet/blob/python3/models/post_processing/pse
-```python
-python3 setup.py build_ext --inplace
-```
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/__init__.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/__init__.py
-# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-import os
-import subprocess
-
-python_path = sys.executable
-
-ori_path = os.getcwd()
-os.chdir('pytorchocr/postprocess/pse_postprocess/pse')
-if subprocess.call(
-        '{} setup.py build_ext --inplace'.format(python_path), shell=True) != 0:
-    raise RuntimeError(
-        'Cannot compile pse: {}, if your system is windows, you need to install all the default components of `desktop development using C++` in visual studio 2019+'.
-        format(os.path.dirname(os.path.realpath(__file__))))
-os.chdir(ori_path)
-
-from .pse import pse
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/pse.pyx
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/pse.pyx
-
-import numpy as np
-import cv2
-cimport numpy as np
-cimport cython
-cimport libcpp
-cimport libcpp.pair
-cimport libcpp.queue
-from libcpp.pair cimport *
-from libcpp.queue  cimport *
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-cdef np.ndarray[np.int32_t, ndim=2] _pse(np.ndarray[np.uint8_t, ndim=3] kernels,
-                                         np.ndarray[np.int32_t, ndim=2] label,
-                                         int kernel_num,
-                                         int label_num,
-                                         float min_area=0):
-    cdef np.ndarray[np.int32_t, ndim=2] pred
-    pred = np.zeros((label.shape[0], label.shape[1]), dtype=np.int32)
-
-    for label_idx in range(1, label_num):
-        if np.sum(label == label_idx) < min_area:
-            label[label == label_idx] = 0
-
-    cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] que = \
-        queue[libcpp.pair.pair[np.int16_t,np.int16_t]]()
-    cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] nxt_que = \
-        queue[libcpp.pair.pair[np.int16_t,np.int16_t]]()
-    cdef np.int16_t* dx = [-1, 1, 0, 0]
-    cdef np.int16_t* dy = [0, 0, -1, 1]
-    cdef np.int16_t tmpx, tmpy
-
-    points = np.array(np.where(label > 0)).transpose((1, 0))
-    for point_idx in range(points.shape[0]):
-        tmpx, tmpy = points[point_idx, 0], points[point_idx, 1]
-        que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy))
-        pred[tmpx, tmpy] = label[tmpx, tmpy]
-
-    cdef libcpp.pair.pair[np.int16_t,np.int16_t] cur
-    cdef int cur_label
-    for kernel_idx in range(kernel_num - 1, -1, -1):
-        while not que.empty():
-            cur = que.front()
-            que.pop()
-            cur_label = pred[cur.first, cur.second]
-
-            is_edge = True
-            for j in range(4):
-                tmpx = cur.first + dx[j]
-                tmpy = cur.second + dy[j]
-                if tmpx < 0 or tmpx >= label.shape[0] or tmpy < 0 or tmpy >= label.shape[1]:
-                    continue
-                if kernels[kernel_idx, tmpx, tmpy] == 0 or pred[tmpx, tmpy] > 0:
-                    continue
-
-                que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy))
-                pred[tmpx, tmpy] = cur_label
-                is_edge = False
-            if is_edge:
-                nxt_que.push(cur)
-
-        que, nxt_que = nxt_que, que
-
-    return pred
-
-def pse(kernels, min_area):
-    kernel_num = kernels.shape[0]
-    label_num, label = cv2.connectedComponents(kernels[-1], connectivity=4)
-    return _pse(kernels[:-1], label, kernel_num, label_num, min_area)
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/setup.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/setup.py
-from distutils.core import setup, Extension
-from Cython.Build import cythonize
-import numpy
-
-setup(ext_modules=cythonize(Extension(
-    'pse',
-    sources=['pse.pyx'],
-    language='c++',
-    include_dirs=[numpy.get_include()],
-    library_dirs=[],
-    libraries=[],
-    extra_compile_args=['-O3'],
-    extra_link_args=[]
-)))
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse_postprocess.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse_postprocess.py
-"""
-This code is refer from:
-https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import cv2
-import torch
-from torch.nn import functional as F
-
-from pytorchocr.postprocess.pse_postprocess.pse import pse
-
-
-class PSEPostProcess(object):
-    """
-    The post process for PSE.
-    """
-
-    def __init__(self,
-                 thresh=0.5,
-                 box_thresh=0.85,
-                 min_area=16,
-                 box_type='box',
-                 scale=4,
-                 **kwargs):
-        assert box_type in ['box', 'poly'], 'Only box and poly is supported'
-        self.thresh = thresh
-        self.box_thresh = box_thresh
-        self.min_area = min_area
-        self.box_type = box_type
-        self.scale = scale
-
-    def __call__(self, outs_dict, shape_list):
-        pred = outs_dict['maps']
-        if not isinstance(pred, torch.Tensor):
-            pred = torch.as_tensor(pred)
-        pred = F.interpolate(
-            pred, scale_factor=4 // self.scale, mode='bilinear')
-
-        score = F.sigmoid(pred[:, 0, :, :])
-
-        kernels = (pred > self.thresh).type(torch.float32)
-        text_mask = kernels[:, 0, :, :]
-        kernels[:, 0:, :, :] = kernels[:, 0:, :, :] * text_mask
-
-        score = score.numpy()
-        kernels = kernels.numpy().astype(np.uint8)
-
-        boxes_batch = []
-        for batch_index in range(pred.shape[0]):
-            boxes, scores = self.boxes_from_bitmap(score[batch_index],
-                                                   kernels[batch_index],
-                                                   shape_list[batch_index])
-
-            boxes_batch.append({'points': boxes, 'scores': scores})
-        return boxes_batch
-
-    def boxes_from_bitmap(self, score, kernels, shape):
-        label = pse(kernels, self.min_area)
-        return self.generate_box(score, label, shape)
-
-    def generate_box(self, score, label, shape):
-        src_h, src_w, ratio_h, ratio_w = shape
-        label_num = np.max(label) + 1
-
-        boxes = []
-        scores = []
-        for i in range(1, label_num):
-            ind = label == i
-            points = np.array(np.where(ind)).transpose((1, 0))[:, ::-1]
-
-            if points.shape[0] < self.min_area:
-                label[ind] = 0
-                continue
-
-            score_i = np.mean(score[ind])
-            if score_i < self.box_thresh:
-                label[ind] = 0
-                continue
-
-            if self.box_type == 'box':
-                rect = cv2.minAreaRect(points)
-                bbox = cv2.boxPoints(rect)
-            elif self.box_type == 'poly':
-                box_height = np.max(points[:, 1]) + 10
-                box_width = np.max(points[:, 0]) + 10
-
-                mask = np.zeros((box_height, box_width), np.uint8)
-                mask[points[:, 1], points[:, 0]] = 255
-
-                contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL,
-                                               cv2.CHAIN_APPROX_SIMPLE)
-                bbox = np.squeeze(contours[0], 1)
-            else:
-                raise NotImplementedError
-
-            bbox[:, 0] = np.clip(np.round(bbox[:, 0] / ratio_w), 0, src_w)
-            bbox[:, 1] = np.clip(np.round(bbox[:, 1] / ratio_h), 0, src_h)
-            boxes.append(bbox)
-            scores.append(score_i)
-        return boxes, scores
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py
@@ -12,9 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
-import string
-#import paddle
-# from paddle.nn import functional as F
 import torch



--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/sast_postprocess.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/sast_postprocess.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import sys
-
-__dir__ = os.path.dirname(__file__)
-sys.path.append(__dir__)
-sys.path.append(os.path.join(__dir__, '..'))
-
-import numpy as np
-from .locality_aware_nms import nms_locality
-# import paddle
-import torch
-import cv2
-import time
-
-
-class SASTPostProcess(object):
-    """
-    The post process for SAST.
-    """
-
-    def __init__(self,
-                 score_thresh=0.5,
-                 nms_thresh=0.2,
-                 sample_pts_num=2,
-                 shrink_ratio_of_width=0.3,
-                 expand_scale=1.0,
-                 tcl_map_thresh=0.5,
-                 **kwargs):
-
-        self.score_thresh = score_thresh
-        self.nms_thresh = nms_thresh
-        self.sample_pts_num = sample_pts_num
-        self.shrink_ratio_of_width = shrink_ratio_of_width
-        self.expand_scale = expand_scale
-        self.tcl_map_thresh = tcl_map_thresh
-
-        # c++ la-nms is faster, but only support python 3.5
-        self.is_python35 = False
-        if sys.version_info.major == 3 and sys.version_info.minor == 5:
-            self.is_python35 = True
-
-    def point_pair2poly(self, point_pair_list):
-        """
-        Transfer vertical point_pairs into poly point in clockwise.
-        """
-        # constract poly
-        point_num = len(point_pair_list) * 2
-        point_list = [0] * point_num
-        for idx, point_pair in enumerate(point_pair_list):
-            point_list[idx] = point_pair[0]
-            point_list[point_num - 1 - idx] = point_pair[1]
-        return np.array(point_list).reshape(-1, 2)
-
-    def shrink_quad_along_width(self, quad, begin_width_ratio=0., end_width_ratio=1.):
-        """
-        Generate shrink_quad_along_width.
-        """
-        ratio_pair = np.array([[begin_width_ratio], [end_width_ratio]], dtype=np.float32)
-        p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair
-        p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair
-        return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]])
-
-    def expand_poly_along_width(self, poly, shrink_ratio_of_width=0.3):
-        """
-        expand poly along width.
-        """
-        point_num = poly.shape[0]
-        left_quad = np.array([poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32)
-        left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \
-                     (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6)
-        left_quad_expand = self.shrink_quad_along_width(left_quad, left_ratio, 1.0)
-        right_quad = np.array([poly[point_num // 2 - 2], poly[point_num // 2 - 1],
-                               poly[point_num // 2], poly[point_num // 2 + 1]], dtype=np.float32)
-        right_ratio = 1.0 + \
-                      shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \
-                      (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6)
-        right_quad_expand = self.shrink_quad_along_width(right_quad, 0.0, right_ratio)
-        poly[0] = left_quad_expand[0]
-        poly[-1] = left_quad_expand[-1]
-        poly[point_num // 2 - 1] = right_quad_expand[1]
-        poly[point_num // 2] = right_quad_expand[2]
-        return poly
-
-    def restore_quad(self, tcl_map, tcl_map_thresh, tvo_map):
-        """Restore quad."""
-        xy_text = np.argwhere(tcl_map[:, :, 0] > tcl_map_thresh)
-        xy_text = xy_text[:, ::-1]  # (n, 2)
-
-        # Sort the text boxes via the y axis
-        xy_text = xy_text[np.argsort(xy_text[:, 1])]
-
-        scores = tcl_map[xy_text[:, 1], xy_text[:, 0], 0]
-        scores = scores[:, np.newaxis]
-
-        # Restore
-        point_num = int(tvo_map.shape[-1] / 2)
-        assert point_num == 4
-        tvo_map = tvo_map[xy_text[:, 1], xy_text[:, 0], :]
-        xy_text_tile = np.tile(xy_text, (1, point_num))  # (n, point_num * 2)
-        quads = xy_text_tile - tvo_map
-
-        return scores, quads, xy_text
-
-    def quad_area(self, quad):
-        """
-        compute area of a quad.
-        """
-        edge = [
-            (quad[1][0] - quad[0][0]) * (quad[1][1] + quad[0][1]),
-            (quad[2][0] - quad[1][0]) * (quad[2][1] + quad[1][1]),
-            (quad[3][0] - quad[2][0]) * (quad[3][1] + quad[2][1]),
-            (quad[0][0] - quad[3][0]) * (quad[0][1] + quad[3][1])
-        ]
-        return np.sum(edge) / 2.
-
-    def nms(self, dets):
-        if self.is_python35:
-            import lanms
-            dets = lanms.merge_quadrangle_n9(dets, self.nms_thresh)
-        else:
-            dets = nms_locality(dets, self.nms_thresh)
-        return dets
-
-    def cluster_by_quads_tco(self, tcl_map, tcl_map_thresh, quads, tco_map):
-        """
-        Cluster pixels in tcl_map based on quads.
-        """
-        instance_count = quads.shape[0] + 1  # contain background
-        instance_label_map = np.zeros(tcl_map.shape[:2], dtype=np.int32)
-        if instance_count == 1:
-            return instance_count, instance_label_map
-
-        # predict text center
-        xy_text = np.argwhere(tcl_map[:, :, 0] > tcl_map_thresh)
-        n = xy_text.shape[0]
-        xy_text = xy_text[:, ::-1]  # (n, 2)
-        tco = tco_map[xy_text[:, 1], xy_text[:, 0], :]  # (n, 2)
-        pred_tc = xy_text - tco
-
-        # get gt text center
-        m = quads.shape[0]
-        gt_tc = np.mean(quads, axis=1)  # (m, 2)
-
-        pred_tc_tile = np.tile(pred_tc[:, np.newaxis, :], (1, m, 1))  # (n, m, 2)
-        gt_tc_tile = np.tile(gt_tc[np.newaxis, :, :], (n, 1, 1))  # (n, m, 2)
-        dist_mat = np.linalg.norm(pred_tc_tile - gt_tc_tile, axis=2)  # (n, m)
-        xy_text_assign = np.argmin(dist_mat, axis=1) + 1  # (n,)
-
-        instance_label_map[xy_text[:, 1], xy_text[:, 0]] = xy_text_assign
-        return instance_count, instance_label_map
-
-    def estimate_sample_pts_num(self, quad, xy_text):
-        """
-        Estimate sample points number.
-        """
-        eh = (np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[1] - quad[2])) / 2.0
-        ew = (np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[2] - quad[3])) / 2.0
-
-        dense_sample_pts_num = max(2, int(ew))
-        dense_xy_center_line = xy_text[np.linspace(0, xy_text.shape[0] - 1, dense_sample_pts_num,
-                                                   endpoint=True, dtype=np.float32).astype(np.int32)]
-
-        dense_xy_center_line_diff = dense_xy_center_line[1:] - dense_xy_center_line[:-1]
-        estimate_arc_len = np.sum(np.linalg.norm(dense_xy_center_line_diff, axis=1))
-
-        sample_pts_num = max(2, int(estimate_arc_len / eh))
-        return sample_pts_num
-
-    def detect_sast(self, tcl_map, tvo_map, tbo_map, tco_map, ratio_w, ratio_h, src_w, src_h,
-                    shrink_ratio_of_width=0.3, tcl_map_thresh=0.5, offset_expand=1.0, out_strid=4.0):
-        """
-        first resize the tcl_map, tvo_map and tbo_map to the input_size, then restore the polys
-        """
-        # restore quad
-        scores, quads, xy_text = self.restore_quad(tcl_map, tcl_map_thresh, tvo_map)
-        dets = np.hstack((quads, scores)).astype(np.float32, copy=False)
-        dets = self.nms(dets)
-        if dets.shape[0] == 0:
-            return []
-        quads = dets[:, :-1].reshape(-1, 4, 2)
-
-        # Compute quad area
-        quad_areas = []
-        for quad in quads:
-            quad_areas.append(-self.quad_area(quad))
-
-        # instance segmentation
-        # instance_count, instance_label_map = cv2.connectedComponents(tcl_map.astype(np.uint8), connectivity=8)
-        instance_count, instance_label_map = self.cluster_by_quads_tco(tcl_map, tcl_map_thresh, quads, tco_map)
-
-        # restore single poly with tcl instance.
-        poly_list = []
-        for instance_idx in range(1, instance_count):
-            xy_text = np.argwhere(instance_label_map == instance_idx)[:, ::-1]
-            quad = quads[instance_idx - 1]
-            q_area = quad_areas[instance_idx - 1]
-            if q_area < 5:
-                continue
-
-            #
-            len1 = float(np.linalg.norm(quad[0] - quad[1]))
-            len2 = float(np.linalg.norm(quad[1] - quad[2]))
-            min_len = min(len1, len2)
-            if min_len < 3:
-                continue
-
-            # filter small CC
-            if xy_text.shape[0] <= 0:
-                continue
-
-            # filter low confidence instance
-            xy_text_scores = tcl_map[xy_text[:, 1], xy_text[:, 0], 0]
-            if np.sum(xy_text_scores) / quad_areas[instance_idx - 1] < 0.1:
-                # if np.sum(xy_text_scores) / quad_areas[instance_idx - 1] < 0.05:
-                continue
-
-            # sort xy_text
-            left_center_pt = np.array([[(quad[0, 0] + quad[-1, 0]) / 2.0,
-                                        (quad[0, 1] + quad[-1, 1]) / 2.0]])  # (1, 2)
-            right_center_pt = np.array([[(quad[1, 0] + quad[2, 0]) / 2.0,
-                                         (quad[1, 1] + quad[2, 1]) / 2.0]])  # (1, 2)
-            proj_unit_vec = (right_center_pt - left_center_pt) / \
-                            (np.linalg.norm(right_center_pt - left_center_pt) + 1e-6)
-            proj_value = np.sum(xy_text * proj_unit_vec, axis=1)
-            xy_text = xy_text[np.argsort(proj_value)]
-
-            # Sample pts in tcl map
-            if self.sample_pts_num == 0:
-                sample_pts_num = self.estimate_sample_pts_num(quad, xy_text)
-            else:
-                sample_pts_num = self.sample_pts_num
-            xy_center_line = xy_text[np.linspace(0, xy_text.shape[0] - 1, sample_pts_num,
-                                                 endpoint=True, dtype=np.float32).astype(np.int32)]
-
-            point_pair_list = []
-            for x, y in xy_center_line:
-                # get corresponding offset
-                offset = tbo_map[y, x, :].reshape(2, 2)
-                if offset_expand != 1.0:
-                    offset_length = np.linalg.norm(offset, axis=1, keepdims=True)
-                    expand_length = np.clip(offset_length * (offset_expand - 1), a_min=0.5, a_max=3.0)
-                    offset_detal = offset / offset_length * expand_length
-                    offset = offset + offset_detal
-                    # original point
-                ori_yx = np.array([y, x], dtype=np.float32)
-                point_pair = (ori_yx + offset)[:, ::-1] * out_strid / np.array([ratio_w, ratio_h]).reshape(-1, 2)
-                point_pair_list.append(point_pair)
-
-            # ndarry: (x, 2), expand poly along width
-            detected_poly = self.point_pair2poly(point_pair_list)
-            detected_poly = self.expand_poly_along_width(detected_poly, shrink_ratio_of_width)
-            detected_poly[:, 0] = np.clip(detected_poly[:, 0], a_min=0, a_max=src_w)
-            detected_poly[:, 1] = np.clip(detected_poly[:, 1], a_min=0, a_max=src_h)
-            poly_list.append(detected_poly)
-
-        return poly_list
-
-    def __call__(self, outs_dict, shape_list):
-        score_list = outs_dict['f_score']
-        border_list = outs_dict['f_border']
-        tvo_list = outs_dict['f_tvo']
-        tco_list = outs_dict['f_tco']
-        if isinstance(score_list, torch.Tensor):
-            score_list = score_list.cpu().numpy()
-            border_list = border_list.cpu().numpy()
-            tvo_list = tvo_list.cpu().numpy()
-            tco_list = tco_list.cpu().numpy()
-
-        img_num = len(shape_list)
-        poly_lists = []
-        for ino in range(img_num):
-            p_score = score_list[ino].transpose((1, 2, 0))
-            p_border = border_list[ino].transpose((1, 2, 0))
-            p_tvo = tvo_list[ino].transpose((1, 2, 0))
-            p_tco = tco_list[ino].transpose((1, 2, 0))
-            src_h, src_w, ratio_h, ratio_w = shape_list[ino]
-
-            poly_list = self.detect_sast(p_score, p_tvo, p_border, p_tco, ratio_w, ratio_h, src_w, src_h,
-                                         shrink_ratio_of_width=self.shrink_ratio_of_width,
-                                         tcl_map_thresh=self.tcl_map_thresh, offset_expand=self.expand_scale)
-            poly_lists.append({'points': np.array(poly_list)})
-
-        return poly_lists
-
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/e2e_utils/extract_batchsize.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/e2e_utils/extract_batchsize.py
-raise ValueError('utils -> e2e_utils -> extract_batchsize')
-import paddle
-import numpy as np
-import copy
-
-
-def org_tcl_rois(batch_size, pos_lists, pos_masks, label_lists, tcl_bs):
-    """
-    """
-    pos_lists_, pos_masks_, label_lists_ = [], [], []
-    img_bs = batch_size
-    ngpu = int(batch_size / img_bs)
-    img_ids = np.array(pos_lists, dtype=np.int32)[:, 0, 0].copy()
-    pos_lists_split, pos_masks_split, label_lists_split = [], [], []
-    for i in range(ngpu):
-        pos_lists_split.append([])
-        pos_masks_split.append([])
-        label_lists_split.append([])
-
-    for i in range(img_ids.shape[0]):
-        img_id = img_ids[i]
-        gpu_id = int(img_id / img_bs)
-        img_id = img_id % img_bs
-        pos_list = pos_lists[i].copy()
-        pos_list[:, 0] = img_id
-        pos_lists_split[gpu_id].append(pos_list)
-        pos_masks_split[gpu_id].append(pos_masks[i].copy())
-        label_lists_split[gpu_id].append(copy.deepcopy(label_lists[i]))
-    # repeat or delete
-    for i in range(ngpu):
-        vp_len = len(pos_lists_split[i])
-        if vp_len <= tcl_bs:
-            for j in range(0, tcl_bs - vp_len):
-                pos_list = pos_lists_split[i][j].copy()
-                pos_lists_split[i].append(pos_list)
-                pos_mask = pos_masks_split[i][j].copy()
-                pos_masks_split[i].append(pos_mask)
-                label_list = copy.deepcopy(label_lists_split[i][j])
-                label_lists_split[i].append(label_list)
-        else:
-            for j in range(0, vp_len - tcl_bs):
-                c_len = len(pos_lists_split[i])
-                pop_id = np.random.permutation(c_len)[0]
-                pos_lists_split[i].pop(pop_id)
-                pos_masks_split[i].pop(pop_id)
-                label_lists_split[i].pop(pop_id)
-    # merge
-    for i in range(ngpu):
-        pos_lists_.extend(pos_lists_split[i])
-        pos_masks_.extend(pos_masks_split[i])
-        label_lists_.extend(label_lists_split[i])
-    return pos_lists_, pos_masks_, label_lists_
-
-
-def pre_process(label_list, pos_list, pos_mask, max_text_length, max_text_nums,
-                pad_num, tcl_bs):
-    label_list = label_list.numpy()
-    batch, _, _, _ = label_list.shape
-    pos_list = pos_list.numpy()
-    pos_mask = pos_mask.numpy()
-    pos_list_t = []
-    pos_mask_t = []
-    label_list_t = []
-    for i in range(batch):
-        for j in range(max_text_nums):
-            if pos_mask[i, j].any():
-                pos_list_t.append(pos_list[i][j])
-                pos_mask_t.append(pos_mask[i][j])
-                label_list_t.append(label_list[i][j])
-    pos_list, pos_mask, label_list = org_tcl_rois(batch, pos_list_t, pos_mask_t,
-                                                  label_list_t, tcl_bs)
-    label = []
-    tt = [l.tolist() for l in label_list]
-    for i in range(tcl_bs):
-        k = 0
-        for j in range(max_text_length):
-            if tt[i][j][0] != pad_num:
-                k += 1
-            else:
-                break
-        label.append(k)
-    label = paddle.to_tensor(label)
-    label = paddle.cast(label, dtype='int64')
-    pos_list = paddle.to_tensor(pos_list)
-    pos_mask = paddle.to_tensor(pos_mask)
-    label_list = paddle.squeeze(paddle.to_tensor(label_list), axis=2)
-    label_list = paddle.cast(label_list, dtype='int32')
-    return pos_list, pos_mask, label_list, label
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/e2e_utils/extract_textpoint_fast.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/e2e_utils/extract_textpoint_fast.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Contains various CTC decoders."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import cv2
-import math
-
-import numpy as np
-from itertools import groupby
-from skimage.morphology._skeletonize import thin
-
-
-def get_dict(character_dict_path):
-    character_str = ""
-    with open(character_dict_path, "rb") as fin:
-        lines = fin.readlines()
-        for line in lines:
-            line = line.decode('utf-8').strip("\n").strip("\r\n")
-            character_str += line
-        dict_character = list(character_str)
-    return dict_character
-
-
-def softmax(logits):
-    """
-    logits: N x d
-    """
-    max_value = np.max(logits, axis=1, keepdims=True)
-    exp = np.exp(logits - max_value)
-    exp_sum = np.sum(exp, axis=1, keepdims=True)
-    dist = exp / exp_sum
-    return dist
-
-
-def get_keep_pos_idxs(labels, remove_blank=None):
-    """
-    Remove duplicate and get pos idxs of keep items.
-    The value of keep_blank should be [None, 95].
-    """
-    duplicate_len_list = []
-    keep_pos_idx_list = []
-    keep_char_idx_list = []
-    for k, v_ in groupby(labels):
-        current_len = len(list(v_))
-        if k != remove_blank:
-            current_idx = int(sum(duplicate_len_list) + current_len // 2)
-            keep_pos_idx_list.append(current_idx)
-            keep_char_idx_list.append(k)
-        duplicate_len_list.append(current_len)
-    return keep_char_idx_list, keep_pos_idx_list
-
-
-def remove_blank(labels, blank=0):
-    new_labels = [x for x in labels if x != blank]
-    return new_labels
-
-
-def insert_blank(labels, blank=0):
-    new_labels = [blank]
-    for l in labels:
-        new_labels += [l, blank]
-    return new_labels
-
-
-def ctc_greedy_decoder(probs_seq, blank=95, keep_blank_in_idxs=True):
-    """
-    CTC greedy (best path) decoder.
-    """
-    raw_str = np.argmax(np.array(probs_seq), axis=1)
-    remove_blank_in_pos = None if keep_blank_in_idxs else blank
-    dedup_str, keep_idx_list = get_keep_pos_idxs(
-        raw_str, remove_blank=remove_blank_in_pos)
-    dst_str = remove_blank(dedup_str, blank=blank)
-    return dst_str, keep_idx_list
-
-
-def instance_ctc_greedy_decoder(gather_info, logits_map, pts_num=4):
-    _, _, C = logits_map.shape
-    ys, xs = zip(*gather_info)
-    logits_seq = logits_map[list(ys), list(xs)]
-    probs_seq = logits_seq
-    labels = np.argmax(probs_seq, axis=1)
-    dst_str = [k for k, v_ in groupby(labels) if k != C - 1]
-    detal = len(gather_info) // (pts_num - 1)
-    keep_idx_list = [0] + [detal * (i + 1) for i in range(pts_num - 2)] + [-1]
-    keep_gather_list = [gather_info[idx] for idx in keep_idx_list]
-    return dst_str, keep_gather_list
-
-
-def ctc_decoder_for_image(gather_info_list,
-                          logits_map,
-                          Lexicon_Table,
-                          pts_num=6):
-    """
-    CTC decoder using multiple processes.
-    """
-    decoder_str = []
-    decoder_xys = []
-    for gather_info in gather_info_list:
-        if len(gather_info) < pts_num:
-            continue
-        dst_str, xys_list = instance_ctc_greedy_decoder(
-            gather_info, logits_map, pts_num=pts_num)
-        dst_str_readable = ''.join([Lexicon_Table[idx] for idx in dst_str])
-        if len(dst_str_readable) < 2:
-            continue
-        decoder_str.append(dst_str_readable)
-        decoder_xys.append(xys_list)
-    return decoder_str, decoder_xys
-
-
-def sort_with_direction(pos_list, f_direction):
-    """
-    f_direction: h x w x 2
-    pos_list: [[y, x], [y, x], [y, x] ...]
-    """
-
-    def sort_part_with_direction(pos_list, point_direction):
-        pos_list = np.array(pos_list).reshape(-1, 2)
-        point_direction = np.array(point_direction).reshape(-1, 2)
-        average_direction = np.mean(point_direction, axis=0, keepdims=True)
-        pos_proj_leng = np.sum(pos_list * average_direction, axis=1)
-        sorted_list = pos_list[np.argsort(pos_proj_leng)].tolist()
-        sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist()
-        return sorted_list, sorted_direction
-
-    pos_list = np.array(pos_list).reshape(-1, 2)
-    point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]]  # x, y
-    point_direction = point_direction[:, ::-1]  # x, y -> y, x
-    sorted_point, sorted_direction = sort_part_with_direction(pos_list,
-                                                              point_direction)
-
-    point_num = len(sorted_point)
-    if point_num >= 16:
-        middle_num = point_num // 2
-        first_part_point = sorted_point[:middle_num]
-        first_point_direction = sorted_direction[:middle_num]
-        sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction(
-            first_part_point, first_point_direction)
-
-        last_part_point = sorted_point[middle_num:]
-        last_point_direction = sorted_direction[middle_num:]
-        sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction(
-            last_part_point, last_point_direction)
-        sorted_point = sorted_fist_part_point + sorted_last_part_point
-        sorted_direction = sorted_fist_part_direction + sorted_last_part_direction
-
-    return sorted_point, np.array(sorted_direction)
-
-
-def add_id(pos_list, image_id=0):
-    """
-    Add id for gather feature, for inference.
-    """
-    new_list = []
-    for item in pos_list:
-        new_list.append((image_id, item[0], item[1]))
-    return new_list
-
-
-def sort_and_expand_with_direction(pos_list, f_direction):
-    """
-    f_direction: h x w x 2
-    pos_list: [[y, x], [y, x], [y, x] ...]
-    """
-    h, w, _ = f_direction.shape
-    sorted_list, point_direction = sort_with_direction(pos_list, f_direction)
-
-    point_num = len(sorted_list)
-    sub_direction_len = max(point_num // 3, 2)
-    left_direction = point_direction[:sub_direction_len, :]
-    right_dirction = point_direction[point_num - sub_direction_len:, :]
-
-    left_average_direction = -np.mean(left_direction, axis=0, keepdims=True)
-    left_average_len = np.linalg.norm(left_average_direction)
-    left_start = np.array(sorted_list[0])
-    left_step = left_average_direction / (left_average_len + 1e-6)
-
-    right_average_direction = np.mean(right_dirction, axis=0, keepdims=True)
-    right_average_len = np.linalg.norm(right_average_direction)
-    right_step = right_average_direction / (right_average_len + 1e-6)
-    right_start = np.array(sorted_list[-1])
-
-    append_num = max(
-        int((left_average_len + right_average_len) / 2.0 * 0.15), 1)
-    left_list = []
-    right_list = []
-    for i in range(append_num):
-        ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype(
-            'int32').tolist()
-        if ly < h and lx < w and (ly, lx) not in left_list:
-            left_list.append((ly, lx))
-        ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype(
-            'int32').tolist()
-        if ry < h and rx < w and (ry, rx) not in right_list:
-            right_list.append((ry, rx))
-
-    all_list = left_list[::-1] + sorted_list + right_list
-    return all_list
-
-
-def sort_and_expand_with_direction_v2(pos_list, f_direction, binary_tcl_map):
-    """
-    f_direction: h x w x 2
-    pos_list: [[y, x], [y, x], [y, x] ...]
-    binary_tcl_map: h x w
-    """
-    h, w, _ = f_direction.shape
-    sorted_list, point_direction = sort_with_direction(pos_list, f_direction)
-
-    point_num = len(sorted_list)
-    sub_direction_len = max(point_num // 3, 2)
-    left_direction = point_direction[:sub_direction_len, :]
-    right_dirction = point_direction[point_num - sub_direction_len:, :]
-
-    left_average_direction = -np.mean(left_direction, axis=0, keepdims=True)
-    left_average_len = np.linalg.norm(left_average_direction)
-    left_start = np.array(sorted_list[0])
-    left_step = left_average_direction / (left_average_len + 1e-6)
-
-    right_average_direction = np.mean(right_dirction, axis=0, keepdims=True)
-    right_average_len = np.linalg.norm(right_average_direction)
-    right_step = right_average_direction / (right_average_len + 1e-6)
-    right_start = np.array(sorted_list[-1])
-
-    append_num = max(
-        int((left_average_len + right_average_len) / 2.0 * 0.15), 1)
-    max_append_num = 2 * append_num
-
-    left_list = []
-    right_list = []
-    for i in range(max_append_num):
-        ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype(
-            'int32').tolist()
-        if ly < h and lx < w and (ly, lx) not in left_list:
-            if binary_tcl_map[ly, lx] > 0.5:
-                left_list.append((ly, lx))
-            else:
-                break
-
-    for i in range(max_append_num):
-        ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype(
-            'int32').tolist()
-        if ry < h and rx < w and (ry, rx) not in right_list:
-            if binary_tcl_map[ry, rx] > 0.5:
-                right_list.append((ry, rx))
-            else:
-                break
-
-    all_list = left_list[::-1] + sorted_list + right_list
-    return all_list
-
-
-def point_pair2poly(point_pair_list):
-    """
-    Transfer vertical point_pairs into poly point in clockwise.
-    """
-    point_num = len(point_pair_list) * 2
-    point_list = [0] * point_num
-    for idx, point_pair in enumerate(point_pair_list):
-        point_list[idx] = point_pair[0]
-        point_list[point_num - 1 - idx] = point_pair[1]
-    return np.array(point_list).reshape(-1, 2)
-
-
-def shrink_quad_along_width(quad, begin_width_ratio=0., end_width_ratio=1.):
-    ratio_pair = np.array(
-        [[begin_width_ratio], [end_width_ratio]], dtype=np.float32)
-    p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair
-    p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair
-    return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]])
-
-
-def expand_poly_along_width(poly, shrink_ratio_of_width=0.3):
-    """
-    expand poly along width.
-    """
-    point_num = poly.shape[0]
-    left_quad = np.array(
-        [poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32)
-    left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \
-                 (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6)
-    left_quad_expand = shrink_quad_along_width(left_quad, left_ratio, 1.0)
-    right_quad = np.array(
-        [
-            poly[point_num // 2 - 2], poly[point_num // 2 - 1],
-            poly[point_num // 2], poly[point_num // 2 + 1]
-        ],
-        dtype=np.float32)
-    right_ratio = 1.0 + shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \
-                  (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6)
-    right_quad_expand = shrink_quad_along_width(right_quad, 0.0, right_ratio)
-    poly[0] = left_quad_expand[0]
-    poly[-1] = left_quad_expand[-1]
-    poly[point_num // 2 - 1] = right_quad_expand[1]
-    poly[point_num // 2] = right_quad_expand[2]
-    return poly
-
-
-def restore_poly(instance_yxs_list, seq_strs, p_border, ratio_w, ratio_h, src_w,
-                 src_h, valid_set):
-    poly_list = []
-    keep_str_list = []
-    for yx_center_line, keep_str in zip(instance_yxs_list, seq_strs):
-        if len(keep_str) < 2:
-            print('--> too short, {}'.format(keep_str))
-            continue
-
-        offset_expand = 1.0
-        if valid_set == 'totaltext':
-            offset_expand = 1.2
-
-        point_pair_list = []
-        for y, x in yx_center_line:
-            offset = p_border[:, y, x].reshape(2, 2) * offset_expand
-            ori_yx = np.array([y, x], dtype=np.float32)
-            point_pair = (ori_yx + offset)[:, ::-1] * 4.0 / np.array(
-                [ratio_w, ratio_h]).reshape(-1, 2)
-            point_pair_list.append(point_pair)
-
-        detected_poly = point_pair2poly(point_pair_list)
-        detected_poly = expand_poly_along_width(
-            detected_poly, shrink_ratio_of_width=0.2)
-        detected_poly[:, 0] = np.clip(detected_poly[:, 0], a_min=0, a_max=src_w)
-        detected_poly[:, 1] = np.clip(detected_poly[:, 1], a_min=0, a_max=src_h)
-
-        keep_str_list.append(keep_str)
-        if valid_set == 'partvgg':
-            middle_point = len(detected_poly) // 2
-            detected_poly = detected_poly[
-                [0, middle_point - 1, middle_point, -1], :]
-            poly_list.append(detected_poly)
-        elif valid_set == 'totaltext':
-            poly_list.append(detected_poly)
-        else:
-            print('--> Not supported format.')
-            exit(-1)
-    return poly_list, keep_str_list
-
-
-def generate_pivot_list_fast(p_score,
-                             p_char_maps,
-                             f_direction,
-                             Lexicon_Table,
-                             score_thresh=0.5):
-    """
-    return center point and end point of TCL instance; filter with the char maps;
-    """
-    p_score = p_score[0]
-    f_direction = f_direction.transpose(1, 2, 0)
-    p_tcl_map = (p_score > score_thresh) * 1.0
-    skeleton_map = thin(p_tcl_map.astype(np.uint8))
-    instance_count, instance_label_map = cv2.connectedComponents(
-        skeleton_map.astype(np.uint8), connectivity=8)
-
-    # get TCL Instance
-    all_pos_yxs = []
-    if instance_count > 0:
-        for instance_id in range(1, instance_count):
-            pos_list = []
-            ys, xs = np.where(instance_label_map == instance_id)
-            pos_list = list(zip(ys, xs))
-
-            if len(pos_list) < 3:
-                continue
-
-            pos_list_sorted = sort_and_expand_with_direction_v2(
-                pos_list, f_direction, p_tcl_map)
-            all_pos_yxs.append(pos_list_sorted)
-
-    p_char_maps = p_char_maps.transpose([1, 2, 0])
-    decoded_str, keep_yxs_list = ctc_decoder_for_image(
-        all_pos_yxs, logits_map=p_char_maps, Lexicon_Table=Lexicon_Table)
-    return keep_yxs_list, decoded_str
-
-
-def extract_main_direction(pos_list, f_direction):
-    """
-    f_direction: h x w x 2
-    pos_list: [[y, x], [y, x], [y, x] ...]
-    """
-    pos_list = np.array(pos_list)
-    point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]]
-    point_direction = point_direction[:, ::-1]  # x, y -> y, x
-    average_direction = np.mean(point_direction, axis=0, keepdims=True)
-    average_direction = average_direction / (
-        np.linalg.norm(average_direction) + 1e-6)
-    return average_direction
-
-
-def sort_by_direction_with_image_id_deprecated(pos_list, f_direction):
-    """
-    f_direction: h x w x 2
-    pos_list: [[id, y, x], [id, y, x], [id, y, x] ...]
-    """
-    pos_list_full = np.array(pos_list).reshape(-1, 3)
-    pos_list = pos_list_full[:, 1:]
-    point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]]  # x, y
-    point_direction = point_direction[:, ::-1]  # x, y -> y, x
-    average_direction = np.mean(point_direction, axis=0, keepdims=True)
-    pos_proj_leng = np.sum(pos_list * average_direction, axis=1)
-    sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist()
-    return sorted_list
-
-
-def sort_by_direction_with_image_id(pos_list, f_direction):
-    """
-    f_direction: h x w x 2
-    pos_list: [[y, x], [y, x], [y, x] ...]
-    """
-
-    def sort_part_with_direction(pos_list_full, point_direction):
-        pos_list_full = np.array(pos_list_full).reshape(-1, 3)
-        pos_list = pos_list_full[:, 1:]
-        point_direction = np.array(point_direction).reshape(-1, 2)
-        average_direction = np.mean(point_direction, axis=0, keepdims=True)
-        pos_proj_leng = np.sum(pos_list * average_direction, axis=1)
-        sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist()
-        sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist()
-        return sorted_list, sorted_direction
-
-    pos_list = np.array(pos_list).reshape(-1, 3)
-    point_direction = f_direction[pos_list[:, 1], pos_list[:, 2]]  # x, y
-    point_direction = point_direction[:, ::-1]  # x, y -> y, x
-    sorted_point, sorted_direction = sort_part_with_direction(pos_list,
-                                                              point_direction)
-
-    point_num = len(sorted_point)
-    if point_num >= 16:
-        middle_num = point_num // 2
-        first_part_point = sorted_point[:middle_num]
-        first_point_direction = sorted_direction[:middle_num]
-        sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction(
-            first_part_point, first_point_direction)
-
-        last_part_point = sorted_point[middle_num:]
-        last_point_direction = sorted_direction[middle_num:]
-        sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction(
-            last_part_point, last_point_direction)
-        sorted_point = sorted_fist_part_point + sorted_last_part_point
-        sorted_direction = sorted_fist_part_direction + sorted_last_part_direction
-
-    return sorted_point