feat(model): add OCR model base structure and utilities

- Add base model structure for OCR in PyTorch
- Implement data augmentation and transformation modules
- Create utilities for dictionary handling and state dict conversion
- Include post-processing modules for OCR
- Add weight initialization and loading functions

feat(model): add OCR model base structure and utilities
- Add base model structure for OCR in PyTorch
- Implement data augmentation and transformation modules
- Create utilities for dictionary handling and state dict conversion
- Include post-processing modules for OCR
- Add weight initialization and loading functions
a7a899f6 · myhloli · 72e66c2d · a7a899f6 · a7a899f6 · a7a899f6
Commit a7a899f6 authored Mar 27, 2025 by myhloli
20 changed files
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/sast_fpn.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/sast_fpn.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os, sys
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+# import paddle
+# from paddle import nn
+# import paddle.nn.functional as F
+# from paddle import ParamAttr
+class ConvBNLayer(nn.Module):
+    """Bias-free 2-D convolution, then batch norm, then an optional activation.
+
+    Args:
+        in_channels: input channels of the convolution.
+        out_channels: output channels (also the BatchNorm2d width).
+        kernel_size: integer kernel size; the padding is derived from it as
+            ``(kernel_size - 1) // 2``.
+        stride: convolution stride.
+        groups: number of convolution groups.
+        if_act: stored on the instance; ``forward`` does not consult it.
+        act: activation name handed to ``Activation``; ``None`` disables it.
+        name: accepted for signature compatibility; not used by this class.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, stride,
+                 groups=1, if_act=True, act=None, name=None):
+        super().__init__()
+        self.if_act = if_act
+        self.act = act
+        same_pad = (kernel_size - 1) // 2
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size,
+                              stride=stride, padding=same_pad,
+                              groups=groups, bias=False)
+        self.bn = nn.BatchNorm2d(out_channels)
+        # The activation module only exists when one was requested.
+        if act is not None:
+            self._act = Activation(act)
+    def forward(self, x):
+        """Return ``x`` after conv, batch norm and, when configured, the activation."""
+        out = self.bn(self.conv(x))
+        return out if self.act is None else self._act(out)
+class DeConvBNLayer(nn.Module):
+    """Bias-free transposed 2-D convolution, then batch norm, then an optional activation.
+
+    Args:
+        in_channels: input channels of the transposed convolution.
+        out_channels: output channels (also the BatchNorm2d width).
+        kernel_size: integer kernel size; the padding is derived from it as
+            ``(kernel_size - 1) // 2``.
+        stride: stride of the transposed convolution.
+        groups: number of convolution groups.
+        if_act: stored on the instance; ``forward`` does not consult it.
+        act: activation name handed to ``Activation``; ``None`` disables it.
+        name: accepted for signature compatibility; not used by this class.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, stride,
+                 groups=1, if_act=True, act=None, name=None):
+        super().__init__()
+        self.if_act = if_act
+        self.act = act
+        pad = (kernel_size - 1) // 2
+        self.deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size,
+                                         stride=stride, padding=pad,
+                                         groups=groups, bias=False)
+        self.bn = nn.BatchNorm2d(out_channels)
+        # The activation module only exists when one was requested.
+        if act is not None:
+            self._act = Activation(act)
+    def forward(self, x):
+        """Return ``x`` after deconv, batch norm and, when configured, the activation."""
+        out = self.bn(self.deconv(x))
+        return out if self.act is None else self._act(out)
+class FPN_Up_Fusion(nn.Module):
+    """Top-down fusion path: project five feature maps and merge them while upsampling.
+
+    Args:
+        in_channels: per-stage channel counts. The list is reversed and its
+            first five entries (i.e. the last five stages, deepest first)
+            size the lateral 1x1 projections.
+    """
+    def __init__(self, in_channels):
+        super().__init__()
+        rev_in = in_channels[::-1]
+        widths = [256, 256, 192, 192, 128]
+        # Lateral 1x1 projections h0_conv .. h4_conv, deepest stage first.
+        for idx in range(5):
+            setattr(self, 'h%d_conv' % idx,
+                    ConvBNLayer(rev_in[idx], widths[idx], 1, 1,
+                                act=None, name='fpn_up_h%d' % idx))
+        self.g0_conv = DeConvBNLayer(widths[0], widths[1], 4, 2, act=None, name='fpn_up_g0')
+        # g1_conv .. g3_conv: 3x3 conv + relu followed by a stride-2 transposed conv.
+        for idx in range(1, 4):
+            setattr(self, 'g%d_conv' % idx, nn.Sequential(
+                ConvBNLayer(widths[idx], widths[idx], 3, 1,
+                            act='relu', name='fpn_up_g%d_1' % idx),
+                DeConvBNLayer(widths[idx], widths[idx + 1], 4, 2,
+                              act=None, name='fpn_up_g%d_2' % idx)))
+        # Final fusion keeps the resolution: 3x3 conv + relu, then a 1x1 conv.
+        self.g4_conv = nn.Sequential(
+            ConvBNLayer(widths[4], widths[4], 3, 1, act='relu', name='fpn_up_fusion_1'),
+            ConvBNLayer(widths[4], widths[4], 1, 1, act=None, name='fpn_up_fusion_2'))
+    def _add_relu(self, x1, x2):
+        """Return ``relu(x1 + x2)``."""
+        return F.relu(torch.add(x1, x2))
+    def forward(self, x):
+        """Fuse ``x[2:]`` (taken in reverse order) and return the output of ``g4_conv``."""
+        feats = x[2:][::-1]
+        laterals = [getattr(self, 'h%d_conv' % i)(feats[i]) for i in range(5)]
+        merged = self.g0_conv(laterals[0])
+        # Each step adds the next lateral map, applies relu and runs g<i>_conv.
+        for i in range(1, 5):
+            merged = getattr(self, 'g%d_conv' % i)(self._add_relu(merged, laterals[i]))
+        return merged
+class FPN_Down_Fusion(nn.Module):
+    """Bottom-up fusion path over the first three feature maps.
+
+    Args:
+        in_channels: per-stage channel counts; entries 0..2 size the three
+            lateral 3x3 projections.
+    """
+    def __init__(self, in_channels):
+        super().__init__()
+        widths = [32, 64, 128]
+        # Lateral 3x3 projections h0_conv .. h2_conv.
+        for idx in range(3):
+            setattr(self, 'h%d_conv' % idx,
+                    ConvBNLayer(in_channels[idx], widths[idx], 3, 1,
+                                act=None, name='fpn_down_h%d' % idx))
+        # Stride-2 convolutions bring a map down to the next level's resolution.
+        self.g0_conv = ConvBNLayer(widths[0], widths[1], 3, 2, act=None, name='fpn_down_g0')
+        self.g1_conv = nn.Sequential(
+            ConvBNLayer(widths[1], widths[1], 3, 1, act='relu', name='fpn_down_g1_1'),
+            ConvBNLayer(widths[1], widths[2], 3, 2, act=None, name='fpn_down_g1_2'))
+        self.g2_conv = nn.Sequential(
+            ConvBNLayer(widths[2], widths[2], 3, 1, act='relu', name='fpn_down_fusion_1'),
+            ConvBNLayer(widths[2], widths[2], 1, 1, act=None, name='fpn_down_fusion_2'))
+    def forward(self, x):
+        """Fuse ``x[:3]`` and return the output of ``g2_conv``."""
+        shallow = x[:3]
+        laterals = [getattr(self, 'h%d_conv' % i)(shallow[i]) for i in range(3)]
+        fused = F.relu(torch.add(self.g0_conv(laterals[0]), laterals[1]))
+        fused = F.relu(torch.add(self.g1_conv(fused), laterals[2]))
+        return self.g2_conv(fused)
+class Cross_Attention(nn.Module):
+    """Row-wise and column-wise self-attention over a feature map, merged by a 1x1 conv.
+
+    Args:
+        in_channels: channel count of the input feature map; every internal
+            projection keeps this width.
+    """
+    def __init__(self, in_channels):
+        super(Cross_Attention, self).__init__()
+        self.theta_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_theta')
+        self.phi_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_phi')
+        self.g_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_g')
+        self.fh_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_weight')
+        self.fh_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_sc')
+        self.fv_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_weight')
+        self.fv_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_sc')
+        self.f_attn_conv = ConvBNLayer(in_channels * 2, in_channels, 1, 1, act='relu', name='f_attn')
+    def _cal_fweight(self, f, shape):
+        """Attend along the last spatial axis of channel-first maps.
+
+        Args:
+            f: ``[f_theta, f_phi, f_g]``, each laid out as (N, C, A, B).
+            shape: ``[N, A, B]``, the batch size and the two spatial sizes.
+
+        Returns:
+            Tensor of shape (N, A, B, C): for every one of the N*A lines, the
+            softmax-weighted sum of ``f_g`` over the B positions.
+        """
+        f_theta, f_phi, f_g = f
+        batch, lines, length = shape
+        # Read the channel width from the tensor instead of assuming 128, so
+        # the module works for any ``in_channels``.
+        channels = f_theta.shape[1]
+        def _to_sequences(t):
+            # (N, C, A, B) -> (N*A, B, C): one sequence per line.
+            return torch.reshape(t.permute(0, 2, 3, 1), [batch * lines, length, channels])
+        f_theta = _to_sequences(f_theta)
+        f_phi = _to_sequences(f_phi)
+        f_g = _to_sequences(f_g)
+        # Scaled dot-product attention within each line.
+        f_attn = torch.matmul(f_theta, f_phi.permute(0, 2, 1))
+        f_attn = f_attn / (channels ** 0.5)
+        f_attn = F.softmax(f_attn, dim=-1)
+        f_weight = torch.matmul(f_attn, f_g)
+        return torch.reshape(f_weight, [batch, lines, length, channels])
+    def forward(self, f_common):
+        """Return the attention-enhanced map, same layout as ``f_common`` (N, C, H, W)."""
+        f_shape = f_common.size()
+        f_theta = self.theta_conv(f_common)
+        f_phi = self.phi_conv(f_common)
+        f_g = self.g_conv(f_common)
+        # Horizontal branch: attention along W within each row.
+        fh_weight = self._cal_fweight([f_theta, f_phi, f_g],
+                                      [f_shape[0], f_shape[2], f_shape[3]])
+        fh_weight = fh_weight.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
+        fh_weight = self.fh_weight_conv(fh_weight)
+        fh_sc = self.fh_sc_conv(f_common)  # shortcut
+        f_h = F.relu(fh_weight + fh_sc)
+        # Vertical branch: swap H and W so the same helper attends along H.
+        fv_theta = f_theta.permute(0, 1, 3, 2)
+        fv_phi = f_phi.permute(0, 1, 3, 2)
+        fv_g = f_g.permute(0, 1, 3, 2)
+        fv_weight = self._cal_fweight([fv_theta, fv_phi, fv_g],
+                                      [f_shape[0], f_shape[3], f_shape[2]])
+        fv_weight = fv_weight.permute(0, 3, 2, 1)  # (N, W, H, C) -> (N, C, H, W)
+        fv_weight = self.fv_weight_conv(fv_weight)
+        fv_sc = self.fv_sc_conv(f_common)  # shortcut
+        f_v = F.relu(fv_weight + fv_sc)
+        # Merge both branches along the channel axis and project back to C.
+        f_attn = torch.cat([f_h, f_v], dim=1)
+        f_attn = self.f_attn_conv(f_attn)
+        return f_attn
+class SASTFPN(nn.Module):
+    """Neck that sums the down- and up-fusion paths, optionally refined by cross attention.
+
+    Args:
+        in_channels: per-stage channel counts, passed to both fusion paths.
+        with_cab: when true, ``forward`` runs the fused map through
+            ``Cross_Attention``.
+        **kwargs: accepted and ignored.
+    """
+    def __init__(self, in_channels, with_cab=False, **kwargs):
+        super().__init__()
+        self.in_channels = in_channels
+        self.with_cab = with_cab
+        self.FPN_Down_Fusion = FPN_Down_Fusion(self.in_channels)
+        self.FPN_Up_Fusion = FPN_Up_Fusion(self.in_channels)
+        self.out_channels = 128
+        # Built unconditionally, even though forward only uses it when with_cab is set.
+        self.cross_attention = Cross_Attention(self.out_channels)
+    def forward(self, x):
+        """Return ``relu(down(x) + up(x))``, passed through cross attention if enabled."""
+        fused = F.relu(torch.add(self.FPN_Down_Fusion(x), self.FPN_Up_Fusion(x)))
+        return self.cross_attention(fused) if self.with_cab else fused
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/table_fpn.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/table_fpn.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+class TableFPN(nn.Module):
+    """FPN-style neck that adds a scaled multi-level fusion onto the deepest feature map.
+
+    Args:
+        in_channels: channel counts of the four inputs ``c2 .. c5``.
+        out_channels: accepted but not used; the width is fixed at 512.
+        **kwargs: accepted and ignored.
+    """
+    def __init__(self, in_channels, out_channels, **kwargs):
+        super().__init__()
+        self.out_channels = 512
+        width = self.out_channels
+        # 1x1 lateral projections in2_conv .. in5_conv.
+        for pos, level in enumerate((2, 3, 4, 5)):
+            setattr(self, 'in%d_conv' % level,
+                    nn.Conv2d(in_channels[pos], width, kernel_size=1, bias=False))
+        # p5_conv .. p2_conv are registered here but never called in forward.
+        for level in (5, 4, 3, 2):
+            setattr(self, 'p%d_conv' % level,
+                    nn.Conv2d(width, width // 4, kernel_size=3, padding=1, bias=False))
+        self.fuse_conv = nn.Conv2d(width * 4, 512, kernel_size=3, padding=1, bias=False)
+    def forward(self, x):
+        """Return ``[c5 + 0.005 * fuse_conv(...)]`` for ``x = (c2, c3, c4, c5)``."""
+        c2, c3, c4, c5 = x
+        in5 = self.in5_conv(c5)
+        in4 = self.in4_conv(c4)
+        in3 = self.in3_conv(c3)
+        in2 = self.in2_conv(c2)
+        def _resize_like(src, ref):
+            # Nearest-neighbour resize of src to ref's spatial size.
+            return F.interpolate(src, size=ref.shape[2:4], mode="nearest")
+        # Top-down pathway: each level adds the resized level above it.
+        out4 = in4 + _resize_like(in5, in4)
+        out3 = in3 + _resize_like(out4, in3)
+        out2 = in2 + _resize_like(out3, in2)
+        # Bring every level to in5's resolution and stack along channels.
+        pyramid = [in5] + [_resize_like(level, in5) for level in (out4, out3, out2)]
+        fused = self.fuse_conv(torch.cat(pyramid, dim=1)) * 0.005
+        return [c5 + fused]
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/__init__.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+__all__ = ['build_transform']
+def build_transform(config):
+    """Instantiate the transform module described by ``config``.
+
+    Args:
+        config: dict with a ``'name'`` key selecting one of ``TPS``,
+            ``STN_ON``, ``TSRN`` or ``TBSRN``. The remaining items are passed
+            to the class as keyword arguments. NOTE: ``'name'`` is popped
+            from the dict that is passed in.
+
+    Returns:
+        The constructed transform module.
+
+    Raises:
+        KeyError: if ``config`` has no ``'name'`` entry.
+        AssertionError: if the name is not a supported transform.
+    """
+    from .tps import TPS
+    from .stn import STN_ON
+    from .tsrn import TSRN
+    from .tbsrn import TBSRN
+    # Explicit registry instead of eval(): the name comes from configuration
+    # and must never be evaluated as an expression.
+    registry = {'TPS': TPS, 'STN_ON': STN_ON, 'TSRN': TSRN, 'TBSRN': TBSRN}
+    support_dict = list(registry)
+    module_name = config.pop('name')
+    if module_name not in support_dict:
+        # Raised explicitly (not via `assert`) so the check survives `python -O`;
+        # the exception type is kept for backward compatibility.
+        raise AssertionError('transform only support {}'.format(support_dict))
+    return registry[module_name](**config)
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/stn.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/stn.py
+"""
+This code is refer from:
+https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/stn_head.py
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+import numpy as np
+from .tps_spatial_transformer import TPSSpatialTransformer
+def conv3x3_block(in_channels, out_channels, stride=1):
+    """Build a ``Conv2d(3x3, padding=1, bias) -> BatchNorm2d -> ReLU`` block.
+
+    Args:
+        in_channels: input channels of the convolution.
+        out_channels: output channels of the convolution and the batch norm.
+        stride: convolution stride.
+
+    Returns:
+        ``nn.Sequential`` holding the three layers in that order.
+    """
+    # No custom weight initialisation is applied here; the layers keep the
+    # PyTorch defaults (the previously computed std value was never used).
+    conv_layer = nn.Conv2d(
+        in_channels,
+        out_channels,
+        kernel_size=3,
+        stride=stride,
+        padding=1,
+        bias=True)
+    return nn.Sequential(conv_layer, nn.BatchNorm2d(out_channels), nn.ReLU())
+class STN(nn.Module):
+    """Localisation head that regresses ``num_ctrlpoints`` 2-D control points.
+
+    Args:
+        in_channels: channels of the input image.
+        num_ctrlpoints: number of control points to predict.
+        activation: ``'none'`` or ``'sigmoid'``; with ``'sigmoid'`` the output
+            is squashed into (0, 1).
+    """
+    def __init__(self, in_channels, num_ctrlpoints, activation='none'):
+        super(STN, self).__init__()
+        self.in_channels = in_channels
+        self.num_ctrlpoints = num_ctrlpoints
+        self.activation = activation
+        # Size comments give the feature-map size for a 32x64 input.
+        self.stn_convnet = nn.Sequential(
+            conv3x3_block(in_channels, 32),  # 32x64
+            nn.MaxPool2d(
+                kernel_size=2, stride=2),
+            conv3x3_block(32, 64),  # 16x32
+            nn.MaxPool2d(
+                kernel_size=2, stride=2),
+            conv3x3_block(64, 128),  # 8x16
+            nn.MaxPool2d(
+                kernel_size=2, stride=2),
+            conv3x3_block(128, 256),  # 4x8
+            nn.MaxPool2d(
+                kernel_size=2, stride=2),
+            conv3x3_block(256, 256),  # 2x4
+            nn.MaxPool2d(
+                kernel_size=2, stride=2),
+            conv3x3_block(256, 256))  # 1x2
+        self.stn_fc1 = nn.Sequential(
+            nn.Linear(
+                2 * 256,
+                512,
+                bias=True),
+            nn.BatchNorm1d(512),
+            nn.ReLU(inplace=True))
+        fc2_bias = self.init_stn()
+        self.stn_fc2 = nn.Linear(
+            512,
+            num_ctrlpoints * 2,
+            bias=True)
+        # Apply the computed initialisation (it used to be discarded): zero
+        # weights plus the default control points as bias, so an untrained
+        # head predicts the default point layout. Skipped when the sizes do
+        # not line up (odd num_ctrlpoints) so construction keeps succeeding.
+        if fc2_bias.numel() == self.stn_fc2.bias.numel():
+            with torch.no_grad():
+                self.stn_fc2.weight.zero_()
+                self.stn_fc2.bias.copy_(fc2_bias)
+    def init_stn(self):
+        """Return the flattened default control points used as the ``stn_fc2`` bias.
+
+        Half of the points lie on a top line (y = margin) and half on a bottom
+        line (y = 1 - margin), with x evenly spaced in [margin, 1 - margin].
+        For the ``'sigmoid'`` activation the points are mapped through the
+        inverse sigmoid.
+        """
+        margin = 0.01
+        sampling_num_per_side = int(self.num_ctrlpoints / 2)
+        ctrl_pts_x = np.linspace(margin, 1. - margin, sampling_num_per_side)
+        ctrl_pts_y_top = np.ones(sampling_num_per_side) * margin
+        ctrl_pts_y_bottom = np.ones(sampling_num_per_side) * (1 - margin)
+        ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
+        ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
+        ctrl_points = np.concatenate(
+            [ctrl_pts_top, ctrl_pts_bottom], axis=0).astype(np.float32)
+        if self.activation == 'sigmoid':
+            # Inverse sigmoid, so that sigmoid(bias) gives the points back.
+            ctrl_points = -np.log(1. / ctrl_points - 1.)
+        ctrl_points = torch.Tensor(ctrl_points)
+        fc2_bias = torch.reshape(
+            ctrl_points, shape=[ctrl_points.shape[0] * ctrl_points.shape[1]])
+        return fc2_bias
+    def forward(self, x):
+        """Return ``(img_feat, ctrl_points)``.
+
+        ``img_feat`` is the ``stn_fc1`` output; ``ctrl_points`` has shape
+        (batch, num_ctrlpoints, 2).
+        """
+        x = self.stn_convnet(x)
+        batch_size = x.shape[0]
+        x = torch.reshape(x, shape=(batch_size, -1))
+        img_feat = self.stn_fc1(x)
+        x = self.stn_fc2(0.1 * img_feat)
+        if self.activation == 'sigmoid':
+            # torch.sigmoid replaces the deprecated F.sigmoid.
+            x = torch.sigmoid(x)
+        x = torch.reshape(x, shape=[-1, self.num_ctrlpoints, 2])
+        return img_feat, x
+class STN_ON(nn.Module):
+    """Rectify an image with a TPS transform driven by an STN control-point head.
+
+    Args:
+        in_channels: channels of the input image; also exposed as ``out_channels``.
+        tps_inputsize: size the image is resized to before the STN head.
+        tps_outputsize: output image size of the TPS transformer.
+        num_control_points: number of control points shared by head and TPS.
+        tps_margins: margins passed to the TPS transformer.
+        stn_activation: activation mode of the STN head.
+    """
+    def __init__(self, in_channels, tps_inputsize, tps_outputsize,
+                 num_control_points, tps_margins, stn_activation):
+        super().__init__()
+        self.tps = TPSSpatialTransformer(
+            output_image_size=tuple(tps_outputsize),
+            num_control_points=num_control_points,
+            margins=tuple(tps_margins))
+        self.stn_head = STN(
+            in_channels=in_channels,
+            num_ctrlpoints=num_control_points,
+            activation=stn_activation)
+        self.tps_inputsize = tps_inputsize
+        self.out_channels = in_channels
+    def forward(self, image):
+        """Predict control points on a resized copy and warp the original ``image``."""
+        resized = F.interpolate(
+            image, self.tps_inputsize, mode="bilinear", align_corners=True)
+        _, ctrl_points = self.stn_head(resized)
+        rectified, _ = self.tps(image, ctrl_points)
+        return rectified
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tbsrn.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tbsrn.py
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is refer from:
+https://github.com/FudanVI/FudanOCR/blob/main/scene-text-telescope/model/tbsrn.py
+"""
+import math
+import warnings
+import numpy as np
+import torch
+from torch import nn
+import string
+warnings.filterwarnings("ignore")
+from .tps_spatial_transformer import TPSSpatialTransformer
+from .stn import STN as STNHead
+from .tsrn import GruBlock, mish, UpsampleBLock
+from pytorchocr.modeling.heads.sr_rensnet_transformer import Transformer, LayerNorm, \
+    PositionwiseFeedForward, MultiHeadedAttention
+def positionalencoding2d(d_model, height, width):
+    """Build a fixed 2-D sin/cos positional encoding.
+
+    The first half of the channels encodes the column index and the second
+    half the row index, with sin and cos interleaved.
+
+    :param d_model: number of channels; must be divisible by 4
+    :param height: number of rows
+    :param width: number of columns
+    :return: tensor of shape (d_model, height, width)
+    """
+    if d_model % 4 != 0:
+        # The check is divisibility by four, although the message says "odd".
+        raise ValueError("Cannot use sin/cos positional encoding with "
+                         "odd dimension (got dim={:d})".format(d_model))
+    encoding = torch.zeros([d_model, height, width])
+    # Each spatial axis gets half of the channels.
+    half = int(d_model / 2)
+    freqs = torch.exp(torch.arange(0., half, 2) * -(math.log(10000.0) / half))
+    cols = torch.arange(0., width, dtype=torch.float32).unsqueeze(1)
+    rows = torch.arange(0., height, dtype=torch.float32).unsqueeze(1)
+    col_angles = cols * freqs  # (width, half / 2)
+    row_angles = rows * freqs  # (height, half / 2)
+    # Column codes are repeated down every row.
+    encoding[0:half:2] = torch.sin(col_angles).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
+    encoding[1:half:2] = torch.cos(col_angles).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
+    # Row codes are repeated across every column.
+    encoding[half::2] = torch.sin(row_angles).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)
+    encoding[half + 1::2] = torch.cos(row_angles).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)
+    return encoding
+class FeatureEnhancer(nn.Module):
+    """Self-attention block over a flattened feature map with a fixed 2-D positional code.
+
+    The positional code is built for a 16x64 grid with 64 channels, so the
+    input must carry 1024 positions; the 128-wide layers presumably expect a
+    64-channel input (64 feature + 64 positional channels).
+    """
+    def __init__(self):
+        super(FeatureEnhancer, self).__init__()
+        self.multihead = MultiHeadedAttention(h=4, d_model=128, dropout=0.1)
+        self.mul_layernorm1 = LayerNorm(features=128)
+        self.pff = PositionwiseFeedForward(128, 128)
+        self.mul_layernorm3 = LayerNorm(features=128)
+        self.linear = nn.Linear(128, 64)
+    def forward(self, conv_feature):
+        '''
+        conv_feature: (batch, channel, H*W) flattened feature map
+        returns: (batch, 64, H*W) enhanced feature map
+        '''
+        batch = conv_feature.shape[0]
+        position2d = positionalencoding2d(64, 16, 64).float().unsqueeze(0).reshape([1, 64, 1024])
+        # Follow the input's device instead of "CUDA whenever available":
+        # a CPU input on a CUDA-capable machine used to fail in torch.cat.
+        position2d = position2d.to(conv_feature.device)
+        position2d = position2d.repeat(batch, 1, 1)
+        conv_feature = torch.cat([conv_feature, position2d], 1)  # batch, 128 (64+64), 1024
+        result = conv_feature.permute(0, 2, 1).contiguous()
+        # Residual attention block followed by a residual feed-forward block.
+        origin_result = result
+        result = self.mul_layernorm1(origin_result + self.multihead(result, result, result, mask=None)[0])
+        origin_result = result
+        result = self.mul_layernorm3(origin_result + self.pff(result))
+        result = self.linear(result)
+        return result.permute(0, 2, 1).contiguous()
+def str_filt(str_, voc_type):
+    """Drop every character outside the ``voc_type`` vocabulary and lowercase the rest.
+
+    Args:
+        str_: text to filter.
+        voc_type: one of ``'digit'``, ``'lower'``, ``'upper'`` or ``'all'``.
+            With ``'lower'`` the text is lowercased before filtering.
+
+    Returns:
+        The filtered text, always lowercased.
+    """
+    alpha_dict = {
+        'digit': string.digits,
+        'lower': string.digits + string.ascii_lowercase,
+        'upper': string.digits + string.ascii_letters,
+        'all': string.digits + string.ascii_letters + string.punctuation
+    }
+    text = str_.lower() if voc_type == 'lower' else str_
+    # The vocabulary is looked up per character, so an unknown voc_type only
+    # fails when there is at least one character to check.
+    kept = ''.join(ch for ch in text if ch in alpha_dict[voc_type])
+    return kept.lower()
+class TBSRN(nn.Module):
+    """Text super-resolution network with an optional TPS rectification stage.
+
+    Args:
+        in_channels: only exposed as ``out_channels``; the first convolution
+            takes 3 input planes, or 4 when ``mask`` is set.
+        scale_factor: upsampling factor; must be a power of two.
+        width: reference image width used to size the TPS stage.
+        height: reference image height used to size the TPS stage.
+        STN: whether to build the rectification stage (it is only applied
+            while the module is in training mode).
+        srb_nums: number of ``RecurrentResidualBlock`` stages.
+        mask: use a 4-plane input instead of 3.
+        hidden_units: half of the internal feature width.
+        infer_mode: when true, ``forward`` receives the low-resolution image
+            directly instead of a sequence of inputs.
+    """
+    def __init__(self,
+                 in_channels=3,
+                 scale_factor=2,
+                 width=128,
+                 height=32,
+                 STN=True,
+                 srb_nums=5,
+                 mask=False,
+                 hidden_units=32,
+                 infer_mode=False):
+        super(TBSRN, self).__init__()
+        in_planes = 4 if mask else 3
+        assert math.log(scale_factor, 2) % 1 == 0
+        upsample_block_num = int(math.log(scale_factor, 2))
+        feat_width = 2 * hidden_units
+        self.block1 = nn.Sequential(
+            nn.Conv2d(in_planes, feat_width, kernel_size=9, padding=4),
+            nn.PReLU()
+        )
+        self.srb_nums = srb_nums
+        # block2 .. block<srb_nums + 1>: the residual stages.
+        for i in range(srb_nums):
+            setattr(self, 'block%d' % (i + 2), RecurrentResidualBlock(feat_width))
+        # block<srb_nums + 2>: conv + batch norm closing the residual trunk.
+        setattr(self, 'block%d' % (srb_nums + 2),
+                nn.Sequential(
+                    nn.Conv2d(feat_width, feat_width, kernel_size=3, padding=1),
+                    nn.BatchNorm2d(feat_width)
+                ))
+        # block<srb_nums + 3>: upsampling followed by the output convolution.
+        block_ = [UpsampleBLock(feat_width, 2) for _ in range(upsample_block_num)]
+        block_.append(nn.Conv2d(feat_width, in_planes, kernel_size=9, padding=4))
+        setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(*block_))
+        self.tps_inputsize = [height // scale_factor, width // scale_factor]
+        tps_outputsize = [height // scale_factor, width // scale_factor]
+        num_control_points = 20
+        tps_margins = [0.05, 0.05]
+        self.stn = STN
+        self.out_channels = in_channels
+        if self.stn:
+            self.tps = TPSSpatialTransformer(
+                output_image_size=tuple(tps_outputsize),
+                num_control_points=num_control_points,
+                margins=tuple(tps_margins))
+            self.stn_head = STNHead(
+                in_channels=in_planes,
+                num_ctrlpoints=num_control_points,
+                activation='none')
+        self.infer_mode = infer_mode
+        self.english_alphabet = '-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+        self.english_dict = {ch: index for index, ch in enumerate(self.english_alphabet)}
+        self.transformer = Transformer(alphabet='-0123456789abcdefghijklmnopqrstuvwxyz')
+        # Freeze the recogniser. In PyTorch this is `requires_grad`; the
+        # Paddle-style `trainable` attribute has no effect on a Parameter.
+        for param in self.transformer.parameters():
+            param.requires_grad = False
+    def label_encoder(self, label):
+        """Encode text labels for the recogniser.
+
+        Args:
+            label: list of strings whose characters are all in ``english_dict``.
+
+        Returns:
+            ``(length_tensor, input_tensor, text_gt)``, all int64:
+            the label lengths; a (batch, max_length) matrix holding each label
+            shifted right by one position, with 0 in column 0 and as padding
+            (the last character of a label is not included); and the
+            concatenated character indices of all labels.
+        """
+        batch = len(label)
+        length = [len(text) for text in label]
+        length_tensor = torch.Tensor(length).type(torch.int64)
+        max_length = max(length)
+        input_tensor = np.zeros((batch, max_length))
+        for row, text in enumerate(label):
+            for col in range(length[row] - 1):
+                input_tensor[row][col + 1] = self.english_dict[text[col]]
+        text_gt = [self.english_dict[ch] for text in label for ch in text]
+        text_gt = torch.Tensor(text_gt).type(torch.int64)
+        input_tensor = torch.Tensor(input_tensor).type(torch.int64)
+        return length_tensor, input_tensor, text_gt
+    def forward(self, x):
+        """Super-resolve the low-resolution image.
+
+        Args:
+            x: the low-resolution image when ``infer_mode`` is set; otherwise
+                a sequence ``(lr_img, hr_img, ...)``, with the text labels at
+                index 2 while training.
+
+        Returns:
+            dict with ``"lr_img"`` and ``"sr_img"`` (plus ``"hr_img"`` outside
+            infer mode); in training mode also the recogniser outputs
+            ``"hr_pred"``, ``"sr_pred"``, ``"text_gt"`` and both attention maps.
+        """
+        output = {}
+        if self.infer_mode:
+            output["lr_img"] = x
+            y = x
+        else:
+            output["lr_img"] = x[0]
+            output["hr_img"] = x[1]
+            y = x[0]
+        # Rectification is only applied while training.
+        if self.stn and self.training:
+            _, ctrl_points_x = self.stn_head(y)
+            y, _ = self.tps(y, ctrl_points_x)
+        # Run block1, then block2 .. block<srb_nums + 2> in sequence.
+        feats = [self.block1(y)]
+        for idx in range(2, self.srb_nums + 3):
+            feats.append(getattr(self, 'block%d' % idx)(feats[-1]))
+        # Global skip connection around the residual trunk, then upsample.
+        upsampled = getattr(self, 'block%d' % (self.srb_nums + 3))(feats[0] + feats[-1])
+        sr_img = torch.tanh(upsampled)
+        output["sr_img"] = sr_img
+        if self.training:
+            hr_img = x[1]
+            # Run the recogniser on both the HR and the SR image.
+            label = [str_filt(i, 'lower') + '-' for i in x[2]]
+            length_tensor, input_tensor, text_gt = self.label_encoder(label)
+            hr_pred, word_attention_map_gt, hr_correct_list = self.transformer(
+                hr_img, length_tensor, input_tensor)
+            sr_pred, word_attention_map_pred, sr_correct_list = self.transformer(
+                sr_img, length_tensor, input_tensor)
+            output["hr_img"] = hr_img
+            output["hr_pred"] = hr_pred
+            output["text_gt"] = text_gt
+            output["word_attention_map_gt"] = word_attention_map_gt
+            output["sr_pred"] = sr_pred
+            output["word_attention_map_pred"] = word_attention_map_pred
+        return output
+class RecurrentResidualBlock(nn.Module):
+    """Residual block: conv-bn-mish-conv-bn, then a ``FeatureEnhancer`` over the flattened map.
+
+    Args:
+        channels: channel count of the input and of both convolutions.
+    """
+    def __init__(self, channels):
+        super().__init__()
+        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
+        self.bn1 = nn.BatchNorm2d(channels)
+        # gru1 and gru2 are registered here but not called in forward.
+        self.gru1 = GruBlock(channels, channels)
+        self.prelu = mish()
+        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
+        self.bn2 = nn.BatchNorm2d(channels)
+        self.gru2 = GruBlock(channels, channels)
+        self.feature_enhancer = FeatureEnhancer()
+        # Xavier-initialise every parameter with more than one dimension.
+        for weight in self.parameters():
+            if weight.dim() > 1:
+                nn.init.xavier_uniform_(weight)
+    def forward(self, x):
+        """Return ``x`` plus the enhanced residual; the output has ``x``'s shape."""
+        out = self.bn1(self.conv1(x))
+        out = self.bn2(self.conv2(self.prelu(out)))
+        n, c, h, w = out.shape
+        # The enhancer works on (batch, channel, H*W); restore the map afterwards.
+        out = self.feature_enhancer(out.reshape([n, c, -1]))
+        return x + out.reshape([n, c, h, w])
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tps.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tps.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import math
+import os, sys
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+# import paddle
+# from paddle import nn, ParamAttr
+# from paddle.nn import functional as F
+import numpy as np
+class ConvBNLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 groups=1,
+                 act=None,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+        self.conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=(kernel_size - 1) // 2,
+            groups=groups,
+            bias=False,
+        )
+        bn_name = "bn_" + name
+        self.bn = nn.BatchNorm2d(
+            out_channels, )
+        self.act = act
+        if act is not None:
+            self._act = Activation(act)
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        if self.act is not None:
+            x = self._act(x)
+        return x
+class LocalizationNetwork(nn.Module):
+    def __init__(self, in_channels, num_fiducial, loc_lr, model_name):
+        super(LocalizationNetwork, self).__init__()
+        self.F = num_fiducial
+        F = num_fiducial
+        if model_name == "large":
+            num_filters_list = [64, 128, 256, 512]
+            fc_dim = 256
+        else:
+            num_filters_list = [16, 32, 64, 128]
+            fc_dim = 64
+        # self.block_list = []
+        self.block_list = nn.Sequential()
+        for fno in range(0, len(num_filters_list)):
+            num_filters = num_filters_list[fno]
+            name = "loc_conv%d" % fno
+            # conv = self.add_sublayer(
+            #     name,
+            #     ConvBNLayer(
+            #         in_channels=in_channels,
+            #         out_channels=num_filters,
+            #         kernel_size=3,
+            #         act='relu',
+            #         name=name))
+            conv = ConvBNLayer(
+                    in_channels=in_channels,
+                    out_channels=num_filters,
+                    kernel_size=3,
+                    act='relu',
+                    name=name)
+            # self.block_list.append(conv)
+            self.block_list.add_module(name, conv)
+            if fno == len(num_filters_list) - 1:
+                pool = nn.AdaptiveAvgPool2d(1)
+            else:
+                # pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
+                pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
+            in_channels = num_filters
+            # self.block_list.append(pool)
+            self.block_list.add_module('{}_pool'.format(name), pool)
+        name = "loc_fc1"
+        stdv = 1.0 / math.sqrt(num_filters_list[-1] * 1.0)
+        self.fc1 = nn.Linear(
+            in_channels,
+            fc_dim,
+            bias=True,
+        )
+        # Init fc2 in LocalizationNetwork
+        initial_bias = self.get_initial_fiducials()
+        initial_bias = initial_bias.reshape(-1)
+        name = "loc_fc2"
+        self.fc2 = nn.Linear(
+            fc_dim,
+            F * 2,
+            bias=True
+        )
+        self.out_channels = F * 2
+    def forward(self, x):
+        """
+           Estimating parameters of geometric transformation
+           Args:
+               image: input
+           Return:
+               batch_C_prime: the matrix of the geometric transformation
+        """
+        B = x.shape[0]
+        i = 0
+        for block in self.block_list:
+            x = block(x)
+        x = x.squeeze(dim=2).squeeze(dim=2)
+        x = self.fc1(x)
+        x = F.relu(x)
+        x = self.fc2(x)
+        x = x.reshape(shape=[-1, self.F, 2])
+        return x
+    def get_initial_fiducials(self):
+        """ see RARE paper Fig. 6 (a) """
+        F = self.F
+        ctrl_pts_x = np.linspace(-1.0, 1.0, int(F / 2))
+        ctrl_pts_y_top = np.linspace(0.0, -1.0, num=int(F / 2))
+        ctrl_pts_y_bottom = np.linspace(1.0, 0.0, num=int(F / 2))
+        ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
+        ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
+        initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
+        return initial_bias
+class GridGenerator(nn.Module):
+    def __init__(self, in_channels, num_fiducial):
+        super(GridGenerator, self).__init__()
+        self.eps = 1e-6
+        self.F = num_fiducial
+        name = "ex_fc"
+        self.fc = nn.Linear(
+            in_channels,
+            6,
+            bias=True
+        )
+    def forward(self, batch_C_prime, I_r_size):
+        """
+        Generate the grid for the grid_sampler.
+        Args:
+            batch_C_prime: the matrix of the geometric transformation
+            I_r_size: the shape of the input image
+        Return:
+            batch_P_prime: the grid for the grid_sampler
+        """
+        C = self.build_C_paddle()
+        P = self.build_P_paddle(I_r_size)
+        inv_delta_C_tensor = self.build_inv_delta_C_paddle(C).type(torch.float32)
+        P_hat_tensor = self.build_P_hat_paddle(
+            C, torch.as_tensor(P)).type(torch.float32)
+        inv_delta_C_tensor.stop_gradient = True
+        P_hat_tensor.stop_gradient = True
+        batch_C_ex_part_tensor = self.get_expand_tensor(batch_C_prime)
+        batch_C_ex_part_tensor.stop_gradient = True
+        batch_C_prime_with_zeros = torch.cat(
+            [batch_C_prime, batch_C_ex_part_tensor], dim=1)
+        inv_delta_C_tensor = inv_delta_C_tensor.to(batch_C_prime_with_zeros.device)
+        batch_T = torch.matmul(inv_delta_C_tensor, batch_C_prime_with_zeros)
+        P_hat_tensor = P_hat_tensor.to(batch_T.device)
+        batch_P_prime = torch.matmul(P_hat_tensor, batch_T)
+        return batch_P_prime
+    def build_C_paddle(self):
+        """ Return coordinates of fiducial points in I_r; C """
+        F = self.F
+        ctrl_pts_x = torch.linspace(-1.0, 1.0, int(F / 2), dtype=torch.float64)
+        ctrl_pts_y_top = -1 * torch.ones([int(F / 2)], dtype=torch.float64)
+        ctrl_pts_y_bottom = torch.ones([int(F / 2)], dtype=torch.float64)
+        ctrl_pts_top = torch.stack([ctrl_pts_x, ctrl_pts_y_top], dim=1)
+        ctrl_pts_bottom = torch.stack([ctrl_pts_x, ctrl_pts_y_bottom], dim=1)
+        C = torch.cat([ctrl_pts_top, ctrl_pts_bottom], dim=0)
+        return C  # F x 2
+    def build_P_paddle(self, I_r_size):
+        I_r_height, I_r_width = I_r_size
+        I_r_grid_x = (torch.arange(
+            -I_r_width, I_r_width, 2, dtype=torch.float64) + 1.0
+                      ) / torch.as_tensor(np.array([I_r_width]).astype(np.float64))
+        I_r_grid_y = (torch.arange(
+            -I_r_height, I_r_height, 2, dtype=torch.float64) + 1.0
+                      ) / torch.as_tensor(np.array([I_r_height]).astype(np.float64))
+        # P: self.I_r_width x self.I_r_height x 2
+        P = torch.stack(torch.meshgrid([I_r_grid_x, I_r_grid_y]), dim=2)
+        # P = paddle.transpose(P, perm=[1, 0, 2])
+        P = P.permute(1, 0, 2)
+        # n (= self.I_r_width x self.I_r_height) x 2
+        return P.reshape([-1, 2])
+    def build_inv_delta_C_paddle(self, C):
+        """ Return inv_delta_C which is needed to calculate T """
+        F = self.F
+        hat_C = torch.zeros((F, F), dtype=torch.float64)  # F x F
+        for i in range(0, F):
+            for j in range(i, F):
+                if i == j:
+                    hat_C[i, j] = 1
+                else:
+                    r = torch.norm(C[i] - C[j])
+                    hat_C[i, j] = r
+                    hat_C[j, i] = r
+        hat_C = (hat_C**2) * torch.log(hat_C)
+        delta_C = torch.cat(  # F+3 x F+3
+            [
+                torch.cat(
+                    [torch.ones(
+                        (F, 1), dtype=torch.float64), C, hat_C], dim=1),  # F x F+3
+                torch.cat(
+                    [
+                        torch.zeros(
+                            (2, 3), dtype=torch.float64), C.permute(1,0)
+                    ],
+                    dim=1),  # 2 x F+3
+                torch.cat(
+                    [
+                        torch.zeros(
+                            (1, 3), dtype=torch.float64), torch.ones(
+                                (1, F), dtype=torch.float64)
+                    ],
+                    dim=1)  # 1 x F+3
+            ],
+            dim=0)
+        inv_delta_C = torch.inverse(delta_C)
+        return inv_delta_C  # F+3 x F+3
+    def build_P_hat_paddle(self, C, P):
+        F = self.F
+        eps = self.eps
+        n = P.shape[0]  # n (= self.I_r_width x self.I_r_height)
+        # P_tile: n x 2 -> n x 1 x 2 -> n x F x 2
+        # P_tile = paddle.tile(paddle.unsqueeze(P, axis=1), (1, F, 1))
+        P_tile = torch.unsqueeze(P, dim=1).repeat(1, F, 1)
+        C_tile = torch.unsqueeze(C, dim=0)  # 1 x F x 2
+        P_diff = P_tile - C_tile  # n x F x 2
+        # rbf_norm: n x F
+        rbf_norm = torch.norm(P_diff, p=2, dim=2, keepdim=False)
+        # rbf: n x F
+        # rbf = torch.mul(
+        #     torch.square(rbf_norm), torch.log(rbf_norm + eps))
+        rbf = torch.mul(
+            rbf_norm**2, torch.log(rbf_norm + eps))
+        P_hat = torch.cat(
+            [torch.ones(
+                (n, 1), dtype=torch.float64), P, rbf], dim=1)
+        return P_hat  # n x F+3
+    def get_expand_tensor(self, batch_C_prime):
+        B, H, C = batch_C_prime.shape
+        batch_C_prime = batch_C_prime.reshape([B, H * C])
+        batch_C_ex_part_tensor = self.fc(batch_C_prime)
+        batch_C_ex_part_tensor = batch_C_ex_part_tensor.reshape([-1, 3, 2])
+        return batch_C_ex_part_tensor
+class TPS(nn.Module):
+    def __init__(self, in_channels, num_fiducial, loc_lr, model_name):
+        super(TPS, self).__init__()
+        self.loc_net = LocalizationNetwork(in_channels, num_fiducial, loc_lr,
+                                           model_name)
+        self.grid_generator = GridGenerator(self.loc_net.out_channels,
+                                            num_fiducial)
+        self.out_channels = in_channels
+    def forward(self, image):
+        image.stop_gradient = False
+        batch_C_prime = self.loc_net(image)
+        batch_P_prime = self.grid_generator(batch_C_prime, image.shape[2:])
+        batch_P_prime = batch_P_prime.reshape(
+            [-1, image.shape[2], image.shape[3], 2])
+        if torch.__version__ < '1.3.0':
+            batch_I_r = F.grid_sample(image, grid=batch_P_prime)
+        else:
+            batch_I_r = F.grid_sample(image, grid=batch_P_prime, align_corners=True)
+        return batch_I_r
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tps_spatial_transformer.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tps_spatial_transformer.py
+"""
+This code is adapted from:
+https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/tps_spatial_transformer.py
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+import numpy as np
+import itertools
+def grid_sample(input, grid, canvas=None):
+    input.stop_gradient = False
+    output = F.grid_sample(input, grid, align_corners=True) if torch.__version__ >= '1.3.0' else F.grid_sample(input, grid)
+    if canvas is None:
+        return output
+    else:
+        # input_mask = paddle.ones(shape=input.shape)
+        input_mask = input.data.new(input.size()).fill_(1)
+        output_mask = F.grid_sample(input_mask, grid)
+        padded_output = output * output_mask + canvas * (1 - output_mask)
+        return padded_output
+# phi(x1, x2) = r^2 * log(r), where r = ||x1 - x2||_2
+def compute_partial_repr(input_points, control_points):
+    N = input_points.shape[0]
+    M = control_points.shape[0]
+    # pairwise_diff = input_points.view(N, 1, 2) - control_points.view(1, M, 2)
+    pairwise_diff = torch.reshape(
+        input_points, shape=[N, 1, 2]) - torch.reshape(
+            control_points, shape=[1, M, 2])
+    # original implementation, very slow
+    # pairwise_dist = torch.sum(pairwise_diff ** 2, dim = 2) # square of distance
+    pairwise_diff_square = pairwise_diff * pairwise_diff
+    pairwise_dist = pairwise_diff_square[:, :, 0] + pairwise_diff_square[:, :, 1]
+    repr_matrix = 0.5 * pairwise_dist * torch.log(pairwise_dist)
+    # fix numerical error for 0 * log(0), substitute all nan with 0
+    # mask = np.array(repr_matrix != repr_matrix)
+    # repr_matrix[mask] = 0
+    mask = repr_matrix != repr_matrix
+    repr_matrix.masked_fill_(mask, 0)
+    return repr_matrix
+# output_ctrl_pts are specified, according to our task.
+def build_output_control_points(num_control_points, margins):
+    margin_x, margin_y = margins
+    num_ctrl_pts_per_side = num_control_points // 2
+    ctrl_pts_x = np.linspace(margin_x, 1.0 - margin_x, num_ctrl_pts_per_side)
+    ctrl_pts_y_top = np.ones(num_ctrl_pts_per_side) * margin_y
+    ctrl_pts_y_bottom = np.ones(num_ctrl_pts_per_side) * (1.0 - margin_y)
+    ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
+    ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
+    output_ctrl_pts_arr = np.concatenate(
+        [ctrl_pts_top, ctrl_pts_bottom], axis=0)
+    output_ctrl_pts = torch.Tensor(output_ctrl_pts_arr)
+    return output_ctrl_pts
+class TPSSpatialTransformer(nn.Module):
+    def __init__(self,
+                 output_image_size=None,
+                 num_control_points=None,
+                 margins=None):
+        super(TPSSpatialTransformer, self).__init__()
+        self.output_image_size = output_image_size
+        self.num_control_points = num_control_points
+        self.margins = margins
+        self.target_height, self.target_width = output_image_size
+        target_control_points = build_output_control_points(num_control_points,
+                                                            margins)
+        N = num_control_points
+        # create padded kernel matrix
+        forward_kernel = torch.zeros(N + 3, N + 3)
+        target_control_partial_repr = compute_partial_repr(target_control_points, target_control_points)
+        forward_kernel[:N, :N].copy_(target_control_partial_repr)
+        forward_kernel[:N, -3].fill_(1)
+        forward_kernel[-3, :N].fill_(1)
+        forward_kernel[:N, -2:].copy_(target_control_points)
+        forward_kernel[-2:, :N].copy_(target_control_points.transpose(0, 1))
+        # compute inverse matrix
+        inverse_kernel = torch.inverse(forward_kernel)
+        # create target cordinate matrix
+        HW = self.target_height * self.target_width
+        target_coordinate = list(
+            itertools.product(
+                range(self.target_height), range(self.target_width)))
+        target_coordinate = torch.Tensor(target_coordinate)  # HW x 2
+        Y, X = target_coordinate.split(1, dim = 1)
+        Y = Y / (self.target_height - 1)
+        X = X / (self.target_width - 1)
+        target_coordinate = torch.cat([X, Y], dim = 1) # convert from (y, x) to (x, y)
+        target_coordinate_partial_repr = compute_partial_repr(
+            target_coordinate, target_control_points)
+        target_coordinate_repr = torch.cat(
+            [
+                target_coordinate_partial_repr,
+                torch.ones(HW, 1),
+                target_coordinate
+            ],
+            dim=1)
+        # register precomputed matrices
+        self.inverse_kernel = inverse_kernel
+        self.padding_matrix = torch.zeros(3, 2)
+        self.target_coordinate_repr = target_coordinate_repr
+        self.target_control_points = target_control_points
+    def forward(self, input, source_control_points):
+        assert source_control_points.ndimension() == 3
+        assert source_control_points.shape[1] == self.num_control_points
+        assert source_control_points.shape[2] == 2
+        batch_size = source_control_points.size(0)
+        Y = torch.cat([source_control_points, self.padding_matrix.expand(batch_size, 3, 2)], 1)
+        mapping_matrix = torch.matmul(self.inverse_kernel, Y)
+        source_coordinate = torch.matmul(self.target_coordinate_repr, mapping_matrix)
+        # grid = source_coordinate.view(-1, self.target_height, self.target_width, 2)
+        grid = torch.reshape(
+            source_coordinate,
+            shape=[-1, self.target_height, self.target_width, 2])
+        grid = torch.clamp(grid, 0, 1)  # the source_control_points may be out of [0, 1].
+        # the input to grid_sample is normalized [-1, 1], but what we get is [0, 1]
+        grid = 2.0 * grid - 1.0
+        output_maps = grid_sample(input, grid, canvas=None)
+        return output_maps, source_coordinate
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tsrn.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/transforms/tsrn.py
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is adapted from:
+https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/model/tsrn.py
+"""
+import math
+import torch
+import torch.nn.functional as F
+from torch import nn
+from collections import OrderedDict
+import sys
+import numpy as np
+import warnings
+import math, copy
+import cv2
+warnings.filterwarnings("ignore")
+from .tps_spatial_transformer import TPSSpatialTransformer
+from .stn import STN as STN_model
+from pytorchocr.modeling.heads.sr_rensnet_transformer import Transformer
+class TSRN(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 scale_factor=2,
+                 width=128,
+                 height=32,
+                 STN=False,
+                 srb_nums=5,
+                 mask=False,
+                 hidden_units=32,
+                 infer_mode=False,
+                 **kwargs):
+        super(TSRN, self).__init__()
+        in_planes = 3
+        if mask:
+            in_planes = 4
+        assert math.log(scale_factor, 2) % 1 == 0
+        upsample_block_num = int(math.log(scale_factor, 2))
+        self.block1 = nn.Sequential(
+            nn.Conv2d(
+                in_planes, 2 * hidden_units, kernel_size=9, padding=4),
+            nn.PReLU())
+        self.srb_nums = srb_nums
+        for i in range(srb_nums):
+            setattr(self, 'block%d' % (i + 2),
+                    RecurrentResidualBlock(2 * hidden_units))
+        setattr(
+            self,
+            'block%d' % (srb_nums + 2),
+            nn.Sequential(
+                nn.Conv2d(
+                    2 * hidden_units,
+                    2 * hidden_units,
+                    kernel_size=3,
+                    padding=1),
+                nn.BatchNorm2d(2 * hidden_units)))
+        block_ = [
+            UpsampleBLock(2 * hidden_units, 2)
+            for _ in range(upsample_block_num)
+        ]
+        block_.append(
+            nn.Conv2d(2 * hidden_units, in_planes, kernel_size=9, padding=4)
+        )
+        setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(*block_))
+        self.tps_inputsize = [height // scale_factor, width // scale_factor]
+        tps_outputsize = [height // scale_factor, width // scale_factor]
+        num_control_points = 20
+        tps_margins = [0.05, 0.05]
+        self.stn = STN
+        if self.stn:
+            self.tps = TPSSpatialTransformer(
+                output_image_size=tuple(tps_outputsize),
+                num_control_points=num_control_points,
+                margins=tuple(tps_margins))
+            self.stn_head = STN_model(
+                in_channels=in_planes,
+                num_ctrlpoints=num_control_points,
+                activation='none')
+        self.out_channels = in_channels
+        self.r34_transformer = Transformer()
+        for param in self.r34_transformer.parameters():
+            param.trainable = False
+        self.infer_mode = infer_mode
+    def forward(self, x):
+        output = {}
+        if self.infer_mode:
+            output["lr_img"] = x
+            y = x
+        else:
+            output["lr_img"] = x[0]
+            output["hr_img"] = x[1]
+            y = x[0]
+        if self.stn and self.training:
+            _, ctrl_points_x = self.stn_head(y)
+            y, _ = self.tps(y, ctrl_points_x)
+        block = {'1': self.block1(y)}
+        for i in range(self.srb_nums + 1):
+            block[str(i + 2)] = getattr(self,
+                                        'block%d' % (i + 2))(block[str(i + 1)])
+        block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3)) \
+            ((block['1'] + block[str(self.srb_nums + 2)]))
+        sr_img = torch.tanh(block[str(self.srb_nums + 3)])
+        output["sr_img"] = sr_img
+        if self.training:
+            hr_img = x[1]
+            length = x[2]
+            input_tensor = x[3]
+            # add transformer 
+            sr_pred, word_attention_map_pred, _ = self.r34_transformer(
+                sr_img, length, input_tensor)
+            hr_pred, word_attention_map_gt, _ = self.r34_transformer(
+                hr_img, length, input_tensor)
+            output["hr_img"] = hr_img
+            output["hr_pred"] = hr_pred
+            output["word_attention_map_gt"] = word_attention_map_gt
+            output["sr_pred"] = sr_pred
+            output["word_attention_map_pred"] = word_attention_map_pred
+        return output
+class RecurrentResidualBlock(nn.Module):
+    def __init__(self, channels):
+        super(RecurrentResidualBlock, self).__init__()
+        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
+        self.bn1 = nn.BatchNorm2d(channels)
+        self.gru1 = GruBlock(channels, channels)
+        self.prelu = mish()
+        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
+        self.bn2 = nn.BatchNorm2d(channels)
+        self.gru2 = GruBlock(channels, channels)
+    def forward(self, x):
+        residual = self.conv1(x)
+        residual = self.bn1(residual)
+        residual = self.prelu(residual)
+        residual = self.conv2(residual)
+        residual = self.bn2(residual)
+        residual = self.gru1(residual.permute(0, 1, 3, 2).contiguous()).permute(0, 1, 3, 2).contiguous()
+        return self.gru2(x + residual).contiguous()
+class UpsampleBLock(nn.Module):
+    def __init__(self, in_channels, up_scale):
+        super(UpsampleBLock, self).__init__()
+        self.conv = nn.Conv2d(
+            in_channels, in_channels * up_scale**2, kernel_size=3, padding=1)
+        self.pixel_shuffle = nn.PixelShuffle(up_scale)
+        self.prelu = mish()
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.pixel_shuffle(x)
+        x = self.prelu(x)
+        return x
+class mish(nn.Module):
+    def __init__(self, ):
+        super(mish, self).__init__()
+        self.activated = True
+    def forward(self, x):
+        if self.activated:
+            x = x * (torch.tanh(F.softplus(x)))
+        return x
+class GruBlock(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(GruBlock, self).__init__()
+        assert out_channels % 2 == 0
+        self.conv1 = nn.Conv2d(
+            in_channels, out_channels, kernel_size=1, padding=0)
+        self.gru = nn.GRU(out_channels,
+                          out_channels // 2,
+                          bidirectional=True,
+                          batch_first=True,
+                          )
+    def forward(self, x):
+        # x: b, c, w, h
+        x = self.conv1(x)
+        x = x.permute(0, 2, 3, 1).contiguous()  # b, w, h, c
+        batch_size, w, h, c = x.size()
+        x = x.view(batch_size * w, h, c)  # b*w, h, c
+        x, _ = self.gru(x)
+        x = x.view(batch_size, w, h, c)
+        x = x.permute(0, 3, 1, 2).contiguous()
+        return x
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+import copy
+__all__ = ['build_post_process']
+def build_post_process(config, global_config=None):
+    from .db_postprocess import DBPostProcess
+    from .east_postprocess import EASTPostProcess
+    from .sast_postprocess import SASTPostProcess
+    from .fce_postprocess import FCEPostProcess
+    from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, TableLabelDecode, \
+        NRTRLabelDecode, SARLabelDecode, ViTSTRLabelDecode, RFLLabelDecode
+    from .cls_postprocess import ClsPostProcess
+    from .pg_postprocess import PGPostProcess
+    from .rec_postprocess import CANLabelDecode
+    support_dict = [
+        'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode',
+        'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode', 'PGPostProcess',
+        'TableLabelDecode', 'NRTRLabelDecode', 'SARLabelDecode', 'FCEPostProcess',
+        'ViTSTRLabelDecode','CANLabelDecode', 'RFLLabelDecode'
+    ]
+    if config['name'] == 'PSEPostProcess':
+        from .pse_postprocess import PSEPostProcess
+        support_dict.append('PSEPostProcess')
+    config = copy.deepcopy(config)
+    module_name = config.pop('name')
+    if global_config is not None:
+        config.update(global_config)
+    assert module_name in support_dict, Exception(
+        'post process only support {}, but got {}'.format(support_dict, module_name))
+    module_class = eval(module_name)(**config)
+    return module_class
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py
+import torch
+class ClsPostProcess(object):
+    """ Convert between text-label and text-index """
+    def __init__(self, label_list, **kwargs):
+        super(ClsPostProcess, self).__init__()
+        self.label_list = label_list
+    def __call__(self, preds, label=None, *args, **kwargs):
+        if isinstance(preds, torch.Tensor):
+            preds = preds.cpu().numpy()
+        pred_idxs = preds.argmax(axis=1)
+        decode_out = [(self.label_list[idx], preds[i, idx])
+                      for i, idx in enumerate(pred_idxs)]
+        if label is None:
+            return decode_out
+        label = [(self.label_list[idx], 1.0) for idx in label]
+        return decode_out, label
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py
+"""
+This code is adapted from:
+https://github.com/WenmuZhou/DBNet.pytorch/blob/master/post_processing/seg_detector_representer.py
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+import cv2
+import torch
+from shapely.geometry import Polygon
+import pyclipper
+class DBPostProcess(object):
+    """
+    The post process for Differentiable Binarization (DB).
+    """
+    def __init__(self,
+                 thresh=0.3,
+                 box_thresh=0.7,
+                 max_candidates=1000,
+                 unclip_ratio=2.0,
+                 use_dilation=False,
+                 score_mode="fast",
+                 **kwargs):
+        self.thresh = thresh
+        self.box_thresh = box_thresh
+        self.max_candidates = max_candidates
+        self.unclip_ratio = unclip_ratio
+        self.min_size = 3
+        self.score_mode = score_mode
+        assert score_mode in [
+            "slow", "fast"
+        ], "Score mode must be in [slow, fast] but got: {}".format(score_mode)
+        self.dilation_kernel = None if not use_dilation else np.array(
+            [[1, 1], [1, 1]])
+    def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
+        '''
+        _bitmap: single map with shape (1, H, W),
+                whose values are binarized as {0, 1}
+        '''
+        bitmap = _bitmap
+        height, width = bitmap.shape
+        outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST,
+                                cv2.CHAIN_APPROX_SIMPLE)
+        if len(outs) == 3:
+            img, contours, _ = outs[0], outs[1], outs[2]
+        elif len(outs) == 2:
+            contours, _ = outs[0], outs[1]
+        num_contours = min(len(contours), self.max_candidates)
+        boxes = []
+        scores = []
+        for index in range(num_contours):
+            contour = contours[index]
+            points, sside = self.get_mini_boxes(contour)
+            if sside < self.min_size:
+                continue
+            points = np.array(points)
+            if self.score_mode == "fast":
+                score = self.box_score_fast(pred, points.reshape(-1, 2))
+            else:
+                score = self.box_score_slow(pred, contour)
+            if self.box_thresh > score:
+                continue
+            box = self.unclip(points).reshape(-1, 1, 2)
+            box, sside = self.get_mini_boxes(box)
+            if sside < self.min_size + 2:
+                continue
+            box = np.array(box)
+            box[:, 0] = np.clip(
+                np.round(box[:, 0] / width * dest_width), 0, dest_width)
+            box[:, 1] = np.clip(
+                np.round(box[:, 1] / height * dest_height), 0, dest_height)
+            boxes.append(box.astype(np.int16))
+            scores.append(score)
+        return np.array(boxes, dtype=np.int16), scores
+    def unclip(self, box):
+        unclip_ratio = self.unclip_ratio
+        poly = Polygon(box)
+        distance = poly.area * unclip_ratio / poly.length
+        offset = pyclipper.PyclipperOffset()
+        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
+        expanded = np.array(offset.Execute(distance))
+        return expanded
+    def get_mini_boxes(self, contour):
+        bounding_box = cv2.minAreaRect(contour)
+        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])
+        index_1, index_2, index_3, index_4 = 0, 1, 2, 3
+        if points[1][1] > points[0][1]:
+            index_1 = 0
+            index_4 = 1
+        else:
+            index_1 = 1
+            index_4 = 0
+        if points[3][1] > points[2][1]:
+            index_2 = 2
+            index_3 = 3
+        else:
+            index_2 = 3
+            index_3 = 2
+        box = [
+            points[index_1], points[index_2], points[index_3], points[index_4]
+        ]
+        return box, min(bounding_box[1])
+    def box_score_fast(self, bitmap, _box):
+        '''
+        box_score_fast: use bbox mean score as the mean score
+        '''
+        h, w = bitmap.shape[:2]
+        box = _box.copy()
+        xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int64), 0, w - 1)
+        xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int64), 0, w - 1)
+        ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int64), 0, h - 1)
+        ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int64), 0, h - 1)
+        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
+        box[:, 0] = box[:, 0] - xmin
+        box[:, 1] = box[:, 1] - ymin
+        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
+        return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
+    def box_score_slow(self, bitmap, contour):
+        '''
+        box_score_slow: use polyon mean score as the mean score
+        '''
+        h, w = bitmap.shape[:2]
+        contour = contour.copy()
+        contour = np.reshape(contour, (-1, 2))
+        xmin = np.clip(np.min(contour[:, 0]), 0, w - 1)
+        xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
+        ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
+        ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)
+        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
+        contour[:, 0] = contour[:, 0] - xmin
+        contour[:, 1] = contour[:, 1] - ymin
+        cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1)
+        return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
+    def __call__(self, outs_dict, shape_list):
+        pred = outs_dict['maps']
+        if isinstance(pred, torch.Tensor):
+            pred = pred.cpu().numpy()
+        pred = pred[:, 0, :, :]
+        segmentation = pred > self.thresh
+        boxes_batch = []
+        for batch_index in range(pred.shape[0]):
+            src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
+            if self.dilation_kernel is not None:
+                mask = cv2.dilate(
+                    np.array(segmentation[batch_index]).astype(np.uint8),
+                    self.dilation_kernel)
+            else:
+                mask = segmentation[batch_index]
+            boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask,
+                                                   src_w, src_h)
+            boxes_batch.append({'points': boxes})
+        return boxes_batch
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/east_postprocess.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/east_postprocess.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+from .locality_aware_nms import nms_locality
+import cv2
+# import paddle
+import torch
+import os
+import sys
+class EASTPostProcess(object):
+    """
+    The post process for EAST.
+    """
+    def __init__(self,
+                 score_thresh=0.8,
+                 cover_thresh=0.1,
+                 nms_thresh=0.2,
+                 **kwargs):
+        self.score_thresh = score_thresh
+        self.cover_thresh = cover_thresh
+        self.nms_thresh = nms_thresh
+        # c++ la-nms is faster, but only support python 3.5
+        self.is_python35 = False
+        if sys.version_info.major == 3 and sys.version_info.minor == 5:
+            self.is_python35 = True
+    def restore_rectangle_quad(self, origin, geometry):
+        """
+        Restore rectangle from quadrangle.
+        """
+        # quad
+        origin_concat = np.concatenate(
+            (origin, origin, origin, origin), axis=1)  # (n, 8)
+        pred_quads = origin_concat - geometry
+        pred_quads = pred_quads.reshape((-1, 4, 2))  # (n, 4, 2)
+        return pred_quads
+    def detect(self,
+               score_map,
+               geo_map,
+               score_thresh=0.8,
+               cover_thresh=0.1,
+               nms_thresh=0.2):
+        """
+        restore text boxes from score map and geo map
+        """
+        score_map = score_map[0]
+        geo_map = np.swapaxes(geo_map, 1, 0)
+        geo_map = np.swapaxes(geo_map, 1, 2)
+        # filter the score map
+        xy_text = np.argwhere(score_map > score_thresh)
+        if len(xy_text) == 0:
+            return []
+        # sort the text boxes via the y axis
+        xy_text = xy_text[np.argsort(xy_text[:, 0])]
+        # restore quad proposals
+        text_box_restored = self.restore_rectangle_quad(
+            xy_text[:, ::-1] * 4, geo_map[xy_text[:, 0], xy_text[:, 1], :])
+        boxes = np.zeros((text_box_restored.shape[0], 9), dtype=np.float32)
+        boxes[:, :8] = text_box_restored.reshape((-1, 8))
+        boxes[:, 8] = score_map[xy_text[:, 0], xy_text[:, 1]]
+        if self.is_python35:
+            import lanms
+            boxes = lanms.merge_quadrangle_n9(boxes, nms_thresh)
+        else:
+            boxes = nms_locality(boxes.astype(np.float64), nms_thresh)
+        if boxes.shape[0] == 0:
+            return []
+        # Here we filter some low score boxes by the average score map,
+        #   this is different from the orginal paper.
+        for i, box in enumerate(boxes):
+            mask = np.zeros_like(score_map, dtype=np.uint8)
+            cv2.fillPoly(mask, box[:8].reshape(
+                (-1, 4, 2)).astype(np.int32) // 4, 1)
+            boxes[i, 8] = cv2.mean(score_map, mask)[0]
+        boxes = boxes[boxes[:, 8] > cover_thresh]
+        return boxes
+    def sort_poly(self, p):
+        """
+        Sort polygons.
+        """
+        min_axis = np.argmin(np.sum(p, axis=1))
+        p = p[[min_axis, (min_axis + 1) % 4, \
+               (min_axis + 2) % 4, (min_axis + 3) % 4]]
+        if abs(p[0, 0] - p[1, 0]) > abs(p[0, 1] - p[1, 1]):
+            return p
+        else:
+            return p[[0, 3, 2, 1]]
+    def __call__(self, outs_dict, shape_list):
+        score_list = outs_dict['f_score']
+        geo_list = outs_dict['f_geo']
+        if isinstance(score_list, torch.Tensor):
+            score_list = score_list.cpu().numpy()
+            geo_list = geo_list.cpu().numpy()
+        img_num = len(shape_list)
+        dt_boxes_list = []
+        for ino in range(img_num):
+            score = score_list[ino]
+            geo = geo_list[ino]
+            boxes = self.detect(
+                score_map=score,
+                geo_map=geo,
+                score_thresh=self.score_thresh,
+                cover_thresh=self.cover_thresh,
+                nms_thresh=self.nms_thresh)
+            boxes_norm = []
+            if len(boxes) > 0:
+                h, w = score.shape[1:]
+                src_h, src_w, ratio_h, ratio_w = shape_list[ino]
+                boxes = boxes[:, :8].reshape((-1, 4, 2))
+                boxes[:, :, 0] /= ratio_w
+                boxes[:, :, 1] /= ratio_h
+                for i_box, box in enumerate(boxes):
+                    box = self.sort_poly(box.astype(np.int32))
+                    if np.linalg.norm(box[0] - box[1]) < 5 \
+                            or np.linalg.norm(box[3] - box[0]) < 5:
+                        continue
+                    boxes_norm.append(box)
+            dt_boxes_list.append({'points': np.array(boxes_norm)})
+        return dt_boxes_list
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/fce_postprocess.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/fce_postprocess.py
+"""
+This code is adapted from:
+https://github.com/open-mmlab/mmocr/blob/v0.3.0/mmocr/models/textdet/postprocess/wrapper.py
+"""
+import cv2
+import torch
+import numpy as np
+from numpy.fft import ifft
+from pytorchocr.utils.poly_nms import poly_nms, valid_boundary
+def fill_hole(input_mask):
+    h, w = input_mask.shape
+    canvas = np.zeros((h + 2, w + 2), np.uint8)
+    canvas[1:h + 1, 1:w + 1] = input_mask.copy()
+    mask = np.zeros((h + 4, w + 4), np.uint8)
+    cv2.floodFill(canvas, mask, (0, 0), 1)
+    canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool)
+    return ~canvas | input_mask
+def fourier2poly(fourier_coeff, num_reconstr_points=50):
+    """ Inverse Fourier transform
+        Args:
+            fourier_coeff (ndarray): Fourier coefficients shaped (n, 2k+1),
+                with n and k being candidates number and Fourier degree
+                respectively.
+            num_reconstr_points (int): Number of reconstructed polygon points.
+        Returns:
+            Polygons (ndarray): The reconstructed polygons shaped (n, n')
+        """
+    a = np.zeros((len(fourier_coeff), num_reconstr_points), dtype='complex')
+    k = (len(fourier_coeff[0]) - 1) // 2
+    a[:, 0:k + 1] = fourier_coeff[:, k:]
+    a[:, -k:] = fourier_coeff[:, :k]
+    poly_complex = ifft(a) * num_reconstr_points
+    polygon = np.zeros((len(fourier_coeff), num_reconstr_points, 2))
+    polygon[:, :, 0] = poly_complex.real
+    polygon[:, :, 1] = poly_complex.imag
+    return polygon.astype('int32').reshape((len(fourier_coeff), -1))
+class FCEPostProcess(object):
+    """
+    The post process for FCENet.
+    """
+    def __init__(self,
+                 scales,
+                 fourier_degree=5,
+                 num_reconstr_points=50,
+                 decoding_type='fcenet',
+                 score_thr=0.3,
+                 nms_thr=0.1,
+                 alpha=1.0,
+                 beta=1.0,
+                 box_type='poly',
+                 **kwargs):
+        self.scales = scales
+        self.fourier_degree = fourier_degree
+        self.num_reconstr_points = num_reconstr_points
+        self.decoding_type = decoding_type
+        self.score_thr = score_thr
+        self.nms_thr = nms_thr
+        self.alpha = alpha
+        self.beta = beta
+        self.box_type = box_type
+    def __call__(self, preds, shape_list):
+        score_maps = []
+        for key, value in preds.items():
+            if isinstance(value, torch.Tensor):
+                value = value.numpy()
+            cls_res = value[:, :4, :, :]
+            reg_res = value[:, 4:, :, :]
+            score_maps.append([cls_res, reg_res])
+        return self.get_boundary(score_maps, shape_list)
+    def resize_boundary(self, boundaries, scale_factor):
+        """Rescale boundaries via scale_factor.
+        Args:
+            boundaries (list[list[float]]): The boundary list. Each boundary
+            with size 2k+1 with k>=4.
+            scale_factor(ndarray): The scale factor of size (4,).
+        Returns:
+            boundaries (list[list[float]]): The scaled boundaries.
+        """
+        boxes = []
+        scores = []
+        for b in boundaries:
+            sz = len(b)
+            valid_boundary(b, True)
+            scores.append(b[-1])
+            b = (np.array(b[:sz - 1]) *
+                 (np.tile(scale_factor[:2], int(
+                     (sz - 1) / 2)).reshape(1, sz - 1))).flatten().tolist()
+            boxes.append(np.array(b).reshape([-1, 2]))
+        return np.array(boxes, dtype=np.float32), scores
+    def get_boundary(self, score_maps, shape_list):
+        assert len(score_maps) == len(self.scales)
+        boundaries = []
+        for idx, score_map in enumerate(score_maps):
+            scale = self.scales[idx]
+            boundaries = boundaries + self._get_boundary_single(score_map,
+                                                                scale)
+        # nms
+        boundaries = poly_nms(boundaries, self.nms_thr)
+        boundaries, scores = self.resize_boundary(
+            boundaries, (1 / shape_list[0, 2:]).tolist()[::-1])
+        boxes_batch = [dict(points=boundaries, scores=scores)]
+        return boxes_batch
+    def _get_boundary_single(self, score_map, scale):
+        assert len(score_map) == 2
+        assert score_map[1].shape[1] == 4 * self.fourier_degree + 2
+        return self.fcenet_decode(
+            preds=score_map,
+            fourier_degree=self.fourier_degree,
+            num_reconstr_points=self.num_reconstr_points,
+            scale=scale,
+            alpha=self.alpha,
+            beta=self.beta,
+            box_type=self.box_type,
+            score_thr=self.score_thr,
+            nms_thr=self.nms_thr)
+    def fcenet_decode(self,
+                      preds,
+                      fourier_degree,
+                      num_reconstr_points,
+                      scale,
+                      alpha=1.0,
+                      beta=2.0,
+                      box_type='poly',
+                      score_thr=0.3,
+                      nms_thr=0.1):
+        """Decoding predictions of FCENet to instances.
+        Args:
+            preds (list(Tensor)): The head output tensors.
+            fourier_degree (int): The maximum Fourier transform degree k.
+            num_reconstr_points (int): The points number of the polygon
+                reconstructed from predicted Fourier coefficients.
+            scale (int): The down-sample scale of the prediction.
+            alpha (float) : The parameter to calculate final scores. Score_{final}
+                    = (Score_{text region} ^ alpha)
+                    * (Score_{text center region}^ beta)
+            beta (float) : The parameter to calculate final score.
+            box_type (str):  Boundary encoding type 'poly' or 'quad'.
+            score_thr (float) : The threshold used to filter out the final
+                candidates.
+            nms_thr (float) :  The threshold of nms.
+        Returns:
+            boundaries (list[list[float]]): The instance boundary and confidence
+                list.
+        """
+        assert isinstance(preds, list)
+        assert len(preds) == 2
+        assert box_type in ['poly', 'quad']
+        cls_pred = preds[0][0]
+        tr_pred = cls_pred[0:2]
+        tcl_pred = cls_pred[2:]
+        reg_pred = preds[1][0].transpose([1, 2, 0])
+        x_pred = reg_pred[:, :, :2 * fourier_degree + 1]
+        y_pred = reg_pred[:, :, 2 * fourier_degree + 1:]
+        score_pred = (tr_pred[1]**alpha) * (tcl_pred[1]**beta)
+        tr_pred_mask = (score_pred) > score_thr
+        tr_mask = fill_hole(tr_pred_mask)
+        tr_contours, _ = cv2.findContours(
+            tr_mask.astype(np.uint8), cv2.RETR_TREE,
+            cv2.CHAIN_APPROX_SIMPLE)  # opencv4
+        mask = np.zeros_like(tr_mask)
+        boundaries = []
+        for cont in tr_contours:
+            deal_map = mask.copy().astype(np.int8)
+            cv2.drawContours(deal_map, [cont], -1, 1, -1)
+            score_map = score_pred * deal_map
+            score_mask = score_map > 0
+            xy_text = np.argwhere(score_mask)
+            dxy = xy_text[:, 1] + xy_text[:, 0] * 1j
+            x, y = x_pred[score_mask], y_pred[score_mask]
+            c = x + y * 1j
+            c[:, fourier_degree] = c[:, fourier_degree] + dxy
+            c *= scale
+            polygons = fourier2poly(c, num_reconstr_points)
+            score = score_map[score_mask].reshape(-1, 1)
+            polygons = poly_nms(np.hstack((polygons, score)).tolist(), nms_thr)
+            boundaries = boundaries + polygons
+        boundaries = poly_nms(boundaries, nms_thr)
+        if box_type == 'quad':
+            new_boundaries = []
+            for boundary in boundaries:
+                poly = np.array(boundary[:-1]).reshape(-1, 2).astype(np.float32)
+                score = boundary[-1]
+                points = cv2.boxPoints(cv2.minAreaRect(poly))
+                points = np.int0(points)
+                new_boundaries.append(points.reshape(-1).tolist() + [score])
+                boundaries = new_boundaries
+        return boundaries
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/locality_aware_nms.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/locality_aware_nms.py
+"""
+Locality aware nms.
+"""
+import numpy as np
+from shapely.geometry import Polygon
+def intersection(g, p):
+    """
+    Intersection.
+    """
+    g = Polygon(g[:8].reshape((4, 2)))
+    p = Polygon(p[:8].reshape((4, 2)))
+    g = g.buffer(0)
+    p = p.buffer(0)
+    if not g.is_valid or not p.is_valid:
+        return 0
+    inter = Polygon(g).intersection(Polygon(p)).area
+    union = g.area + p.area - inter
+    if union == 0:
+        return 0
+    else:
+        return inter / union
+def intersection_iog(g, p):
+    """
+    Intersection_iog.
+    """
+    g = Polygon(g[:8].reshape((4, 2)))
+    p = Polygon(p[:8].reshape((4, 2)))
+    if not g.is_valid or not p.is_valid:
+        return 0
+    inter = Polygon(g).intersection(Polygon(p)).area
+    #union = g.area + p.area - inter
+    union = p.area
+    if union == 0:
+        print("p_area is very small")
+        return 0
+    else:
+        return inter / union
+def weighted_merge(g, p):
+    """
+    Weighted merge.
+    """
+    g[:8] = (g[8] * g[:8] + p[8] * p[:8]) / (g[8] + p[8])
+    g[8] = (g[8] + p[8])
+    return g
+def standard_nms(S, thres):
+    """
+    Standard nms.
+    """
+    order = np.argsort(S[:, 8])[::-1]
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)
+        ovr = np.array([intersection(S[i], S[t]) for t in order[1:]])
+        inds = np.where(ovr <= thres)[0]
+        order = order[inds + 1]
+    return S[keep]
+def standard_nms_inds(S, thres):
+    """
+    Standard nms, retun inds.
+    """
+    order = np.argsort(S[:, 8])[::-1]
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)
+        ovr = np.array([intersection(S[i], S[t]) for t in order[1:]])
+        inds = np.where(ovr <= thres)[0]
+        order = order[inds + 1]
+    return keep
+def nms(S, thres):
+    """
+    nms.
+    """
+    order = np.argsort(S[:, 8])[::-1]
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)
+        ovr = np.array([intersection(S[i], S[t]) for t in order[1:]])
+        inds = np.where(ovr <= thres)[0]
+        order = order[inds + 1]
+    return keep
+def soft_nms(boxes_in, Nt_thres=0.3, threshold=0.8, sigma=0.5, method=2):
+    """
+    Soft nms on a copy of boxes_in; returns the surviving rows.
+    :param boxes_in: N x 9 array (8 coordinates + score)
+    :param Nt_thres: IoU threshold used by the linear (method 1) and the
+        hard (any other method) weighting
+    :param threshold: boxes whose decayed score falls below it are dropped
+    :param sigma: divisor of the gaussian weight (method 2)
+    :param method: 1 = linear decay, 2 = gaussian decay, otherwise hard nms
+    :return: the first N rows of the working copy, N being the number of
+        boxes still active
+    """
+    boxes = boxes_in.copy()
+    N = boxes.shape[0]
+    if N is None or N < 1:
+        return np.array([])
+    pos, maxpos = 0, 0
+    weight = 0.0
+    # inds follows the original row index through the swaps below; it is
+    # kept up to date but not returned.
+    inds = np.arange(N)
+    tbox, sbox = boxes[0].copy(), boxes[0].copy()
+    # NOTE(review): range(N) is fixed at the initial N even though N
+    # shrinks inside the loop - confirm the trailing iterations are no-ops.
+    for i in range(N):
+        maxscore = boxes[i, 8]
+        maxpos = i
+        tbox = boxes[i].copy()
+        ti = inds[i]
+        pos = i + 1
+        # find the highest-scoring box among positions i..N-1
+        while pos < N:
+            if maxscore < boxes[pos, 8]:
+                maxscore = boxes[pos, 8]
+                maxpos = pos
+            pos = pos + 1
+        # move that box to position i
+        boxes[i, :] = boxes[maxpos, :]
+        inds[i] = inds[maxpos]
+        # and put the box that was at i where the max came from
+        boxes[maxpos, :] = tbox
+        inds[maxpos] = ti
+        tbox = boxes[i].copy()
+        pos = i + 1
+        # decay the score of every later box that overlaps box i
+        while pos < N:
+            sbox = boxes[pos].copy()
+            ts_iou_val = intersection(tbox, sbox)
+            if ts_iou_val > 0:
+                if method == 1:
+                    # linear: weight 1 - iou above the IoU threshold
+                    if ts_iou_val > Nt_thres:
+                        weight = 1 - ts_iou_val
+                    else:
+                        weight = 1
+                elif method == 2:
+                    # gaussian: exp(-iou^2 / sigma)
+                    weight = np.exp(-1.0 * ts_iou_val**2 / sigma)
+                else:
+                    # hard nms: zero the score above the IoU threshold
+                    if ts_iou_val > Nt_thres:
+                        weight = 0
+                    else:
+                        weight = 1
+                boxes[pos, 8] = weight * boxes[pos, 8]
+                # if the score falls below threshold, discard the box by
+                # overwriting it with the last active box and shrinking N;
+                # pos is stepped back so the moved-in box is examined too
+                if boxes[pos, 8] < threshold:
+                    boxes[pos, :] = boxes[N - 1, :]
+                    inds[pos] = inds[N - 1]
+                    N = N - 1
+                    pos = pos - 1
+            pos = pos + 1
+    return boxes[:N]
+def nms_locality(polys, thres=0.3):
+    """
+    locality aware nms of EAST
+    :param polys: a N*9 numpy array. first 8 coordinates, then prob
+    :return: boxes after nms
+    """
+    S = []
+    p = None
+    for g in polys:
+        if p is not None and intersection(g, p) > thres:
+            p = weighted_merge(g, p)
+        else:
+            if p is not None:
+                S.append(p)
+            p = g
+    if p is not None:
+        S.append(p)
+    if len(S) == 0:
+        return np.array([])
+    return standard_nms(np.array(S), thres)
+if __name__ == '__main__':
+    # 343,350,448,135,474,143,369,359
+    print(
+        Polygon(np.array([[343, 350], [448, 135], [474, 143], [369, 359]]))
+        .area)
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pg_postprocess.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pg_postprocess.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+# Put this file's directory and its parent on sys.path before the
+# pytorchocr import below runs.
+# NOTE(review): a module-level `__dir__` is also the name Python looks up
+# for dir(module) (PEP 562); binding a string to it looks unintended -
+# confirm before renaming.
+__dir__ = os.path.dirname(__file__)
+sys.path.append(__dir__)
+sys.path.append(os.path.join(__dir__, '..'))
+from pytorchocr.utils.e2e_utils.pgnet_pp_utils import PGNet_PostProcess
+class PGPostProcess(object):
+    """
+    The post process for PGNet.
+    """
+    def __init__(self, character_dict_path, valid_set, score_thresh, mode,
+                 **kwargs):
+        self.character_dict_path = character_dict_path
+        self.valid_set = valid_set
+        self.score_thresh = score_thresh
+        self.mode = mode
+        # c++ la-nms is faster, but only support python 3.5
+        self.is_python35 = False
+        if sys.version_info.major == 3 and sys.version_info.minor == 5:
+            self.is_python35 = True
+    def __call__(self, outs_dict, shape_list):
+        post = PGNet_PostProcess(self.character_dict_path, self.valid_set,
+                                 self.score_thresh, outs_dict, shape_list)
+        if self.mode == 'fast':
+            data = post.pg_postprocess_fast()
+        else:
+            data = post.pg_postprocess_slow()
+        return data
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/__init__.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .pse_postprocess import PSEPostProcess
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/README.md
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/README.md
+## 编译
+This code is adapted from:
+https://github.com/whai362/PSENet/blob/python3/models/post_processing/pse
+```bash
+python3 setup.py build_ext --inplace
+```
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/__init__.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/__init__.py
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import subprocess
+python_path = sys.executable
+ori_path = os.getcwd()
+os.chdir('pytorchocr/postprocess/pse_postprocess/pse')
+if subprocess.call(
+        '{} setup.py build_ext --inplace'.format(python_path), shell=True) != 0:
+    raise RuntimeError(
+        'Cannot compile pse: {}, if your system is windows, you need to install all the default components of `desktop development using C++` in visual studio 2019+'.
+        format(os.path.dirname(os.path.realpath(__file__))))
+os.chdir(ori_path)
+from .pse import pse
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/pse.pyx
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/pse.pyx
+import numpy as np
+import cv2
+cimport numpy as np
+cimport cython
+cimport libcpp
+cimport libcpp.pair
+cimport libcpp.queue
+from libcpp.pair cimport *
+from libcpp.queue  cimport *
+# Grow the labelled seed regions in `label` outwards through the kernel
+# maps, one kernel after another, and return the resulting label map.
+#   kernels:    stack of kernel maps; a pixel may be claimed while
+#               processing kernel k only if kernels[k, x, y] is non-zero.
+#   label:      seed label map; modified in place (small components are
+#               zeroed).
+#   kernel_num: the kernel loop runs from kernel_num - 1 down to 0.
+#   label_num:  labels 1 .. label_num - 1 are checked against min_area.
+#   min_area:   components with fewer pixels than this are removed.
+# NOTE(review): bounds checking and negative-index wraparound are switched
+# off here, and pse() in this file passes kernels[:-1] together with the
+# length measured before slicing, so the first kernel index equals the
+# length of the array received - confirm this is intended.
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef np.ndarray[np.int32_t, ndim=2] _pse(np.ndarray[np.uint8_t, ndim=3] kernels,
+                                         np.ndarray[np.int32_t, ndim=2] label,
+                                         int kernel_num,
+                                         int label_num,
+                                         float min_area=0):
+    cdef np.ndarray[np.int32_t, ndim=2] pred
+    pred = np.zeros((label.shape[0], label.shape[1]), dtype=np.int32)
+    # Drop the seed components that are smaller than min_area.
+    for label_idx in range(1, label_num):
+        if np.sum(label == label_idx) < min_area:
+            label[label == label_idx] = 0
+    # que holds the pixels to expand for the current kernel, nxt_que the
+    # ones carried over to the next kernel.
+    # NOTE(review): coordinates are stored as int16, so a map side above
+    # 32767 would not fit - confirm inputs stay below that.
+    cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] que = \
+        queue[libcpp.pair.pair[np.int16_t,np.int16_t]]()
+    cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] nxt_que = \
+        queue[libcpp.pair.pair[np.int16_t,np.int16_t]]()
+    # Offsets of the 4-connected neighbours.
+    cdef np.int16_t* dx = [-1, 1, 0, 0]
+    cdef np.int16_t* dy = [0, 0, -1, 1]
+    cdef np.int16_t tmpx, tmpy
+    # Seed the queue and the output with every labelled pixel.
+    points = np.array(np.where(label > 0)).transpose((1, 0))
+    for point_idx in range(points.shape[0]):
+        tmpx, tmpy = points[point_idx, 0], points[point_idx, 1]
+        que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy))
+        pred[tmpx, tmpy] = label[tmpx, tmpy]
+    cdef libcpp.pair.pair[np.int16_t,np.int16_t] cur
+    cdef int cur_label
+    for kernel_idx in range(kernel_num - 1, -1, -1):
+        # Breadth-first expansion restricted to the current kernel.
+        while not que.empty():
+            cur = que.front()
+            que.pop()
+            cur_label = pred[cur.first, cur.second]
+            is_edge = True
+            for j in range(4):
+                tmpx = cur.first + dx[j]
+                tmpy = cur.second + dy[j]
+                # Skip neighbours that fall outside the map.
+                if tmpx < 0 or tmpx >= label.shape[0] or tmpy < 0 or tmpy >= label.shape[1]:
+                    continue
+                # Skip neighbours outside this kernel or already labelled.
+                if kernels[kernel_idx, tmpx, tmpy] == 0 or pred[tmpx, tmpy] > 0:
+                    continue
+                que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy))
+                pred[tmpx, tmpy] = cur_label
+                is_edge = False
+            # A pixel that claimed no neighbour is kept for the next kernel.
+            if is_edge:
+                nxt_que.push(cur)
+        que, nxt_que = nxt_que, que
+    return pred
+# Python entry point: label the 4-connected components of the last kernel
+# map and let _pse expand them through the remaining kernel maps.
+# NOTE(review): kernel_num is the number of kernels before the last one
+# is sliced off, while _pse receives kernels[:-1] - confirm the count
+# passed on is the intended one.
+def pse(kernels, min_area):
+    kernel_num = kernels.shape[0]
+    label_num, label = cv2.connectedComponents(kernels[-1], connectivity=4)
+    return _pse(kernels[:-1], label, kernel_num, label_num, min_area)
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/setup.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/pse_postprocess/pse/setup.py
+from distutils.core import setup, Extension
+from Cython.Build import cythonize
+import numpy
+setup(ext_modules=cythonize(Extension(
+    'pse',
+    sources=['pse.pyx'],
+    language='c++',
+    include_dirs=[numpy.get_include()],
+    library_dirs=[],
+    libraries=[],
+    extra_compile_args=['-O3'],
+    extra_link_args=[]
+)))