feat(model): add OCR model base structure and utilities

- Add base model structure for OCR in PyTorch
- Implement data augmentation and transformation modules
- Create utilities for dictionary handling and state dict conversion
- Include post-processing modules for OCR
- Add weight initialization and loading functions

feat(model): add OCR model base structure and utilities
- Add base model structure for OCR in PyTorch
- Implement data augmentation and transformation modules
- Create utilities for dictionary handling and state dict conversion
- Include post-processing modules for OCR
- Add weight initialization and loading functions
a7a899f6 · myhloli · 72e66c2d · a7a899f6 · a7a899f6 · a7a899f6
Commit a7a899f6 authored Mar 27, 2025 by myhloli
20 changed files
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+# from paddle.nn.initializer import Constant, KaimingNormal
+# from paddle.nn import AdaptiveAvgPool2D, BatchNorm2D, Conv2D, Dropout, Hardsigmoid, Hardswish, Identity, Linear, ReLU
+# from paddle.regularizer import L2Decay
+# Stage tables consumed by PPLCNetV3. Each key ("blocks2" .. "blocks6") maps
+# to a list of rows, and every row describes one LCNetV3Block as
+# [k, in_c, out_c, s, use_se]: PPLCNetV3 passes k as the depthwise kernel
+# size, s as the stride and use_se as the SE switch, and runs in_c / out_c
+# through make_divisible after multiplying them by the model scale.
+#
+# NET_CONFIG_det is selected when PPLCNetV3 is built with det=True; all of
+# its strides are plain integers.
+NET_CONFIG_det = {
+    "blocks2":
+    #k, in_c, out_c, s, use_se
+    [[3, 16, 32, 1, False]],
+    "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]],
+    "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]],
+    "blocks5":
+    [[3, 128, 256, 2, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False],
+     [5, 256, 256, 1, False], [5, 256, 256, 1, False]],
+    "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True],
+                [5, 512, 512, 1, False], [5, 512, 512, 1, False]]
+}
+# NET_CONFIG_rec is selected when det=False. Some strides are 2-tuples such
+# as (2, 1) or (1, 2), i.e. the two spatial axes are strided differently.
+NET_CONFIG_rec = {
+    "blocks2":
+    #k, in_c, out_c, s, use_se
+    [[3, 16, 32, 1, False]],
+    "blocks3": [[3, 32, 64, 1, False], [3, 64, 64, 1, False]],
+    "blocks4": [[3, 64, 128, (2, 1), False], [3, 128, 128, 1, False]],
+    "blocks5":
+    [[3, 128, 256, (1, 2), False], [5, 256, 256, 1, False],
+     [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False]],
+    "blocks6": [[5, 256, 512, (2, 1), True], [5, 512, 512, 1, True],
+                [5, 512, 512, (2, 1), False], [5, 512, 512, 1, False]]
+}
+def make_divisible(v, divisor=16, min_value=None):
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+class LearnableAffineBlock(nn.Module):
+    def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0,
+                 lab_lr=0.1):
+        super().__init__()
+        self.scale = nn.Parameter(torch.Tensor([scale_value]))
+        self.bias = nn.Parameter(torch.Tensor([bias_value]))
+    def forward(self, x):
+        return self.scale * x + self.bias
+class ConvBNLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 groups=1,
+                 lr_mult=1.0):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=(kernel_size - 1) // 2,
+            groups=groups,
+            bias=False)
+        self.bn = nn.BatchNorm2d(
+            out_channels,
+        )
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        return x
+class Act(nn.Module):
+    def __init__(self, act="hswish", lr_mult=1.0, lab_lr=0.1):
+        super().__init__()
+        if act == "hswish":
+            self.act = nn.Hardswish(inplace=True)
+        else:
+            assert act == "relu"
+            self.act = Activation(act)
+        self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr)
+    def forward(self, x):
+        return self.lab(self.act(x))
+class LearnableRepLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 groups=1,
+                 num_conv_branches=1,
+                 lr_mult=1.0,
+                 lab_lr=0.1):
+        super().__init__()
+        self.is_repped = False
+        self.groups = groups
+        self.stride = stride
+        self.kernel_size = kernel_size
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_conv_branches = num_conv_branches
+        self.padding = (kernel_size - 1) // 2
+        self.identity = nn.BatchNorm2d(
+            num_features=in_channels,
+        ) if out_channels == in_channels and stride == 1 else None
+        self.conv_kxk = nn.ModuleList([
+            ConvBNLayer(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                groups=groups,
+                lr_mult=lr_mult) for _ in range(self.num_conv_branches)
+        ])
+        self.conv_1x1 = ConvBNLayer(
+            in_channels,
+            out_channels,
+            1,
+            stride,
+            groups=groups,
+            lr_mult=lr_mult) if kernel_size > 1 else None
+        self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr)
+        self.act = Act(lr_mult=lr_mult, lab_lr=lab_lr)
+    def forward(self, x):
+        # for export
+        if self.is_repped:
+            out = self.lab(self.reparam_conv(x))
+            if self.stride != 2:
+                out = self.act(out)
+            return out
+        out = 0
+        if self.identity is not None:
+            out += self.identity(x)
+        if self.conv_1x1 is not None:
+            out += self.conv_1x1(x)
+        for conv in self.conv_kxk:
+            out += conv(x)
+        out = self.lab(out)
+        if self.stride != 2:
+            out = self.act(out)
+        return out
+    def rep(self):
+        if self.is_repped:
+            return
+        kernel, bias = self._get_kernel_bias()
+        self.reparam_conv = nn.Conv2d(
+            in_channels=self.in_channels,
+            out_channels=self.out_channels,
+            kernel_size=self.kernel_size,
+            stride=self.stride,
+            padding=self.padding,
+            groups=self.groups)
+        self.reparam_conv.weight.data = kernel
+        self.reparam_conv.bias.data = bias
+        self.is_repped = True
+    def _pad_kernel_1x1_to_kxk(self, kernel1x1, pad):
+        if not isinstance(kernel1x1, torch.Tensor):
+            return 0
+        else:
+            return nn.functional.pad(kernel1x1, [pad, pad, pad, pad])
+    def _get_kernel_bias(self):
+        kernel_conv_1x1, bias_conv_1x1 = self._fuse_bn_tensor(self.conv_1x1)
+        kernel_conv_1x1 = self._pad_kernel_1x1_to_kxk(kernel_conv_1x1,
+                                                      self.kernel_size // 2)
+        kernel_identity, bias_identity = self._fuse_bn_tensor(self.identity)
+        kernel_conv_kxk = 0
+        bias_conv_kxk = 0
+        for conv in self.conv_kxk:
+            kernel, bias = self._fuse_bn_tensor(conv)
+            kernel_conv_kxk += kernel
+            bias_conv_kxk += bias
+        kernel_reparam = kernel_conv_kxk + kernel_conv_1x1 + kernel_identity
+        bias_reparam = bias_conv_kxk + bias_conv_1x1 + bias_identity
+        return kernel_reparam, bias_reparam
+    def _fuse_bn_tensor(self, branch):
+        if not branch:
+            return 0, 0
+        elif isinstance(branch, ConvBNLayer):
+            kernel = branch.conv.weight
+            running_mean = branch.bn._mean
+            running_var = branch.bn._variance
+            gamma = branch.bn.weight
+            beta = branch.bn.bias
+            eps = branch.bn._epsilon
+        else:
+            assert isinstance(branch, nn.BatchNorm2d)
+            if not hasattr(self, 'id_tensor'):
+                input_dim = self.in_channels // self.groups
+                kernel_value = torch.zeros(
+                    (self.in_channels, input_dim, self.kernel_size,
+                     self.kernel_size),
+                    dtype=branch.weight.dtype)
+                for i in range(self.in_channels):
+                    kernel_value[i, i % input_dim, self.kernel_size // 2,
+                                 self.kernel_size // 2] = 1
+                self.id_tensor = kernel_value
+            kernel = self.id_tensor
+            running_mean = branch._mean
+            running_var = branch._variance
+            gamma = branch.weight
+            beta = branch.bias
+            eps = branch._epsilon
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape((-1, 1, 1, 1))
+        return kernel * t, beta - running_mean * gamma / std
+class SELayer(nn.Module):
+    def __init__(self, channel, reduction=4, lr_mult=1.0):
+        super().__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.conv1 = nn.Conv2d(
+            in_channels=channel,
+            out_channels=channel // reduction,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        self.relu = nn.ReLU()
+        self.conv2 = nn.Conv2d(
+            in_channels=channel // reduction,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        self.hardsigmoid = nn.Hardsigmoid(inplace=True)
+    def forward(self, x):
+        identity = x
+        x = self.avg_pool(x)
+        x = self.conv1(x)
+        x = self.relu(x)
+        x = self.conv2(x)
+        x = self.hardsigmoid(x)
+        x = identity * x
+        return x
+class LCNetV3Block(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride,
+                 dw_size,
+                 use_se=False,
+                 conv_kxk_num=4,
+                 lr_mult=1.0,
+                 lab_lr=0.1):
+        super().__init__()
+        self.use_se = use_se
+        self.dw_conv = LearnableRepLayer(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=dw_size,
+            stride=stride,
+            groups=in_channels,
+            num_conv_branches=conv_kxk_num,
+            lr_mult=lr_mult,
+            lab_lr=lab_lr)
+        if use_se:
+            self.se = SELayer(in_channels, lr_mult=lr_mult)
+        self.pw_conv = LearnableRepLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            num_conv_branches=conv_kxk_num,
+            lr_mult=lr_mult,
+            lab_lr=lab_lr)
+    def forward(self, x):
+        x = self.dw_conv(x)
+        if self.use_se:
+            x = self.se(x)
+        x = self.pw_conv(x)
+        return x
+class PPLCNetV3(nn.Module):
+    def __init__(self,
+                 scale=1.0,
+                 conv_kxk_num=4,
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+                 lab_lr=0.1,
+                 det=False,
+                 **kwargs):
+        super().__init__()
+        self.scale = scale
+        self.lr_mult_list = lr_mult_list
+        self.det = det
+        self.net_config = NET_CONFIG_det if self.det else NET_CONFIG_rec
+        assert isinstance(self.lr_mult_list, (
+            list, tuple
+        )), "lr_mult_list should be in (list, tuple) but got {}".format(
+            type(self.lr_mult_list))
+        assert len(self.lr_mult_list
+                   ) == 6, "lr_mult_list length should be 6 but got {}".format(
+                       len(self.lr_mult_list))
+        self.conv1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=make_divisible(16 * scale),
+            kernel_size=3,
+            stride=2,
+            lr_mult=self.lr_mult_list[0])
+        self.blocks2 = nn.Sequential(*[
+            LCNetV3Block(
+                in_channels=make_divisible(in_c * scale),
+                out_channels=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                conv_kxk_num=conv_kxk_num,
+                lr_mult=self.lr_mult_list[1],
+                lab_lr=lab_lr)
+            for i, (k, in_c, out_c, s, se
+                    ) in enumerate(self.net_config["blocks2"])
+        ])
+        self.blocks3 = nn.Sequential(*[
+            LCNetV3Block(
+                in_channels=make_divisible(in_c * scale),
+                out_channels=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                conv_kxk_num=conv_kxk_num,
+                lr_mult=self.lr_mult_list[2],
+                lab_lr=lab_lr)
+            for i, (k, in_c, out_c, s, se
+                    ) in enumerate(self.net_config["blocks3"])
+        ])
+        self.blocks4 = nn.Sequential(*[
+            LCNetV3Block(
+                in_channels=make_divisible(in_c * scale),
+                out_channels=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                conv_kxk_num=conv_kxk_num,
+                lr_mult=self.lr_mult_list[3],
+                lab_lr=lab_lr)
+            for i, (k, in_c, out_c, s, se
+                    ) in enumerate(self.net_config["blocks4"])
+        ])
+        self.blocks5 = nn.Sequential(*[
+            LCNetV3Block(
+                in_channels=make_divisible(in_c * scale),
+                out_channels=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                conv_kxk_num=conv_kxk_num,
+                lr_mult=self.lr_mult_list[4],
+                lab_lr=lab_lr)
+            for i, (k, in_c, out_c, s, se
+                    ) in enumerate(self.net_config["blocks5"])
+        ])
+        self.blocks6 = nn.Sequential(*[
+            LCNetV3Block(
+                in_channels=make_divisible(in_c * scale),
+                out_channels=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                conv_kxk_num=conv_kxk_num,
+                lr_mult=self.lr_mult_list[5],
+                lab_lr=lab_lr)
+            for i, (k, in_c, out_c, s, se
+                    ) in enumerate(self.net_config["blocks6"])
+        ])
+        self.out_channels = make_divisible(512 * scale)
+        if self.det:
+            mv_c = [16, 24, 56, 480]
+            self.out_channels = [
+                make_divisible(self.net_config["blocks3"][-1][2] * scale),
+                make_divisible(self.net_config["blocks4"][-1][2] * scale),
+                make_divisible(self.net_config["blocks5"][-1][2] * scale),
+                make_divisible(self.net_config["blocks6"][-1][2] * scale),
+            ]
+            self.layer_list = nn.ModuleList([
+                nn.Conv2d(self.out_channels[0], int(mv_c[0] * scale), 1, 1, 0),
+                nn.Conv2d(self.out_channels[1], int(mv_c[1] * scale), 1, 1, 0),
+                nn.Conv2d(self.out_channels[2], int(mv_c[2] * scale), 1, 1, 0),
+                nn.Conv2d(self.out_channels[3], int(mv_c[3] * scale), 1, 1, 0)
+            ])
+            self.out_channels = [
+                int(mv_c[0] * scale), int(mv_c[1] * scale),
+                int(mv_c[2] * scale), int(mv_c[3] * scale)
+            ]
+    def forward(self, x):
+        out_list = []
+        x = self.conv1(x)
+        x = self.blocks2(x)
+        x = self.blocks3(x)
+        out_list.append(x)
+        x = self.blocks4(x)
+        out_list.append(x)
+        x = self.blocks5(x)
+        out_list.append(x)
+        x = self.blocks6(x)
+        out_list.append(x)
+        if self.det:
+            out_list[0] = self.layer_list[0](out_list[0])
+            out_list[1] = self.layer_list[1](out_list[1])
+            out_list[2] = self.layer_list[2](out_list[2])
+            out_list[3] = self.layer_list[3](out_list[3])
+            return out_list
+        if self.training:
+            x = F.adaptive_avg_pool2d(x, [1, 40])
+        else:
+            x = F.avg_pool2d(x, [3, 2])
+        return x
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3_bak.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3_bak.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+# Stage tables consumed by PPLCNetV3. Each key ("blocks2" .. "blocks6") maps
+# to a list of rows, and every row describes one LCNetV3Block as
+# [k, in_c, out_c, s, use_se]: PPLCNetV3 passes k as the depthwise kernel
+# size, s as the stride and use_se as the SE switch, and runs in_c / out_c
+# through make_divisible after multiplying them by the model scale.
+#
+# NET_CONFIG_det is selected when PPLCNetV3 is built with det=True; all of
+# its strides are plain integers.
+NET_CONFIG_det = {
+    "blocks2":
+    # k, in_c, out_c, s, use_se
+        [[3, 16, 32, 1, False]],
+    "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]],
+    "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]],
+    "blocks5":
+        [[3, 128, 256, 2, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False],
+         [5, 256, 256, 1, False], [5, 256, 256, 1, False]],
+    "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True],
+                [5, 512, 512, 1, False], [5, 512, 512, 1, False]]
+}
+# NET_CONFIG_rec is selected when det=False. Some strides are 2-tuples such
+# as (2, 1) or (1, 2), i.e. the two spatial axes are strided differently.
+NET_CONFIG_rec = {
+    "blocks2":
+    # k, in_c, out_c, s, use_se
+        [[3, 16, 32, 1, False]],
+    "blocks3": [[3, 32, 64, 1, False], [3, 64, 64, 1, False]],
+    "blocks4": [[3, 64, 128, (2, 1), False], [3, 128, 128, 1, False]],
+    "blocks5":
+        [[3, 128, 256, (1, 2), False], [5, 256, 256, 1, False],
+         [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False]],
+    "blocks6": [[5, 256, 512, (2, 1), True], [5, 512, 512, 1, True],
+                [5, 512, 512, (2, 1), False], [5, 512, 512, 1, False]]
+}
+def make_divisible(v, divisor=16, min_value=None):
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+class LearnableAffineBlock(nn.Module):
+    def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0,
+                 lab_lr=0.1):
+        super().__init__()
+        self.scale = nn.Parameter(torch.Tensor([scale_value]))
+        self.bias = nn.Parameter(torch.Tensor([bias_value]))
+    def forward(self, x):
+        return self.scale * x + self.bias
+class ConvBNLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 groups=1,
+                 lr_mult=1.0):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=(kernel_size - 1) // 2,
+            groups=groups,
+            bias=False)
+        self.bn = nn.BatchNorm2d(out_channels)
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        return x
+class Act(nn.Module):
+    def __init__(self, act="hard_swish", lr_mult=1.0, lab_lr=0.1):
+        super().__init__()
+        assert act in ['hard_swish', 'relu']
+        self.act = Activation(act)
+        self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr)
+    def forward(self, x):
+        return self.lab(self.act(x))
+class LearnableRepLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 groups=1,
+                 num_conv_branches=1,
+                 lr_mult=1.0,
+                 lab_lr=0.1):
+        super().__init__()
+        self.is_repped = False
+        self.groups = groups
+        self.stride = stride
+        self.kernel_size = kernel_size
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_conv_branches = num_conv_branches
+        self.padding = (kernel_size - 1) // 2
+        self.identity = nn.BatchNorm2d(in_channels) if out_channels == in_channels and stride == 1 else None
+        self.conv_kxk = nn.ModuleList([
+            ConvBNLayer(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                groups=groups,
+                lr_mult=lr_mult) for _ in range(self.num_conv_branches)
+        ])
+        self.conv_1x1 = ConvBNLayer(
+            in_channels,
+            out_channels,
+            1,
+            stride,
+            groups=groups,
+            lr_mult=lr_mult) if kernel_size > 1 else None
+        self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr)
+        self.act = Act(lr_mult=lr_mult, lab_lr=lab_lr)
+    def forward(self, x):
+        # for export
+        if self.is_repped:
+            out = self.lab(self.reparam_conv(x))
+            if self.stride != 2:
+                out = self.act(out)
+            return out
+        out = 0
+        if self.identity is not None:
+            out += self.identity(x)
+        if self.conv_1x1 is not None:
+            out += self.conv_1x1(x)
+        for conv in self.conv_kxk:
+            out += conv(x)
+        out = self.lab(out)
+        if self.stride != 2:
+            out = self.act(out)
+        return out
+    def rep(self):
+        if self.is_repped:
+            return
+        kernel, bias = self._get_kernel_bias()
+        self.reparam_conv = nn.Conv2d(
+            in_channels=self.in_channels,
+            out_channels=self.out_channels,
+            kernel_size=self.kernel_size,
+            stride=self.stride,
+            padding=self.padding,
+            groups=self.groups)
+        self.reparam_conv.weight.data = kernel
+        self.reparam_conv.bias.data = bias
+        self.is_repped = True
+    def _pad_kernel_1x1_to_kxk(self, kernel1x1, pad):
+        if not isinstance(kernel1x1, torch.Tensor):
+            return 0
+        else:
+            return nn.functional.pad(kernel1x1, [pad, pad, pad, pad])
+    def _get_kernel_bias(self):
+        kernel_conv_1x1, bias_conv_1x1 = self._fuse_bn_tensor(self.conv_1x1)
+        kernel_conv_1x1 = self._pad_kernel_1x1_to_kxk(kernel_conv_1x1,
+                                                      self.kernel_size // 2)
+        kernel_identity, bias_identity = self._fuse_bn_tensor(self.identity)
+        kernel_conv_kxk = 0
+        bias_conv_kxk = 0
+        for conv in self.conv_kxk:
+            kernel, bias = self._fuse_bn_tensor(conv)
+            kernel_conv_kxk += kernel
+            bias_conv_kxk += bias
+        kernel_reparam = kernel_conv_kxk + kernel_conv_1x1 + kernel_identity
+        bias_reparam = bias_conv_kxk + bias_conv_1x1 + bias_identity
+        return kernel_reparam, bias_reparam
+    def _fuse_bn_tensor(self, branch):
+        if not branch:
+            return 0, 0
+        elif isinstance(branch, ConvBNLayer):
+            kernel = branch.conv.weight
+            running_mean = branch.bn.running_mean
+            running_var = branch.bn.running_var
+            gamma = branch.bn.weight
+            beta = branch.bn.bias
+            eps = branch.bn.eps
+        else:
+            assert isinstance(branch, nn.BatchNorm2d)
+            if not hasattr(self, 'id_tensor'):
+                input_dim = self.in_channels // self.groups
+                kernel_value = torch.zeros(
+                    (self.in_channels, input_dim, self.kernel_size,
+                     self.kernel_size),
+                    dtype=branch.weight.dtype)
+                for i in range(self.in_channels):
+                    kernel_value[i, i % input_dim, self.kernel_size // 2,
+                                    self.kernel_size // 2] = 1
+                self.id_tensor = kernel_value
+            kernel = self.id_tensor
+            running_mean = branch.running_mean
+            running_var = branch.running_var
+            gamma = branch.weight
+            beta = branch.bias
+            eps = branch.eps
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape((-1, 1, 1, 1))
+        return kernel * t, beta - running_mean * gamma / std
+class SELayer(nn.Module):
+    def __init__(self, channel, reduction=4, lr_mult=1.0):
+        super().__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.conv1 = nn.Conv2d(
+            in_channels=channel,
+            out_channels=channel // reduction,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        self.relu = nn.ReLU()
+        self.conv2 = nn.Conv2d(
+            in_channels=channel // reduction,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        self.hardsigmoid = Activation('hard_sigmoid')
+    def forward(self, x):
+        identity = x
+        x = self.avg_pool(x)
+        x = self.conv1(x)
+        x = self.relu(x)
+        x = self.conv2(x)
+        x = self.hardsigmoid(x)
+        x = x * identity
+        return x
+class LCNetV3Block(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride,
+                 dw_size,
+                 use_se=False,
+                 conv_kxk_num=4,
+                 lr_mult=1.0,
+                 lab_lr=0.1):
+        super().__init__()
+        self.use_se = use_se
+        self.dw_conv = LearnableRepLayer(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=dw_size,
+            stride=stride,
+            groups=in_channels,
+            num_conv_branches=conv_kxk_num,
+            lr_mult=lr_mult,
+            lab_lr=lab_lr)
+        if use_se:
+            self.se = SELayer(in_channels, lr_mult=lr_mult)
+        self.pw_conv = LearnableRepLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            num_conv_branches=conv_kxk_num,
+            lr_mult=lr_mult,
+            lab_lr=lab_lr)
+    def forward(self, x):
+        x = self.dw_conv(x)
+        if self.use_se:
+            x = self.se(x)
+        x = self.pw_conv(x)
+        return x
+class PPLCNetV3(nn.Module):
+    def __init__(self,
+                 scale=1.0,
+                 conv_kxk_num=4,
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+                 lab_lr=0.1,
+                 det=False,
+                 **kwargs):
+        super().__init__()
+        self.scale = scale
+        self.lr_mult_list = lr_mult_list
+        self.det = det
+        self.net_config = NET_CONFIG_det if self.det else NET_CONFIG_rec
+        assert isinstance(self.lr_mult_list, (
+            list, tuple
+        )), "lr_mult_list should be in (list, tuple) but got {}".format(
+            type(self.lr_mult_list))
+        assert len(self.lr_mult_list
+                   ) == 6, "lr_mult_list length should be 6 but got {}".format(
+            len(self.lr_mult_list))
+        self.conv1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=make_divisible(16 * scale),
+            kernel_size=3,
+            stride=2,
+            lr_mult=self.lr_mult_list[0])
+        self.blocks2 = nn.Sequential(*[
+            LCNetV3Block(
+                in_channels=make_divisible(in_c * scale),
+                out_channels=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                conv_kxk_num=conv_kxk_num,
+                lr_mult=self.lr_mult_list[1],
+                lab_lr=lab_lr)
+            for i, (k, in_c, out_c, s, se
+                    ) in enumerate(self.net_config["blocks2"])
+        ])
+        self.blocks3 = nn.Sequential(*[
+            LCNetV3Block(
+                in_channels=make_divisible(in_c * scale),
+                out_channels=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                conv_kxk_num=conv_kxk_num,
+                lr_mult=self.lr_mult_list[2],
+                lab_lr=lab_lr)
+            for i, (k, in_c, out_c, s, se
+                    ) in enumerate(self.net_config["blocks3"])
+        ])
+        self.blocks4 = nn.Sequential(*[
+            LCNetV3Block(
+                in_channels=make_divisible(in_c * scale),
+                out_channels=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                conv_kxk_num=conv_kxk_num,
+                lr_mult=self.lr_mult_list[3],
+                lab_lr=lab_lr)
+            for i, (k, in_c, out_c, s, se
+                    ) in enumerate(self.net_config["blocks4"])
+        ])
+        self.blocks5 = nn.Sequential(*[
+            LCNetV3Block(
+                in_channels=make_divisible(in_c * scale),
+                out_channels=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                conv_kxk_num=conv_kxk_num,
+                lr_mult=self.lr_mult_list[4],
+                lab_lr=lab_lr)
+            for i, (k, in_c, out_c, s, se
+                    ) in enumerate(self.net_config["blocks5"])
+        ])
+        self.blocks6 = nn.Sequential(*[
+            LCNetV3Block(
+                in_channels=make_divisible(in_c * scale),
+                out_channels=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                conv_kxk_num=conv_kxk_num,
+                lr_mult=self.lr_mult_list[5],
+                lab_lr=lab_lr)
+            for i, (k, in_c, out_c, s, se
+                    ) in enumerate(self.net_config["blocks6"])
+        ])
+        self.out_channels = make_divisible(512 * scale)
+        if self.det:
+            mv_c = [16, 24, 56, 480]
+            self.out_channels = [
+                make_divisible(self.net_config["blocks3"][-1][2] * scale),
+                make_divisible(self.net_config["blocks4"][-1][2] * scale),
+                make_divisible(self.net_config["blocks5"][-1][2] * scale),
+                make_divisible(self.net_config["blocks6"][-1][2] * scale),
+            ]
+            self.layer_list = nn.ModuleList([
+                nn.Conv2d(self.out_channels[0], int(mv_c[0] * scale), 1, 1, 0),
+                nn.Conv2d(self.out_channels[1], int(mv_c[1] * scale), 1, 1, 0),
+                nn.Conv2d(self.out_channels[2], int(mv_c[2] * scale), 1, 1, 0),
+                nn.Conv2d(self.out_channels[3], int(mv_c[3] * scale), 1, 1, 0)
+            ])
+            self.out_channels = [
+                int(mv_c[0] * scale), int(mv_c[1] * scale),
+                int(mv_c[2] * scale), int(mv_c[3] * scale)
+            ]
+    def forward(self, x):
+        """Run the backbone and return detection or recognition features.
+
+        Args:
+            x: Input tensor fed to ``self.conv1``.
+
+        Returns:
+            When ``self.det`` is truthy, a list holding the outputs of
+            ``blocks3`` .. ``blocks6``, each passed through the matching
+            entry of ``self.layer_list``. Otherwise the ``blocks6`` output,
+            pooled with ``adaptive_avg_pool2d`` to ``[1, 40]`` in training
+            mode or with a ``[3, 2]`` ``avg_pool2d`` in evaluation mode.
+        """
+        x = self.conv1(x)
+        x = self.blocks2(x)
+        # Collect the output of every stage from blocks3 onwards. The input
+        # of blocks6 is the real blocks5 output; it must not be replaced by
+        # a tensor loaded from a debugging dump on disk.
+        out_list = []
+        for stage in (self.blocks3, self.blocks4, self.blocks5, self.blocks6):
+            x = stage(x)
+            out_list.append(x)
+        if self.det:
+            # Project each stage output with its own layer from layer_list.
+            return [
+                project(feature)
+                for project, feature in zip(self.layer_list, out_list)
+            ]
+        if self.training:
+            x = F.adaptive_avg_pool2d(x, [1, 40])
+        else:
+            x = F.avg_pool2d(x, [3, 2])
+        return x
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py
+import os, sys
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+from .det_mobilenet_v3 import ResidualUnit, ConvBNLayer, make_divisible
+class MobileNetV3(nn.Module):
+    """MobileNetV3 backbone variant for text recognition.
+
+    The network is ``conv1`` -> a stack of ``ResidualUnit`` blocks ->
+    a 1x1 ``conv2`` -> 2x2 max pooling.
+
+    Args:
+        in_channels: Channel count of the input tensor.
+        model_name: ``"large"`` or ``"small"``; selects the block table.
+        scale: Channel multiplier; one of 0.35, 0.5, 0.75, 1.0, 1.25.
+        large_stride: Four stride values used by the "large" table
+            (defaults to ``[1, 2, 2, 2]``).
+        small_stride: Four stride values used by the "small" table
+            (defaults to ``[2, 2, 2, 2]``).
+        **kwargs: Accepted and ignored.
+
+    Raises:
+        AssertionError: If a stride argument is not a list of length 4, or
+            if ``scale`` is not a supported value.
+        NotImplementedError: If ``model_name`` is neither table name.
+    """
+    def __init__(self,
+                 in_channels=3,
+                 model_name='small',
+                 scale=0.5,
+                 large_stride=None,
+                 small_stride=None,
+                 **kwargs):
+        super(MobileNetV3, self).__init__()
+        small_stride = [2, 2, 2, 2] if small_stride is None else small_stride
+        large_stride = [1, 2, 2, 2] if large_stride is None else large_stride
+        assert isinstance(large_stride, list), \
+            "large_stride type must be list but got {}".format(
+                type(large_stride))
+        assert isinstance(small_stride, list), \
+            "small_stride type must be list but got {}".format(
+                type(small_stride))
+        assert len(large_stride) == 4, \
+            "large_stride length must be 4 but got {}".format(
+                len(large_stride))
+        assert len(small_stride) == 4, \
+            "small_stride length must be 4 but got {}".format(
+                len(small_stride))
+        relu, hswish = 'relu', 'hard_swish'
+        # Each row: (kernel, expansion channels, output channels, use SE,
+        # activation name, stride).
+        if model_name == "large":
+            cfg = [
+                (3, 16, 16, False, relu, large_stride[0]),
+                (3, 64, 24, False, relu, (large_stride[1], 1)),
+                (3, 72, 24, False, relu, 1),
+                (5, 72, 40, True, relu, (large_stride[2], 1)),
+                (5, 120, 40, True, relu, 1),
+                (5, 120, 40, True, relu, 1),
+                (3, 240, 80, False, hswish, 1),
+                (3, 200, 80, False, hswish, 1),
+                (3, 184, 80, False, hswish, 1),
+                (3, 184, 80, False, hswish, 1),
+                (3, 480, 112, True, hswish, 1),
+                (3, 672, 112, True, hswish, 1),
+                (5, 672, 160, True, hswish, (large_stride[3], 1)),
+                (5, 960, 160, True, hswish, 1),
+                (5, 960, 160, True, hswish, 1),
+            ]
+            cls_ch_squeeze = 960
+        elif model_name == "small":
+            cfg = [
+                (3, 16, 16, True, relu, (small_stride[0], 1)),
+                (3, 72, 24, False, relu, (small_stride[1], 1)),
+                (3, 88, 24, False, relu, 1),
+                (5, 96, 40, True, hswish, (small_stride[2], 1)),
+                (5, 240, 40, True, hswish, 1),
+                (5, 240, 40, True, hswish, 1),
+                (5, 120, 48, True, hswish, 1),
+                (5, 144, 48, True, hswish, 1),
+                (5, 288, 96, True, hswish, (small_stride[3], 1)),
+                (5, 576, 96, True, hswish, 1),
+                (5, 576, 96, True, hswish, 1),
+            ]
+            cls_ch_squeeze = 576
+        else:
+            raise NotImplementedError("mode[" + model_name +
+                                      "_model] is not implemented!")
+        supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
+        assert scale in supported_scale, \
+            "supported scales are {} but input scale is {}".format(supported_scale, scale)
+        # Stem convolution; its width is the input width of the first unit.
+        stem_channels = make_divisible(16 * scale)
+        self.conv1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=stem_channels,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            groups=1,
+            if_act=True,
+            act='hard_swish',
+            name='conv1')
+        prev_channels = stem_channels
+        units = []
+        # Units are named conv2, conv3, ... in table order.
+        for idx, (k, exp, c, se, nl, s) in enumerate(cfg, start=2):
+            unit_out = make_divisible(scale * c)
+            units.append(
+                ResidualUnit(
+                    in_channels=prev_channels,
+                    mid_channels=make_divisible(scale * exp),
+                    out_channels=unit_out,
+                    kernel_size=k,
+                    stride=s,
+                    use_se=se,
+                    act=nl,
+                    name='conv' + str(idx)))
+            prev_channels = unit_out
+        self.blocks = nn.Sequential(*units)
+        squeeze_channels = make_divisible(scale * cls_ch_squeeze)
+        self.conv2 = ConvBNLayer(
+            in_channels=prev_channels,
+            out_channels=squeeze_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            if_act=True,
+            act='hard_swish',
+            name='conv_last')
+        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
+        self.out_channels = squeeze_channels
+    def forward(self, x):
+        """Apply stem, residual units, last 1x1 conv and max pooling."""
+        features = self.blocks(self.conv1(x))
+        return self.pool(self.conv2(features))
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py
+import os, sys
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+class ConvBNLayer(nn.Module):
+    """Bias-free convolution, batch normalisation and optional activation.
+
+    Args:
+        num_channels: Input channel count of the convolution.
+        filter_size: Convolution kernel size.
+        num_filters: Output channel count (also the BatchNorm width).
+        stride: Convolution stride.
+        padding: Convolution padding.
+        channels: Accepted but not used by this layer.
+        num_groups: Number of convolution groups.
+        act: Activation type handed to ``Activation``; ``None`` disables it.
+    """
+    def __init__(self,
+                 num_channels,
+                 filter_size,
+                 num_filters,
+                 stride,
+                 padding,
+                 channels=None,
+                 num_groups=1,
+                 act='hard_swish'):
+        super().__init__()
+        self.act = act
+        conv_options = dict(
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            bias=False)
+        self._conv = nn.Conv2d(num_channels, num_filters, **conv_options)
+        self._batch_norm = nn.BatchNorm2d(num_filters)
+        # The activation sub-module only exists when one was requested.
+        if act is not None:
+            self._act = Activation(act_type=act, inplace=True)
+    def forward(self, inputs):
+        """Return ``act(bn(conv(inputs)))``, skipping ``act`` when disabled."""
+        normed = self._batch_norm(self._conv(inputs))
+        if self.act is None:
+            return normed
+        return self._act(normed)
+class DepthwiseSeparable(nn.Module):
+    """Grouped kxk convolution, optional SE gate, then a 1x1 convolution.
+
+    Args:
+        num_channels: Input channel count.
+        num_filters1: Unscaled width of the first (grouped) convolution.
+        num_filters2: Unscaled width of the 1x1 convolution.
+        num_groups: Unscaled group count of the first convolution.
+        stride: Stride of the first convolution.
+        scale: Multiplier applied to both widths and the group count
+            (results are truncated with ``int``).
+        dw_size: Kernel size of the first convolution.
+        padding: Padding of the first convolution.
+        use_se: Insert an ``SEModule`` between the two convolutions.
+    """
+    def __init__(self,
+                 num_channels,
+                 num_filters1,
+                 num_filters2,
+                 num_groups,
+                 stride,
+                 scale,
+                 dw_size=3,
+                 padding=1,
+                 use_se=False):
+        super().__init__()
+        self.use_se = use_se
+        mid_channels = int(num_filters1 * scale)
+        self._depthwise_conv = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=mid_channels,
+            filter_size=dw_size,
+            stride=stride,
+            padding=padding,
+            num_groups=int(num_groups * scale))
+        if use_se:
+            self._se = SEModule(mid_channels)
+        self._pointwise_conv = ConvBNLayer(
+            num_channels=mid_channels,
+            filter_size=1,
+            num_filters=int(num_filters2 * scale),
+            stride=1,
+            padding=0)
+    def forward(self, inputs):
+        """Apply the grouped conv, the SE gate if enabled, and the 1x1 conv."""
+        hidden = self._depthwise_conv(inputs)
+        if self.use_se:
+            hidden = self._se(hidden)
+        return self._pointwise_conv(hidden)
+class MobileNetV1Enhance(nn.Module):
+    """MobileNetV1-style recognition backbone built from DepthwiseSeparable.
+
+    Layout: a stride-2 stem convolution, fifteen ``DepthwiseSeparable``
+    stages held in ``self.block_list``, and a final 2x2 pooling layer.
+
+    Args:
+        in_channels: Channel count of the input tensor.
+        scale: Width multiplier applied to every stage.
+        last_conv_stride: Stride of the final stage.
+        last_pool_type: ``'avg'`` selects average pooling; any other value
+            selects max pooling.
+        **kwargs: Accepted and ignored.
+    """
+    def __init__(self,
+                 in_channels=3,
+                 scale=0.5,
+                 last_conv_stride=1,
+                 last_pool_type='max',
+                 **kwargs):
+        super().__init__()
+        self.scale = scale
+        self.conv1 = ConvBNLayer(
+            num_channels=in_channels,
+            filter_size=3,
+            channels=3,
+            num_filters=int(32 * scale),
+            stride=2,
+            padding=1)
+        # One row per stage:
+        # (num_filters1, num_filters2, stride, dw_size, padding, use_se).
+        # The input width and group count of a stage equal its num_filters1.
+        stage_specs = [
+            (32, 64, 1, 3, 1, False),
+            (64, 128, 1, 3, 1, False),
+            (128, 128, 1, 3, 1, False),
+            (128, 256, (2, 1), 3, 1, False),
+            (256, 256, 1, 3, 1, False),
+            (256, 512, (2, 1), 3, 1, False),
+        ]
+        stage_specs += [(512, 512, 1, 5, 2, False)] * 5
+        stage_specs += [
+            (512, 1024, (2, 1), 5, 2, True),
+            (1024, 1024, last_conv_stride, 5, 2, True),
+        ]
+        stages = [
+            DepthwiseSeparable(
+                num_channels=int(width * scale),
+                num_filters1=width,
+                num_filters2=out_width,
+                num_groups=width,
+                stride=stage_stride,
+                scale=scale,
+                dw_size=kernel,
+                padding=pad,
+                use_se=with_se)
+            for width, out_width, stage_stride, kernel, pad, with_se
+            in stage_specs
+        ]
+        self.block_list = nn.Sequential(*stages)
+        pool_cls = nn.AvgPool2d if last_pool_type == 'avg' else nn.MaxPool2d
+        self.pool = pool_cls(kernel_size=2, stride=2, padding=0)
+        self.out_channels = int(1024 * scale)
+    def forward(self, inputs):
+        """Run the stem, all stages and the final pooling layer."""
+        return self.pool(self.block_list(self.conv1(inputs)))
+def hardsigmoid(x):
+    """Return ``relu6(x + 3) / 6``, a piecewise-linear sigmoid stand-in."""
+    # The in-place relu6 acts on the temporary ``x + 3``, so ``x`` itself
+    # is left untouched.
+    shifted = x + 3.
+    return F.relu6(shifted, inplace=True) / 6.
+class SEModule(nn.Module):
+    """Squeeze-and-excitation gate that rescales the input per channel.
+
+    Args:
+        channel: Channel count of the input (and of the gate).
+        reduction: Divisor giving the bottleneck width
+            ``channel // reduction``.
+    """
+    def __init__(self, channel, reduction=4):
+        super().__init__()
+        squeezed = channel // reduction
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.conv1 = nn.Conv2d(
+            in_channels=channel,
+            out_channels=squeezed,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True)
+        self.conv2 = nn.Conv2d(
+            in_channels=squeezed,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True)
+    def forward(self, inputs):
+        """Multiply ``inputs`` by the gate computed from its pooled value."""
+        gate = self.conv1(self.avg_pool(inputs))
+        gate = self.conv2(F.relu(gate))
+        return torch.mul(inputs, hardsigmoid(gate))
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_nrtr_mtb.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_nrtr_mtb.py
+import torch
+from torch import nn
+class MTB(nn.Module):
+    """Small convolutional front-end whose output is flattened per column.
+
+    Args:
+        cnn_num: Number of conv/ReLU/BatchNorm groups; layers are only
+            built (and the output only reshaped) when this equals 2.
+        in_channels: Channel count of the input images. Also stored
+            unchanged as ``out_channels``.
+    """
+    def __init__(self, cnn_num, in_channels):
+        super().__init__()
+        self.block = nn.Sequential()
+        self.out_channels = in_channels
+        self.cnn_num = cnn_num
+        if cnn_num == 2:
+            prev_channels = in_channels
+            for idx in range(cnn_num):
+                # Widths double per group: 32, 64, ...
+                width = 32 * (2**idx)
+                self.block.add_module(
+                    'conv_{}'.format(idx),
+                    nn.Conv2d(
+                        in_channels=prev_channels,
+                        out_channels=width,
+                        kernel_size=3,
+                        stride=2,
+                        padding=1))
+                self.block.add_module('relu_{}'.format(idx), nn.ReLU())
+                self.block.add_module('bn_{}'.format(idx),
+                                      nn.BatchNorm2d(width))
+                prev_channels = width
+    def forward(self, images):
+        """Apply the conv stack; for ``cnn_num == 2`` merge the last two axes.
+
+        The output of ``self.block`` is permuted with ``(0, 3, 2, 1)`` and
+        the two trailing axes are merged, giving a 3-D tensor. For any other
+        ``cnn_num`` the output of ``self.block`` is returned as is.
+        """
+        feats = self.block(images)
+        if self.cnn_num != 2:
+            return feats
+        # (b, c, h, w) -> (b, w, h, c)
+        feats = feats.permute(0, 3, 2, 1)
+        batch, width, height, chans = feats.shape
+        return torch.reshape(feats, (batch, width, height * chans))
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_resnet_31.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_resnet_31.py
+"""
+This code is adapted from:
+https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/layers/conv_layer.py
+https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/backbones/resnet31_ocr.py
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# import paddle
+# from paddle import ParamAttr
+# import paddle.nn as nn
+# import paddle.nn.functional as F
+__all__ = ["ResNet31"]
+def conv3x3(in_channel, out_channel, stride=1):
+    """Build a bias-free 3x3 ``nn.Conv2d`` with padding 1.
+
+    Args:
+        in_channel: Input channel count.
+        out_channel: Output channel count.
+        stride: Convolution stride.
+    """
+    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
+    return nn.Conv2d(in_channel, out_channel, **conv_options)
+class BasicBlock(nn.Module):
+    """Two 3x3 conv/BN layers with a residual connection.
+
+    Args:
+        in_channels: Input channel count.
+        channels: Channel count of both convolutions.
+        stride: Stride of the first convolution and of the projection
+            shortcut.
+        downsample: When truthy, a 1x1 conv + BatchNorm projection is built
+            for the shortcut; the passed value itself is not kept. When
+            falsy, the shortcut is an empty ``nn.Sequential``.
+    """
+    expansion = 1
+    def __init__(self, in_channels, channels, stride=1, downsample=False):
+        super().__init__()
+        self.conv1 = conv3x3(in_channels, channels, stride)
+        self.bn1 = nn.BatchNorm2d(channels)
+        self.relu = nn.ReLU()
+        self.conv2 = conv3x3(channels, channels)
+        self.bn2 = nn.BatchNorm2d(channels)
+        if downsample:
+            expanded = channels * self.expansion
+            shortcut = nn.Sequential(
+                nn.Conv2d(in_channels, expanded, 1, stride, bias=False),
+                nn.BatchNorm2d(expanded))
+        else:
+            shortcut = nn.Sequential()
+        self.downsample = shortcut
+        self.stride = stride
+    def forward(self, x):
+        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``."""
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+        # An empty nn.Sequential is falsy, so the identity path is used then.
+        residual = self.downsample(x) if self.downsample else x
+        out += residual
+        return self.relu(out)
+class ResNet31(nn.Module):
+    '''ResNet-31 style backbone: two stem convolutions plus four stages.
+
+    Each stage is an optional max pooling, a run of ``BasicBlock`` modules,
+    and a 3x3 conv/BN/ReLU.
+
+    Args:
+        in_channels (int): Number of channels of the input image tensor.
+        layers (list[int]): Number of BasicBlock modules in each stage.
+        channels (list[int]): Output widths of the stem convolutions
+            (indices 0-1) and of the stages (indices 2-5); the last entry
+            is exposed as ``out_channels``.
+        out_indices (None | Sequence[int]): Indices of the stage outputs to
+            return; ``None`` returns only the final stage output.
+        last_stage_pool (bool): If True, add a `MaxPool2d` layer in front of
+            the last stage.
+    '''
+    def __init__(self,
+                 in_channels=3,
+                 layers=[1, 2, 5, 3],
+                 channels=[64, 128, 256, 256, 512, 512, 512],
+                 out_indices=None,
+                 last_stage_pool=False):
+        super(ResNet31, self).__init__()
+        assert isinstance(in_channels, int)
+        assert isinstance(last_stage_pool, bool)
+        self.out_indices = out_indices
+        self.last_stage_pool = last_stage_pool
+        def same_conv(c_in, c_out):
+            # 3x3 convolution with stride 1 and padding 1.
+            return nn.Conv2d(
+                c_in, c_out, kernel_size=3, stride=1, padding=1)
+        def ceil_pool(window):
+            # Max pooling whose stride equals its window, with ceil_mode on.
+            return nn.MaxPool2d(
+                kernel_size=window, stride=window, padding=0, ceil_mode=True)
+        # Stem: two conv/BN/ReLU groups.
+        self.conv1_1 = same_conv(in_channels, channels[0])
+        self.bn1_1 = nn.BatchNorm2d(channels[0])
+        self.relu1_1 = nn.ReLU(inplace=True)
+        self.conv1_2 = same_conv(channels[0], channels[1])
+        self.bn1_2 = nn.BatchNorm2d(channels[1])
+        self.relu1_2 = nn.ReLU(inplace=True)
+        # Stage 2: 2x2 pooling, residual blocks, conv.
+        self.pool2 = ceil_pool(2)
+        self.block2 = self._make_layer(channels[1], channels[2], layers[0])
+        self.conv2 = same_conv(channels[2], channels[2])
+        self.bn2 = nn.BatchNorm2d(channels[2])
+        self.relu2 = nn.ReLU(inplace=True)
+        # Stage 3: 2x2 pooling, residual blocks, conv.
+        self.pool3 = ceil_pool(2)
+        self.block3 = self._make_layer(channels[2], channels[3], layers[1])
+        self.conv3 = same_conv(channels[3], channels[3])
+        self.bn3 = nn.BatchNorm2d(channels[3])
+        self.relu3 = nn.ReLU(inplace=True)
+        # Stage 4: (2, 1) pooling, residual blocks, conv.
+        self.pool4 = ceil_pool((2, 1))
+        self.block4 = self._make_layer(channels[3], channels[4], layers[2])
+        self.conv4 = same_conv(channels[4], channels[4])
+        self.bn4 = nn.BatchNorm2d(channels[4])
+        self.relu4 = nn.ReLU(inplace=True)
+        # Stage 5: pooling only when last_stage_pool is set.
+        self.pool5 = ceil_pool(2) if self.last_stage_pool else None
+        self.block5 = self._make_layer(channels[4], channels[5], layers[3])
+        self.conv5 = same_conv(channels[5], channels[5])
+        self.bn5 = nn.BatchNorm2d(channels[5])
+        self.relu5 = nn.ReLU(inplace=True)
+        self.out_channels = channels[-1]
+    def _make_layer(self, input_channels, output_channels, blocks):
+        """Stack ``blocks`` BasicBlock modules going to ``output_channels``."""
+        stacked = []
+        current = input_channels
+        for _ in range(blocks):
+            # A 1x1 conv + BN is handed over as ``downsample`` whenever the
+            # block changes the channel count; otherwise None is passed.
+            projection = None
+            if current != output_channels:
+                projection = nn.Sequential(
+                    nn.Conv2d(
+                        current,
+                        output_channels,
+                        kernel_size=1,
+                        stride=1,
+                        bias=False),
+                    nn.BatchNorm2d(output_channels))
+            stacked.append(
+                BasicBlock(current, output_channels, downsample=projection))
+            current = output_channels
+        return nn.Sequential(*stacked)
+    def forward(self, x):
+        """Run the stem and the four stages.
+
+        Returns:
+            The last stage output, or a tuple of the stage outputs selected
+            by ``self.out_indices`` when that is not ``None``.
+        """
+        x = self.relu1_1(self.bn1_1(self.conv1_1(x)))
+        x = self.relu1_2(self.bn1_2(self.conv1_2(x)))
+        stages = (
+            (self.pool2, self.block2, self.conv2, self.bn2, self.relu2),
+            (self.pool3, self.block3, self.conv3, self.bn3, self.relu3),
+            (self.pool4, self.block4, self.conv4, self.bn4, self.relu4),
+            (self.pool5, self.block5, self.conv5, self.bn5, self.relu5),
+        )
+        outs = []
+        for pool, block, conv, bn, relu in stages:
+            if pool is not None:
+                x = pool(x)
+            x = relu(bn(conv(block(x))))
+            outs.append(x)
+        if self.out_indices is None:
+            return x
+        return tuple([outs[i] for i in self.out_indices])
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_resnet_fpn.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_resnet_fpn.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os, sys
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+__all__ = ["ResNetFPN"]
+class ResNetFPN(nn.Module):
+    """ResNet backbone whose last three stage outputs are fused to 512 channels.
+
+    Args:
+        in_channels: Channel count of the input tensor.
+        layers: ResNet depth; one of 18, 34, 50, 101, 152.
+        **kwargs: Accepted and ignored.
+
+    Raises:
+        KeyError: If ``layers`` is not a supported depth.
+    """
+    def __init__(self, in_channels=1, layers=50, **kwargs):
+        super(ResNetFPN, self).__init__()
+        supported_layers = {
+            18: {
+                'depth': [2, 2, 2, 2],
+                'block_class': BasicBlock
+            },
+            34: {
+                'depth': [3, 4, 6, 3],
+                'block_class': BasicBlock
+            },
+            50: {
+                'depth': [3, 4, 6, 3],
+                'block_class': BottleneckBlock
+            },
+            101: {
+                'depth': [3, 4, 23, 3],
+                'block_class': BottleneckBlock
+            },
+            152: {
+                'depth': [3, 8, 36, 3],
+                'block_class': BottleneckBlock
+            }
+        }
+        stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)]
+        num_filters = [64, 128, 256, 512]
+        self.depth = supported_layers[layers]['depth']
+        self.conv = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=64,
+            kernel_size=7,
+            stride=2,
+            act="relu",
+            name="conv1")
+        self.block_list = nn.ModuleList()
+        in_ch = 64
+        if layers >= 50:
+            for block in range(len(self.depth)):
+                for i in range(self.depth[block]):
+                    if layers in [101, 152] and block == 2:
+                        if i == 0:
+                            conv_name = "res" + str(block + 2) + "a"
+                        else:
+                            conv_name = "res" + str(block + 2) + "b" + str(i)
+                    else:
+                        conv_name = "res" + str(block + 2) + chr(97 + i)
+                    bottleneck_block = BottleneckBlock(
+                        in_channels=in_ch,
+                        out_channels=num_filters[block],
+                        stride=stride_list[block] if i == 0 else 1,
+                        name=conv_name)
+                    in_ch = num_filters[block] * 4
+                    self.block_list.add_module(
+                        "bottleneckBlock_{}_{}".format(block, i),
+                        bottleneck_block)
+        else:
+            for block in range(len(self.depth)):
+                for i in range(self.depth[block]):
+                    conv_name = "res" + str(block + 2) + chr(97 + i)
+                    basic_block = BasicBlock(
+                        in_channels=in_ch,
+                        out_channels=num_filters[block],
+                        stride=stride_list[block] if i == 0 else 1,
+                        is_first=block == i == 0,
+                        name=conv_name)
+                    in_ch = basic_block.out_channels
+                    self.block_list.add_module(conv_name, basic_block)
+        out_ch_list = [in_ch // 4, in_ch // 2, in_ch]
+        self.base_block = nn.ModuleList()
+        # These two lists are never populated in this class; forward() only
+        # indexes them when a skip feature and the running feature differ in
+        # spatial size.
+        self.conv_trans = []
+        self.bn_block = []
+        # Two fusion groups (1x1 conv, 3x3 conv, BN + ReLU), one per skip
+        # feature that gets concatenated in forward().
+        for i in [-2, -3]:
+            in_channels = out_ch_list[i + 1] + out_ch_list[i]
+            bb_0 = nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_ch_list[i],
+                kernel_size=1,
+                bias=True)
+            self.base_block.add_module("F_{}_base_block_0".format(i), bb_0)
+            bb_1 = nn.Conv2d(
+                in_channels=out_ch_list[i],
+                out_channels=out_ch_list[i],
+                kernel_size=3,
+                padding=1,
+                bias=True)
+            self.base_block.add_module("F_{}_base_block_1".format(i), bb_1)
+            bb_2 = nn.Sequential(
+                nn.BatchNorm2d(out_ch_list[i]),
+                Activation("relu")
+            )
+            self.base_block.add_module("F_{}_base_block_2".format(i), bb_2)
+        # Final 1x1 projection; ``i`` keeps its last loop value (-3) here.
+        bb_3 = nn.Conv2d(
+            in_channels=out_ch_list[i],
+            out_channels=512,
+            kernel_size=1,
+            bias=True)
+        self.base_block.add_module("F_{}_base_block_3".format(i), bb_3)
+        self.out_channels = 512
+    def forward(self, x):
+        """Run the residual stages and fuse the last three stage outputs.
+
+        Defined as ``forward`` (not ``__call__``) so that calls go through
+        ``nn.Module.__call__`` and its hook machinery.
+
+        Returns:
+            The fused feature map produced by the last ``base_block`` layer.
+        """
+        x = self.conv(x)
+        # 1-based index of the last block of every stage.
+        stage_ends = set()
+        running_total = 0
+        for stage_depth in self.depth:
+            running_total += stage_depth
+            stage_ends.add(running_total)
+        feats = []
+        for idx, block in enumerate(self.block_list, start=1):
+            x = block(x)
+            if idx in stage_ends:
+                feats.append(x)
+        base = feats[-1]
+        j = 0
+        for i, block in enumerate(self.base_block):
+            # Layers 0 and 3 open a fusion group: concatenate the next
+            # shallower stage output before applying the layer.
+            if i % 3 == 0 and i < 6:
+                j = j + 1
+                skip = feats[-j - 1]
+                if list(skip.shape[2:]) != list(base.shape[2:]):
+                    base = self.conv_trans[j - 1](base)
+                    base = self.bn_block[j - 1](base)
+                base = torch.cat([base, skip], dim=1)
+            base = block(base)
+        return base
+class ConvBNLayer(nn.Module):
+    """Bias-free convolution, batch normalisation and optional activation.
+
+    When ``stride`` equals the tuple ``(1, 1)`` the convolution is built
+    with kernel size 2 and dilation 2 instead of ``kernel_size``; padding
+    is always ``(kernel_size - 1) // 2``.
+
+    Args:
+        in_channels: Input channel count.
+        out_channels: Output channel count.
+        kernel_size: Nominal (integer) kernel size.
+        stride: Convolution stride (int or tuple).
+        groups: Number of convolution groups.
+        act: Activation type handed to ``Activation``; ``None`` disables it.
+        name: Accepted for compatibility; not used by this layer, so the
+            default ``None`` is valid.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 groups=1,
+                 act=None,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+        # Only the exact tuple (1, 1) selects the dilated variant; an int
+        # stride of 1 keeps the nominal kernel size.
+        dilated = stride == (1, 1)
+        self.conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=2 if dilated else kernel_size,
+            dilation=2 if dilated else 1,
+            stride=stride,
+            padding=(kernel_size - 1) // 2,
+            groups=groups,
+            bias=False, )
+        self.bn = nn.BatchNorm2d(out_channels)
+        self.act = act
+        if self.act is not None:
+            self._act = Activation(act_type=self.act, inplace=True)
+    def forward(self, x):
+        """Return ``act(bn(conv(x)))``, skipping ``act`` when disabled."""
+        x = self.conv(x)
+        x = self.bn(x)
+        if self.act is not None:
+            x = self._act(x)
+        return x
+class ShortCut(nn.Module):
+    """Residual shortcut: a 1x1 ConvBNLayer projection or the identity.
+
+    A projection is used when the channel counts differ, when ``stride`` is
+    not equal to ``1``, or when ``is_first`` is true.
+
+    Args:
+        in_channels: Input channel count.
+        out_channels: Output channel count.
+        stride: Stride requested for the shortcut (int or tuple).
+        name: Name forwarded to ``ConvBNLayer``.
+        is_first: Forces a projection when true.
+    """
+    def __init__(self, in_channels, out_channels, stride, name, is_first=False):
+        super().__init__()
+        self.use_conv = bool(
+            in_channels != out_channels or stride != 1 or is_first == True)
+        if self.use_conv:
+            # The tuple (1, 1) is replaced by the int 1 before it is handed
+            # to ConvBNLayer; every other stride is passed through.
+            conv_stride = 1 if stride == (1, 1) else stride
+            self.conv = ConvBNLayer(
+                in_channels, out_channels, 1, conv_stride, name=name)
+    def forward(self, x):
+        """Project ``x`` when a projection exists, else return it unchanged."""
+        return self.conv(x) if self.use_conv else x
+class BottleneckBlock(nn.Module):
+    """Bottleneck residual block: 1x1 -> 3x3 -> 1x1 convs plus a shortcut.
+
+    Args:
+        in_channels: Input channel count.
+        out_channels: Width of the first two convolutions; the block
+            outputs ``out_channels * 4`` channels.
+        stride: Stride of the 3x3 convolution and of the shortcut.
+        name: Prefix for the names given to the sub-layers.
+    """
+    def __init__(self, in_channels, out_channels, stride, name):
+        super().__init__()
+        expanded = out_channels * 4
+        self.conv0 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            act='relu',
+            name=name + "_branch2a")
+        self.conv1 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=stride,
+            act='relu',
+            name=name + "_branch2b")
+        self.conv2 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=expanded,
+            kernel_size=1,
+            act=None,
+            name=name + "_branch2c")
+        self.short = ShortCut(
+            in_channels=in_channels,
+            out_channels=expanded,
+            stride=stride,
+            is_first=False,
+            name=name + "_branch1")
+        self.out_channels = expanded
+    def forward(self, x):
+        """Return ``relu(conv2(conv1(conv0(x))) + short(x))``."""
+        main = self.conv2(self.conv1(self.conv0(x)))
+        merged = main + self.short(x)
+        return F.relu(merged)
+class BasicBlock(nn.Module):
+    """Basic residual block: two 3x3 ConvBNLayer modules plus a shortcut.
+
+    Args:
+        in_channels: Input channel count.
+        out_channels: Output channel count of the block.
+        stride: Stride of the first convolution and of the shortcut.
+        name: Prefix for the names given to the sub-layers.
+        is_first: Forwarded to ``ShortCut``.
+    """
+    def __init__(self, in_channels, out_channels, stride, name, is_first):
+        super().__init__()
+        self.conv0 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            act='relu',
+            stride=stride,
+            name=name + "_branch2a")
+        self.conv1 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            act=None,
+            name=name + "_branch2b")
+        self.short = ShortCut(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            stride=stride,
+            is_first=is_first,
+            name=name + "_branch1")
+        self.out_channels = out_channels
+    def forward(self, x):
+        """Return ``relu(conv1(conv0(x)) + short(x))``."""
+        main = self.conv1(self.conv0(x))
+        merged = main + self.short(x)
+        return F.relu(merged)
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_resnet_vd.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_resnet_vd.py
+import os, sys
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+class ConvBNLayer(nn.Module):
+    """Optional average pooling, then conv, batch norm and activation.
+
+    In "vd" mode the input is first average-pooled with a window and stride
+    equal to ``stride`` and the convolution itself runs with stride 1;
+    otherwise the convolution uses ``stride`` directly.
+
+    Args:
+        in_channels: Input channel count.
+        out_channels: Output channel count.
+        kernel_size: Convolution kernel size (integer); padding is
+            ``(kernel_size - 1) // 2``.
+        stride: Stride of the pooling layer / convolution.
+        groups: Number of convolution groups.
+        is_vd_mode: Enable the pooling-before-convolution variant.
+        act: Activation type handed to ``Activation``; ``None`` disables it.
+        name: Accepted but not used by this layer.
+    """
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            groups=1,
+            is_vd_mode=False,
+            act=None,
+            name=None, ):
+        super().__init__()
+        self.act = act
+        self.is_vd_mode = is_vd_mode
+        # The pooling layer is always created but only applied in vd mode.
+        self._pool2d_avg = nn.AvgPool2d(
+            kernel_size=stride, stride=stride, padding=0, ceil_mode=True)
+        conv_stride = 1 if is_vd_mode else stride
+        self._conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=conv_stride,
+            padding=(kernel_size - 1) // 2,
+            groups=groups,
+            bias=False)
+        self._batch_norm = nn.BatchNorm2d(out_channels)
+        if act is not None:
+            self._act = Activation(act_type=act, inplace=True)
+    def forward(self, inputs):
+        """Return ``act(bn(conv(pool?(inputs))))``."""
+        pooled = self._pool2d_avg(inputs) if self.is_vd_mode else inputs
+        normed = self._batch_norm(self._conv(pooled))
+        if self.act is None:
+            return normed
+        return self._act(normed)
+class BottleneckBlock(nn.Module):
+    """Bottleneck residual block: 1x1 -> 3x3 (strided) -> 1x1 convolutions.
+    The last convolution widens the features to ``out_channels * 4``. When
+    ``shortcut`` is False a 1x1 ``ConvBNLayer`` projects the input onto that
+    width for the residual add; when True the input is added unchanged.
+    ``stride`` must be indexable whenever the projection is built with
+    ``if_first`` False, because ``stride[0]`` is inspected. ``name`` must be
+    a string: suffixes are concatenated to it.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride,
+                 shortcut=True,
+                 if_first=False,
+                 name=None):
+        super().__init__()
+        expanded = out_channels * 4
+        self.conv0 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            act='relu',
+            name=name + "_branch2a")
+        self.conv1 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=stride,
+            act='relu',
+            name=name + "_branch2b")
+        self.conv2 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=expanded,
+            kernel_size=1,
+            act=None,
+            name=name + "_branch2c")
+        if not shortcut:
+            # Pool-then-conv downsampling is used for every strided projection
+            # except the one flagged as the first block.
+            pool_first = (not if_first) and stride[0] != 1
+            self.short = ConvBNLayer(
+                in_channels=in_channels,
+                out_channels=expanded,
+                kernel_size=1,
+                stride=stride,
+                is_vd_mode=pool_first,
+                name=name + "_branch1")
+        self.shortcut = shortcut
+    def forward(self, inputs):
+        """Return ``relu(skip + conv2(conv1(conv0(inputs))))``."""
+        trunk = self.conv2(self.conv1(self.conv0(inputs)))
+        skip = inputs if self.shortcut else self.short(inputs)
+        return F.relu(skip + trunk)
+class BasicBlock(nn.Module):
+    """Two-layer residual block: strided 3x3 conv + ReLU, then a 3x3 conv.
+    When ``shortcut`` is False a 1x1 ``ConvBNLayer`` projects the input for
+    the residual add; when True the input is added unchanged. ``stride`` must
+    be indexable whenever the projection is built with ``if_first`` False,
+    because ``stride[0]`` is inspected. ``name`` must be a string: suffixes
+    are concatenated to it.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride,
+                 shortcut=True,
+                 if_first=False,
+                 name=None):
+        super().__init__()
+        self.stride = stride
+        self.conv0 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=stride,
+            act='relu',
+            name=name + "_branch2a")
+        self.conv1 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            act=None,
+            name=name + "_branch2b")
+        if not shortcut:
+            # Pool-then-conv downsampling is used for every strided projection
+            # except the one flagged as the first block.
+            pool_first = (not if_first) and stride[0] != 1
+            self.short = ConvBNLayer(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+                stride=stride,
+                is_vd_mode=pool_first,
+                name=name + "_branch1")
+        self.shortcut = shortcut
+    def forward(self, inputs):
+        """Return ``relu(skip + conv1(conv0(inputs)))``."""
+        trunk = self.conv1(self.conv0(inputs))
+        skip = inputs if self.shortcut else self.short(inputs)
+        return F.relu(skip + trunk)
+class ResNet(nn.Module):
+    """ResNet-vd style backbone for text recognition.
+    The stem is three stride-1 3x3 convolutions followed by a stride-2 max
+    pool. Four residual stages follow; the first block of every stage except
+    the first uses stride ``(2, 1)``. A final 2x2 max pool is applied before
+    the features are returned.
+    Args:
+        in_channels: number of channels of the input image.
+        layers: network depth, one of 18, 34, 50, 101, 152 or 200. Depths
+            >= 50 are built from ``BottleneckBlock``, shallower ones from
+            ``BasicBlock``.
+        **kwargs: accepted and ignored.
+    Attributes:
+        out_channels: channel count of the tensor returned by ``forward``.
+    Raises:
+        AssertionError: if ``layers`` is not a supported depth.
+    """
+    def __init__(self, in_channels=3, layers=50, **kwargs):
+        super(ResNet, self).__init__()
+        self.layers = layers
+        supported_layers = [18, 34, 50, 101, 152, 200]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(
+                supported_layers, layers)
+        depth = {
+            18: [2, 2, 2, 2],
+            34: [3, 4, 6, 3],
+            50: [3, 4, 6, 3],
+            101: [3, 4, 23, 3],
+            152: [3, 8, 36, 3],
+            200: [3, 12, 48, 3],
+        }[layers]
+        use_bottleneck = layers >= 50
+        block_cls = BottleneckBlock if use_bottleneck else BasicBlock
+        # Bottleneck blocks emit 4x their nominal filter count.
+        expansion = 4 if use_bottleneck else 1
+        num_channels = [64, 256, 512,
+                        1024] if use_bottleneck else [64, 64, 128, 256]
+        num_filters = [64, 128, 256, 512]
+        self.conv1_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=32,
+            kernel_size=3,
+            stride=1,
+            act='relu',
+            name="conv1_1")
+        self.conv1_2 = ConvBNLayer(
+            in_channels=32,
+            out_channels=32,
+            kernel_size=3,
+            stride=1,
+            act='relu',
+            name="conv1_2")
+        self.conv1_3 = ConvBNLayer(
+            in_channels=32,
+            out_channels=64,
+            kernel_size=3,
+            stride=1,
+            act='relu',
+            name="conv1_3")
+        self.pool2d_max = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.block_list = nn.Sequential()
+        for block in range(len(depth)):
+            for i in range(depth[block]):
+                if layers in [101, 152, 200] and block == 2:
+                    # Long third stages are named "a", "b1", "b2", ...
+                    if i == 0:
+                        conv_name = "res" + str(block + 2) + "a"
+                    else:
+                        conv_name = "res" + str(block + 2) + "b" + str(i)
+                else:
+                    conv_name = "res" + str(block + 2) + chr(97 + i)
+                # Only the first block of stages 1..3 is strided.
+                stride = (2, 1) if i == 0 and block != 0 else (1, 1)
+                residual_block = block_cls(
+                    in_channels=num_channels[block]
+                    if i == 0 else num_filters[block] * expansion,
+                    out_channels=num_filters[block],
+                    stride=stride,
+                    # The first block of a stage changes the channel count and
+                    # therefore needs a projection instead of an identity skip.
+                    shortcut=i != 0,
+                    if_first=block == i == 0,
+                    name=conv_name)
+                self.block_list.add_module('bb_%d_%d' % (block, i), residual_block)
+        # Report the real width of the last stage: bottleneck variants output
+        # num_filters * 4 channels, which the previous code under-reported.
+        self.out_channels = num_filters[-1] * expansion
+        self.out_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
+    def forward(self, inputs):
+        """Run stem, max pool, all residual blocks and the final max pool."""
+        y = self.conv1_1(inputs)
+        y = self.conv1_2(y)
+        y = self.conv1_3(y)
+        y = self.pool2d_max(y)
+        for block in self.block_list:
+            y = block(y)
+        y = self.out_pool(y)
+        return y
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py
+import torch
+import torch.nn as nn
+from pytorchocr.modeling.common import Activation
+import numpy as np
+def drop_path(x, drop_prob=0., training=False):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    The original name is misleading, as 'Drop Connect' is a different form of dropout from a separate paper.
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
+    Args:
+        x: input tensor; the first dimension is treated as the sample axis.
+        drop_prob: probability of zeroing a whole sample.
+        training: when False the input is returned untouched.
+    Returns:
+        ``x`` itself when ``drop_prob == 0`` or ``training`` is False,
+        otherwise a tensor where each sample is either zeroed or scaled by
+        ``1 / (1 - drop_prob)``.
+    """
+    if drop_prob == 0. or not training:
+        return x
+    # Plain Python float so that NumPy scalars (e.g. from np.linspace) never
+    # take part in tensor arithmetic.
+    keep_prob = 1.0 - float(drop_prob)
+    # One random draw per sample, broadcast over all remaining dimensions.
+    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
+    # Sample on x's device; a default-device mask breaks non-CPU inputs.
+    random_tensor = torch.rand(shape, dtype=x.dtype, device=x.device) + keep_prob
+    random_tensor = torch.floor(random_tensor)  # binarize
+    output = x.divide(keep_prob) * random_tensor
+    return output
+class ConvBNLayer(nn.Module):
+    """Conv2d -> BatchNorm2d -> activation, applied in that order.
+    ``bias_attr`` is forwarded as the convolution's ``bias`` flag and ``act``
+    is the activation type handed to ``Activation``.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 stride=1,
+                 padding=0,
+                 bias_attr=False,
+                 groups=1,
+                 act='gelu'):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            groups=groups,
+            bias=bias_attr)
+        self.norm = nn.BatchNorm2d(out_channels)
+        self.act = Activation(act_type=act, inplace=True)
+    def forward(self, inputs):
+        """Return ``act(norm(conv(inputs)))``."""
+        return self.act(self.norm(self.conv(inputs)))
+class DropPath(nn.Module):
+    """Module wrapper around ``drop_path`` (Stochastic Depth per sample).
+    The stored ``drop_prob`` and the module's own ``training`` flag are
+    passed to ``drop_path`` on every call.
+    """
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        """Apply ``drop_path`` to ``x`` using this module's settings."""
+        probability, is_training = self.drop_prob, self.training
+        return drop_path(x, probability, is_training)
+class Identity(nn.Module):
+    """Parameter-free module whose ``forward`` returns its argument as is."""
+    def __init__(self):
+        super().__init__()
+    def forward(self, input):
+        """Return ``input`` unchanged (the very same object)."""
+        return input
+class Mlp(nn.Module):
+    """Two-layer feed-forward network: fc1 -> activation -> dropout -> fc2 -> dropout.
+    ``hidden_features`` and ``out_features`` fall back to ``in_features``
+    when they are falsy (``None`` or ``0``). The same dropout module is used
+    after the activation and after the second linear layer.
+    """
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer='gelu',
+                 drop=0.):
+        super().__init__()
+        if not out_features:
+            out_features = in_features
+        if not hidden_features:
+            hidden_features = in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = Activation(act_type=act_layer, inplace=True)
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        """Return ``drop(fc2(drop(act(fc1(x)))))``."""
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+class ConvMixer(nn.Module):
+    """Token mixer that applies a grouped 2-D convolution over the token grid.
+    Args:
+        dim: channel width of every token.
+        num_heads: number of convolution groups; must divide ``dim``.
+        HW: ``[height, width]`` of the token grid; ``height * width`` must
+            equal the sequence length passed to ``forward``.
+        local_k: ``[kh, kw]`` kernel size; padding is ``[kh // 2, kw // 2]``.
+    """
+    def __init__(
+            self,
+            dim,
+            num_heads=8,
+            HW=[8, 25],
+            local_k=[3, 3], ):
+        super().__init__()
+        self.HW = HW
+        self.dim = dim
+        self.local_mixer = nn.Conv2d(
+            dim,
+            dim,
+            tuple(local_k),
+            1, (local_k[0] // 2, local_k[1] // 2),
+            groups=num_heads,
+            )
+    def forward(self, x):
+        """Mix tokens of ``x`` with shape (batch, height * width, dim).
+        Returns a tensor with the same layout as the input whenever the
+        kernel is odd-sized (so the convolution keeps the grid size).
+        """
+        h = self.HW[0]
+        w = self.HW[1]
+        # PyTorch port of Paddle's transpose([0, 2, 1]).reshape([0, ...]):
+        # the batch size must be given explicitly because 0 is not a
+        # "copy this dimension" placeholder in torch.reshape.
+        x = x.permute(0, 2, 1).reshape(x.shape[0], self.dim, h, w)
+        x = self.local_mixer(x)
+        x = x.flatten(2).permute(0, 2, 1)
+        return x
+class Attention(nn.Module):
+    """Multi-head self-attention with an optional local-window mask.
+    Args:
+        dim: channel width of the tokens; split evenly across ``num_heads``.
+        num_heads: number of attention heads.
+        mixer: ``'Local'`` adds a window mask to the attention logits (needs
+            ``HW``); any other value leaves the logits unmasked.
+        HW: ``[height, width]`` of the token grid, or ``None`` to read the
+            sequence length and width from the input at call time.
+        local_k: ``[kh, kw]`` window size used to build the local mask.
+        qkv_bias: whether the fused q/k/v projection has a bias.
+        qk_scale: logit scale; defaults to ``head_dim ** -0.5`` when falsy.
+        attn_drop: dropout probability on the attention weights.
+        proj_drop: dropout probability after the output projection.
+    """
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 mixer='Global',
+                 HW=[8, 25],
+                 local_k=[7, 11],
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.HW = HW
+        if HW is not None:
+            H = HW[0]
+            W = HW[1]
+            self.N = H * W
+            self.C = dim
+        if mixer == 'Local' and HW is not None:
+            hk = local_k[0]
+            wk = local_k[1]
+            # Build the mask on a grid padded by the window size, zero the
+            # window around every query position, then crop the padding.
+            mask = torch.ones(H * W, H + hk - 1, W + wk - 1, dtype=torch.float32)
+            for h in range(0, H):
+                for w in range(0, W):
+                    mask[h * W + w, h:h + hk, w:w + wk] = 0.
+            mask_paddle = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk //
+                               2].flatten(1)
+            # Positions outside the window get -inf so softmax ignores them.
+            mask_inf = torch.full([H * W, H * W], fill_value=float("-Inf"), dtype=torch.float32)
+            mask = torch.where(mask_paddle < 1, mask_paddle, mask_inf)
+            # Registered as a buffer so .to()/.cuda()/.half() move and cast it
+            # with the module; non-persistent so state_dict keys stay the same
+            # and existing checkpoints keep loading.
+            self.register_buffer(
+                'mask', mask.unsqueeze(0).unsqueeze(1), persistent=False)
+        self.mixer = mixer
+    def forward(self, x):
+        """Attend over ``x`` of shape (batch, tokens, dim); same shape out."""
+        if self.HW is not None:
+            N = self.N
+            C = self.C
+        else:
+            _, N, C = x.shape
+        qkv = self.qkv(x)
+        # (batch, N, 3 * C) -> (3, batch, heads, N, head_dim)
+        qkv = qkv.reshape((-1, N, 3, self.num_heads, C // self.num_heads)).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
+        attn = (q.matmul(k.permute(0, 1, 3, 2)))
+        if self.mixer == 'Local':
+            attn += self.mask
+        attn = nn.functional.softmax(attn, dim=-1)
+        attn = self.attn_drop(attn)
+        x = (attn.matmul(v)).permute(0, 2, 1, 3).reshape((-1, N, C))
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class Block(nn.Module):
+    """Transformer-style block: a token mixer plus an MLP, each with a residual.
+    ``mixer`` selects ``Attention`` ('Global' or 'Local') or ``ConvMixer``
+    ('Conv'); anything else raises ``TypeError``. With ``prenorm`` True the
+    normalisation is applied after each residual add; with ``prenorm`` False
+    it is applied to the input of the mixer / MLP.
+    ``norm_layer`` may be a callable or a string; a string is resolved with
+    ``eval`` and must therefore come from trusted configuration only.
+    """
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mixer='Global',
+                 local_mixer=[7, 11],
+                 HW=None,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer='gelu',
+                 norm_layer='nn.LayerNorm',
+                 epsilon=1e-6,
+                 prenorm=True):
+        super().__init__()
+        norm_is_named = isinstance(norm_layer, str)
+        # Only the string form receives ``eps``; callables get ``dim`` alone.
+        self.norm1 = eval(norm_layer)(dim, eps=epsilon) if norm_is_named else norm_layer(dim)
+        if mixer in ('Global', 'Local'):
+            self.mixer = Attention(
+                dim,
+                num_heads=num_heads,
+                mixer=mixer,
+                HW=HW,
+                local_k=local_mixer,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                attn_drop=attn_drop,
+                proj_drop=drop)
+        elif mixer == 'Conv':
+            self.mixer = ConvMixer(
+                dim, num_heads=num_heads, HW=HW, local_k=local_mixer)
+        else:
+            raise TypeError("The mixer must be one of [Global, Local, Conv]")
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = Identity()
+        self.norm2 = eval(norm_layer)(dim, eps=epsilon) if norm_is_named else norm_layer(dim)
+        self.mlp_ratio = mlp_ratio
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=int(dim * mlp_ratio),
+                       act_layer=act_layer,
+                       drop=drop)
+        self.prenorm = prenorm
+    def forward(self, x):
+        """Apply the mixer and MLP sub-layers with their residual connections."""
+        if not self.prenorm:
+            x = x + self.drop_path(self.mixer(self.norm1(x)))
+            return x + self.drop_path(self.mlp(self.norm2(x)))
+        mixed = self.norm1(x + self.drop_path(self.mixer(x)))
+        return self.norm2(mixed + self.drop_path(self.mlp(mixed)))
+class PatchEmbed(nn.Module):
+    """Image to patch embedding.
+    ``mode='pope'`` stacks ``sub_num`` (2 or 3) stride-2 ``ConvBNLayer`` units
+    whose widths double up to ``embed_dim``. ``mode='linear'`` uses a single
+    convolution with ``kernel_size == stride == patch_size``.
+    NOTE(review): the 'linear' projection is built with one input channel
+    regardless of ``in_channels`` - confirm against the checkpoints before
+    changing it.
+    """
+    def __init__(self,
+                 img_size=[32, 100],
+                 in_channels=3,
+                 embed_dim=768,
+                 sub_num=2,
+                 patch_size=[4, 4],
+                 mode='pope',
+                 ):
+        super().__init__()
+        downscale = 2 ** sub_num
+        self.img_size = img_size
+        self.num_patches = (img_size[1] // downscale) * (img_size[0] // downscale)
+        self.embed_dim = embed_dim
+        self.norm = None
+        if mode == 'pope':
+            # Channel widths of the consecutive stride-2 stages.
+            if sub_num == 2:
+                widths = [in_channels, embed_dim // 2, embed_dim]
+            elif sub_num == 3:
+                widths = [in_channels, embed_dim // 4, embed_dim // 2, embed_dim]
+            else:
+                widths = None  # no projection is created for other values
+            if widths is not None:
+                stages = [
+                    ConvBNLayer(
+                        in_channels=stage_in,
+                        out_channels=stage_out,
+                        kernel_size=3,
+                        stride=2,
+                        padding=1,
+                        act='gelu',
+                        bias_attr=True)
+                    for stage_in, stage_out in zip(widths[:-1], widths[1:])
+                ]
+                self.proj = nn.Sequential(*stages)
+        elif mode == 'linear':
+            self.proj = nn.Conv2d(
+                1, embed_dim, kernel_size=patch_size, stride=patch_size)
+            # Evaluated strictly left to right, exactly as written.
+            self.num_patches = img_size[0] // patch_size[0] * img_size[
+                1] // patch_size[1]
+    def forward(self, x):
+        """Embed an image batch (B, C, H, W) into tokens (B, patches, embed_dim)."""
+        height, width = x.shape[2], x.shape[3]
+        expected_h, expected_w = self.img_size[0], self.img_size[1]
+        assert height == expected_h and width == expected_w, \
+            "Input image size ({}*{}) doesn't match model ({}*{}).".format(
+                height, width, expected_h, expected_w
+            )
+        tokens = self.proj(x)
+        return tokens.flatten(2).permute(0, 2, 1)
+class SubSample(nn.Module):
+    """Down-sample a feature map and return it as a normalised token sequence.
+    ``types='Pool'`` averages a 3x5 average pool and a 3x5 max pool and then
+    projects the channels with a linear layer; any other value uses a single
+    3x3 convolution. The result is flattened to (batch, tokens, out_channels),
+    normalised, and optionally passed through ``act()``.
+    ``sub_norm`` is resolved with ``eval`` and must therefore come from
+    trusted configuration only.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 types='Pool',
+                 stride=[2, 1],
+                 sub_norm='nn.LayerNorm',
+                 act=None):
+        super().__init__()
+        self.types = types
+        if types != 'Pool':
+            self.conv = nn.Conv2d(
+                in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
+        else:
+            self.avgpool = nn.AvgPool2d(
+                kernel_size=[3, 5], stride=stride, padding=[1, 2])
+            self.maxpool = nn.MaxPool2d(
+                kernel_size=[3, 5], stride=stride, padding=[1, 2])
+            self.proj = nn.Linear(in_channels, out_channels)
+        self.norm = eval(sub_norm)(out_channels)
+        self.act = None if act is None else act()
+    def forward(self, x):
+        """Down-sample ``x`` (B, C, H, W) and return (B, tokens, out_channels)."""
+        if self.types == 'Pool':
+            pooled = (self.avgpool(x) + self.maxpool(x)) * 0.5
+            out = self.proj(pooled.flatten(2).permute(0, 2, 1))
+        else:
+            out = self.conv(x).flatten(2).permute(0, 2, 1)
+        out = self.norm(out)
+        return out if self.act is None else self.act(out)
+class SVTRNet(nn.Module):
+    """SVTR recognition backbone: patch embedding followed by three block stages.
+    The constructor builds a ``PatchEmbed``, a learnable position embedding
+    and three ``nn.ModuleList`` stages (``blocks1``..``blocks3``) whose widths,
+    depths and head counts come from ``embed_dim``, ``depth`` and
+    ``num_heads``. When ``patch_merging`` is 'Conv' or 'Pool', a ``SubSample``
+    with stride ``[2, 1]`` sits between consecutive stages; any other value
+    disables merging. With ``last_stage`` the tokens are pooled to
+    ``[1, out_char_num]`` and projected to ``out_channels`` by a 1x1 conv.
+    ``block_unit``, ``norm_layer`` and ``sub_norm`` are strings resolved with
+    ``eval`` and must therefore come from trusted configuration only.
+    """
+    def __init__(
+            self,
+            img_size=[32, 100],
+            in_channels=3,
+            embed_dim=[64, 128, 256],
+            depth=[3, 6, 3],
+            num_heads=[2, 4, 8],
+            mixer=['Local'] * 6 + ['Global'] *
+            6,  # Local atten, Global atten, Conv
+            local_mixer=[[7, 11], [7, 11], [7, 11]],
+            patch_merging='Conv',  # Conv, Pool, None
+            mlp_ratio=4,
+            qkv_bias=True,
+            qk_scale=None,
+            drop_rate=0.,
+            last_drop=0.0,
+            attn_drop_rate=0.,
+            drop_path_rate=0.1,
+            norm_layer='nn.LayerNorm',
+            sub_norm='nn.LayerNorm',
+            epsilon=1e-6,
+            out_channels=192,
+            out_char_num=25,
+            block_unit='Block',
+            act='gelu',
+            last_stage=True,
+            sub_num=2,
+            prenorm=True,
+            use_lenhead=False,
+            **kwargs):
+        super().__init__()
+        self.img_size = img_size
+        self.embed_dim = embed_dim
+        self.out_channels = out_channels
+        self.prenorm = prenorm
+        # Anything other than 'Conv' / 'Pool' disables patch merging.
+        patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            in_channels=in_channels,
+            embed_dim=embed_dim[0],
+            sub_num=sub_num)
+        num_patches = self.patch_embed.num_patches
+        # Token grid size after the patch embedding's 2**sub_num reduction.
+        self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)]
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim[0]))
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        # Resolve the block class from its name (e.g. 'Block').
+        Block_unit = eval(block_unit)
+        # One drop-path rate per block, rising linearly from 0 to drop_path_rate.
+        dpr = np.linspace(0, drop_path_rate, sum(depth))
+        # Stage 1: uses the first depth[0] entries of ``mixer`` and ``dpr``.
+        self.blocks1 = nn.ModuleList([
+            Block_unit(
+                dim=embed_dim[0],
+                num_heads=num_heads[0],
+                mixer=mixer[0:depth[0]][i],
+                HW=self.HW,
+                local_mixer=local_mixer[0],
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                act_layer=act,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[0:depth[0]][i],
+                norm_layer=norm_layer,
+                epsilon=epsilon,
+                prenorm=prenorm) for i in range(depth[0])
+        ])
+        if patch_merging is not None:
+            self.sub_sample1 = SubSample(
+                embed_dim[0],
+                embed_dim[1],
+                sub_norm=sub_norm,
+                stride=[2, 1],
+                types=patch_merging)
+            # Stage 2 is configured for half the grid height.
+            HW = [self.HW[0] // 2, self.HW[1]]
+        else:
+            HW = self.HW
+        self.patch_merging = patch_merging
+        # Stage 2: uses the next depth[1] entries of ``mixer`` and ``dpr``.
+        self.blocks2 = nn.ModuleList([
+            Block_unit(
+                dim=embed_dim[1],
+                num_heads=num_heads[1],
+                mixer=mixer[depth[0]:depth[0] + depth[1]][i],
+                HW=HW,
+                local_mixer=local_mixer[1],
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                act_layer=act,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[depth[0]:depth[0] + depth[1]][i],
+                norm_layer=norm_layer,
+                epsilon=epsilon,
+                prenorm=prenorm) for i in range(depth[1])
+        ])
+        if patch_merging is not None:
+            self.sub_sample2 = SubSample(
+                embed_dim[1],
+                embed_dim[2],
+                sub_norm=sub_norm,
+                stride=[2, 1],
+                types=patch_merging)
+            # Stage 3 is configured for a quarter of the grid height.
+            HW = [self.HW[0] // 4, self.HW[1]]
+        else:
+            HW = self.HW
+        # Stage 3: uses the remaining entries of ``mixer`` and ``dpr``.
+        self.blocks3 = nn.ModuleList([
+            Block_unit(
+                dim=embed_dim[2],
+                num_heads=num_heads[2],
+                mixer=mixer[depth[0] + depth[1]:][i],
+                HW=HW,
+                local_mixer=local_mixer[2],
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                act_layer=act,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[depth[0] + depth[1]:][i],
+                norm_layer=norm_layer,
+                epsilon=epsilon,
+                prenorm=prenorm) for i in range(depth[2])
+        ])
+        self.last_stage = last_stage
+        if last_stage:
+            # Output head: pool to [1, out_char_num], 1x1 conv, hard-swish, dropout.
+            self.avg_pool = nn.AdaptiveAvgPool2d([1, out_char_num])
+            self.last_conv = nn.Conv2d(
+                in_channels=embed_dim[2],
+                out_channels=self.out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=False)
+            self.hardswish = Activation('hard_swish', inplace=True) #nn.Hardswish()
+            # self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer")
+            self.dropout = nn.Dropout(p=last_drop)
+        if not prenorm:
+            # A final norm is only created (and used) when prenorm is False.
+            self.norm = eval(norm_layer)(embed_dim[-1], eps=epsilon)
+        self.use_lenhead = use_lenhead
+        if use_lenhead:
+            # Optional second head computed from the token-averaged features.
+            self.len_conv = nn.Linear(embed_dim[2], self.out_channels)
+            self.hardswish_len = Activation('hard_swish', inplace=True)# nn.Hardswish()
+            self.dropout_len = nn.Dropout(
+                p=last_drop)
+        torch.nn.init.xavier_normal_(self.pos_embed)
+        # Re-initialise every sub-module's weights with the rules below.
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        """Initialise one sub-module; called for each module via ``apply``."""
+        # weight initialization
+        if isinstance(m, nn.Conv2d):
+            nn.init.kaiming_normal_(m.weight, mode='fan_out')
+            if m.bias is not None:
+                nn.init.zeros_(m.bias)
+        elif isinstance(m, nn.BatchNorm2d):
+            nn.init.ones_(m.weight)
+            nn.init.zeros_(m.bias)
+        elif isinstance(m, nn.Linear):
+            nn.init.normal_(m.weight, 0, 0.01)
+            if m.bias is not None:
+                nn.init.zeros_(m.bias)
+        elif isinstance(m, nn.ConvTranspose2d):
+            nn.init.kaiming_normal_(m.weight, mode='fan_out')
+            if m.bias is not None:
+                nn.init.zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.ones_(m.weight)
+            nn.init.zeros_(m.bias)
+    def forward_features(self, x):
+        """Embed the image and run the three block stages; returns tokens."""
+        x = self.patch_embed(x)
+        x = x + self.pos_embed
+        x = self.pos_drop(x)
+        for blk in self.blocks1:
+            x = blk(x)
+        if self.patch_merging is not None:
+            # Tokens -> (B, C, H, W) feature map so SubSample can down-sample it.
+            x = self.sub_sample1(
+                x.permute(0, 2, 1).reshape(
+                    [-1, self.embed_dim[0], self.HW[0], self.HW[1]]))
+        for blk in self.blocks2:
+            x = blk(x)
+        if self.patch_merging is not None:
+            x = self.sub_sample2(
+                x.permute(0, 2, 1).reshape(
+                    [-1, self.embed_dim[1], self.HW[0] // 2, self.HW[1]]))
+        for blk in self.blocks3:
+            x = blk(x)
+        if not self.prenorm:
+            x = self.norm(x)
+        return x
+    def forward(self, x):
+        """Return the backbone output, or ``(output, len_x)`` with ``use_lenhead``.
+        Without ``last_stage`` the token sequence from ``forward_features`` is
+        returned as is.
+        """
+        x = self.forward_features(x)
+        if self.use_lenhead:
+            # Length head works on the mean over the token dimension.
+            len_x = self.len_conv(x.mean(1))
+            len_x = self.dropout_len(self.hardswish_len(len_x))
+        if self.last_stage:
+            # Grid height assumed for the reshape: a quarter of the original
+            # when patch merging is enabled, unchanged otherwise.
+            if self.patch_merging is not None:
+                h = self.HW[0] // 4
+            else:
+                h = self.HW[0]
+            x = self.avg_pool(
+                x.permute(0, 2, 1).reshape(
+                    [-1, self.embed_dim[2], h, self.HW[1]]))
+            x = self.last_conv(x)
+            x = self.hardswish(x)
+            x = self.dropout(x)
+        if self.use_lenhead:
+            return x, len_x
+        return x
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_vitstr.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_vitstr.py
+"""
+This code is adapted from:
+https://github.com/roatienza/deep-text-recognition-benchmark/blob/master/modules/vitstr.py
+"""
+import numpy as np
+import torch
+import torch.nn as nn
+from pytorchocr.modeling.backbones.rec_svtrnet import Block, PatchEmbed
+# import paddle
+# import paddle.nn as nn
+# from ppocr.modeling.backbones.rec_svtrnet import Block, PatchEmbed, zeros_, trunc_normal_, ones_
+# Maps a ViTSTR scale name to [embed_dim, num_heads]; ViTSTR.__init__ uses it
+# as the fallback when those arguments are left as None.
+scale_dim_heads = {'tiny': [192, 3], 'small': [384, 6], 'base': [768, 12]}
+class ViTSTR(nn.Module):
+    def __init__(self,
+                 img_size=[224, 224],
+                 in_channels=1,
+                 scale='tiny',
+                 seqlen=27,
+                 patch_size=[16, 16],
+                 embed_dim=None,
+                 depth=12,
+                 num_heads=None,
+                 mlp_ratio=4,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_path_rate=0.,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 norm_layer='nn.LayerNorm',
+                 act_layer='gelu',
+                 epsilon=1e-6,
+                 out_channels=None,
+                 **kwargs):
+        super().__init__()
+        self.seqlen = seqlen
+        embed_dim = embed_dim if embed_dim is not None else scale_dim_heads[
+            scale][0]
+        num_heads = num_heads if num_heads is not None else scale_dim_heads[
+            scale][1]
+        out_channels = out_channels if out_channels is not None else embed_dim
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            in_channels=in_channels,
+            embed_dim=embed_dim,
+            patch_size=patch_size,
+            mode='linear')
+        num_patches = self.patch_embed.num_patches
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        dpr = np.linspace(0, drop_path_rate, depth)
+        self.blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                epsilon=epsilon,
+                prenorm=False) for i in range(depth)
+        ])
+        self.norm = eval(norm_layer)(embed_dim, eps=epsilon)
+        self.out_channels = out_channels
+        torch.nn.init.xavier_normal_(self.pos_embed)
+        torch.nn.init.xavier_normal_(self.cls_token)
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        """Initialise the parameters of one sub-module (passed to ``self.apply``).
+        Conv2d / ConvTranspose2d weights are drawn with Kaiming-normal
+        (``mode='fan_out'``), Linear weights from N(0, 0.01), and
+        BatchNorm2d / LayerNorm are reset to weight 1 and bias 0. Existing
+        biases of conv and linear layers are zeroed. Other module types are
+        left untouched.
+        Args:
+            m (nn.Module): the module to initialise in place.
+        """
+        if isinstance(m, (nn.BatchNorm2d, nn.LayerNorm)):
+            # Normalisation layers start as the identity affine transform.
+            nn.init.ones_(m.weight)
+            nn.init.zeros_(m.bias)
+            return
+        if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
+            nn.init.kaiming_normal_(m.weight, mode='fan_out')
+        elif isinstance(m, nn.Linear):
+            nn.init.normal_(m.weight, 0, 0.01)
+        else:
+            return
+        # Shared tail for conv and linear layers: bias is optional.
+        if m.bias is not None:
+            nn.init.zeros_(m.bias)
+    def forward_features(self, x):
+        """Embed the input, prepend the class token and run every block.
+        Args:
+            x: input handed to ``self.patch_embed``; its first dimension is
+                taken as the batch size.
+        Returns:
+            The output of ``self.norm`` applied to the token sequence after
+            all blocks (class token at index 0 of dimension 1).
+        """
+        batch_size = x.shape[0]
+        tokens = self.patch_embed(x)
+        # One copy of the class-token parameter per sample, placed first.
+        cls_tokens = self.cls_token.repeat(batch_size, 1, 1)
+        tokens = torch.cat((cls_tokens, tokens), dim=1)
+        tokens = self.pos_drop(tokens + self.pos_embed)
+        for block in self.blocks:
+            tokens = block(tokens)
+        return self.norm(tokens)
+    def forward(self, x):
+        """Return the leading output tokens laid out as a 4-D feature map.
+        The 3-D (batch, tokens, channels) result of ``forward_features`` is
+        cut to at most ``self.seqlen`` tokens and rearranged to
+        (batch, channels, 1, tokens).
+        """
+        tokens = self.forward_features(x)[:, :self.seqlen]
+        # Swap the token and channel axes, then insert a singleton height axis.
+        return tokens.transpose(1, 2).unsqueeze(2)
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/table_mobilenet_v3.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/table_mobilenet_v3.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+__all__ = ['MobileNetV3']
+def make_divisible(v, divisor=8, min_value=None):
+    """Round ``v`` to a multiple of ``divisor``, never below ``min_value``.
+    ``min_value`` defaults to ``divisor``. If rounding lands more than 10%
+    below ``v``, one extra ``divisor`` is added.
+    """
+    floor = divisor if min_value is None else min_value
+    rounded = int(v + divisor / 2) // divisor * divisor
+    result = max(floor, rounded)
+    # Do not let rounding shrink the value by more than 10%.
+    return result + divisor if result < 0.9 * v else result
+def hard_sigmoid(x, slope=0.1666667, offset=0.5,):
+    """Hard sigmoid: ``slope * x + offset`` clamped to the range [0, 1]."""
+    linear = slope * x + offset
+    return torch.clamp(linear, min=0., max=1.)
+def hard_swish(x, inplace=True):
+    """Hard swish: ``x * relu6(x + 3) / 6``.
+    ``inplace`` is forwarded to ``relu6``, which operates on the temporary
+    ``x + 3``.
+    """
+    gate = F.relu6(x + 3., inplace=inplace)
+    return x * gate / 6.
+class MobileNetV3(nn.Module):
+    def __init__(self,
+                 in_channels=3,
+                 model_name='large',
+                 scale=0.5,
+                 disable_se=False,
+                 **kwargs):
+        """
+        Build a multi-stage MobilenetV3 backbone.
+        Args:
+            in_channels(int): channel count of the input.
+            model_name(str): 'large' or 'small'; selects the block table.
+            scale(float): channel width multiplier, one of
+                0.35, 0.5, 0.75, 1.0, 1.25.
+            disable_se(bool): when True no block uses its SE module.
+            **kwargs: accepted and ignored.
+        Raises:
+            NotImplementedError: if ``model_name`` is not 'large' or 'small'.
+        """
+        super(MobileNetV3, self).__init__()
+        self.disable_se = disable_se
+        # Each row: kernel, expansion channels, output channels, use SE,
+        # activation, stride.
+        if model_name == "large":
+            block_specs = (
+                (3, 16, 16, False, 'relu', 1),
+                (3, 64, 24, False, 'relu', 2),
+                (3, 72, 24, False, 'relu', 1),
+                (5, 72, 40, True, 'relu', 2),
+                (5, 120, 40, True, 'relu', 1),
+                (5, 120, 40, True, 'relu', 1),
+                (3, 240, 80, False, 'hardswish', 2),
+                (3, 200, 80, False, 'hardswish', 1),
+                (3, 184, 80, False, 'hardswish', 1),
+                (3, 184, 80, False, 'hardswish', 1),
+                (3, 480, 112, True, 'hardswish', 1),
+                (3, 672, 112, True, 'hardswish', 1),
+                (5, 672, 160, True, 'hardswish', 2),
+                (5, 960, 160, True, 'hardswish', 1),
+                (5, 960, 160, True, 'hardswish', 1),
+            )
+            cls_ch_squeeze = 960
+        elif model_name == "small":
+            block_specs = (
+                (3, 16, 16, True, 'relu', 2),
+                (3, 72, 24, False, 'relu', 2),
+                (3, 88, 24, False, 'relu', 1),
+                (5, 96, 40, True, 'hardswish', 2),
+                (5, 240, 40, True, 'hardswish', 1),
+                (5, 240, 40, True, 'hardswish', 1),
+                (5, 120, 48, True, 'hardswish', 1),
+                (5, 144, 48, True, 'hardswish', 1),
+                (5, 288, 96, True, 'hardswish', 2),
+                (5, 576, 96, True, 'hardswish', 1),
+                (5, 576, 96, True, 'hardswish', 1),
+            )
+            cls_ch_squeeze = 576
+        else:
+            raise NotImplementedError("mode[" + model_name +
+                                      "_model] is not implemented!")
+        supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
+        assert scale in supported_scale, \
+            "supported scale are {} but input scale is {}".format(supported_scale, scale)
+        # Stem convolution; `channels` tracks the running channel count.
+        channels = make_divisible(16 * scale)
+        self.conv = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=channels,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            groups=1,
+            if_act=True,
+            act='hardswish',
+            name='conv1')
+        self.stages = nn.ModuleList()
+        self.out_channels = []
+        # Stride-2 blocks at or before this index do not open a new stage.
+        first_split = 2 if model_name == 'large' else 0
+        pending = []
+        for idx, (k, exp, c, se, nl, s) in enumerate(block_specs):
+            if s == 2 and idx > first_split:
+                # Close the current stage before the down-sampling block.
+                self.out_channels.append(channels)
+                self.stages.append(nn.Sequential(*pending))
+                pending = []
+            block_out = make_divisible(scale * c)
+            pending.append(
+                ResidualUnit(
+                    in_channels=channels,
+                    mid_channels=make_divisible(scale * exp),
+                    out_channels=block_out,
+                    kernel_size=k,
+                    stride=s,
+                    use_se=se and not self.disable_se,
+                    act=nl,
+                    name="conv" + str(idx + 2)))
+            channels = block_out
+        # The final 1x1 convolution is appended to the last stage.
+        last_channels = make_divisible(scale * cls_ch_squeeze)
+        pending.append(
+            ConvBNLayer(
+                in_channels=channels,
+                out_channels=last_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                groups=1,
+                if_act=True,
+                act='hardswish',
+                name='conv_last'))
+        self.stages.append(nn.Sequential(*pending))
+        self.out_channels.append(last_channels)
+    def forward(self, x):
+        """Return the list of outputs of every stage, in stage order."""
+        x = self.conv(x)
+        features = []
+        for stage in self.stages:
+            x = stage(x)
+            features.append(x)
+        return features
+class ConvBNLayer(nn.Module):
+    """Bias-free Conv2d followed by BatchNorm2d and an optional activation.
+    Args:
+        in_channels, out_channels, kernel_size, stride, padding, groups:
+            forwarded to ``nn.Conv2d``.
+        if_act (bool): apply an activation after the batch norm.
+        act (str): 'relu' or 'hardswish'; only read when ``if_act`` is True.
+        name: accepted for compatibility with the call sites; unused.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 groups=1,
+                 if_act=True,
+                 act=None,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+        self.if_act = if_act
+        self.act = act
+        self.conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias=False)
+        self.bn = nn.BatchNorm2d(
+            out_channels,
+        )
+    def forward(self, x):
+        """Apply conv, batch norm and, if enabled, the configured activation.
+        Raises:
+            NotImplementedError: if ``if_act`` is True and ``act`` is neither
+                'relu' nor 'hardswish'.
+        """
+        x = self.bn(self.conv(x))
+        if not self.if_act:
+            return x
+        if self.act == "relu":
+            return F.relu(x)
+        if self.act == "hardswish":
+            return hard_swish(x)
+        # Raise instead of print + exit(): a layer must never terminate the
+        # host process, and callers need an exception they can handle.
+        raise NotImplementedError(
+            "The activation function({}) is selected incorrectly.".format(
+                self.act))
+class ResidualUnit(nn.Module):
+    """1x1 expand conv -> depthwise conv -> optional SE -> 1x1 linear conv.
+    The input is added back to the output when ``stride == 1`` and
+    ``in_channels == out_channels``.
+    Args:
+        in_channels, mid_channels, out_channels: channel counts of the input,
+            the expanded representation and the output.
+        kernel_size, stride: parameters of the depthwise convolution.
+        use_se (bool): insert an ``SEModule`` after the depthwise convolution.
+        act: activation name used by the expand and depthwise convolutions.
+        name (str): prefix forwarded to the sub-layers.
+    """
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 use_se,
+                 act=None,
+                 name=''):
+        super(ResidualUnit, self).__init__()
+        self.if_shortcut = stride == 1 and in_channels == out_channels
+        self.if_se = use_se
+        # Padding that keeps the spatial size for odd kernels at stride 1.
+        same_padding = int((kernel_size - 1) // 2)
+        self.expand_conv = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=mid_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            if_act=True,
+            act=act,
+            name=name + "_expand")
+        # groups == channels makes this convolution depthwise.
+        self.bottleneck_conv = ConvBNLayer(
+            in_channels=mid_channels,
+            out_channels=mid_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=same_padding,
+            groups=mid_channels,
+            if_act=True,
+            act=act,
+            name=name + "_depthwise")
+        if self.if_se:
+            self.mid_se = SEModule(mid_channels, name=name + "_se")
+        self.linear_conv = ConvBNLayer(
+            in_channels=mid_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            if_act=False,
+            act=None,
+            name=name + "_linear")
+    def forward(self, inputs):
+        """Run the unit and add the input back when the shortcut is enabled."""
+        out = self.bottleneck_conv(self.expand_conv(inputs))
+        if self.if_se:
+            out = self.mid_se(out)
+        out = self.linear_conv(out)
+        return torch.add(inputs, out) if self.if_shortcut else out
+class SEModule(nn.Module):
+    """Squeeze-and-excitation gate that rescales the channels of its input.
+    Args:
+        in_channels (int): channels of the input.
+        reduction (int): the hidden 1x1 conv has ``in_channels // reduction``
+            channels.
+        name: accepted for compatibility with the call sites; unused.
+    """
+    def __init__(self, in_channels, reduction=4, name=""):
+        super(SEModule, self).__init__()
+        squeezed = in_channels // reduction
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.conv1 = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=squeezed,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True)
+        self.conv2 = nn.Conv2d(
+            in_channels=squeezed,
+            out_channels=in_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True)
+    def forward(self, inputs):
+        """Multiply ``inputs`` by a per-channel gate in [0, 1]."""
+        gate = self.avg_pool(inputs)
+        gate = F.relu(self.conv1(gate))
+        gate = hard_sigmoid(self.conv2(gate), slope=0.2, offset=0.5)
+        return inputs * gate
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/table_resnet_vd.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/table_resnet_vd.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+__all__ = ["ResNet"]
+class ConvBNLayer(nn.Module):
+    """Optional 2x2 average pool, bias-free Conv2d, BatchNorm2d, optional activation.
+    Args:
+        in_channels, out_channels, kernel_size, stride, groups: forwarded to
+            ``nn.Conv2d``; padding is ``(kernel_size - 1) // 2``.
+        is_vd_mode (bool): average-pool the input (kernel 2, stride 2,
+            ``ceil_mode=True``) before the convolution.
+        act (str): activation name handed to ``Activation``; None disables it.
+        name: accepted for compatibility with the call sites; unused.
+    """
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            groups=1,
+            is_vd_mode=False,
+            act=None,
+            name=None, ):
+        super(ConvBNLayer, self).__init__()
+        self.is_vd_mode = is_vd_mode
+        self._pool2d_avg = nn.AvgPool2d(
+            kernel_size=2, stride=2, padding=0, ceil_mode=True)
+        self._conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=(kernel_size - 1) // 2,
+            groups=groups,
+            bias=False)
+        # The former `bn_name = "bn" + name[3:]` was never used and raised
+        # TypeError for the default name=None, so it has been removed.
+        self._batch_norm = nn.BatchNorm2d(
+            out_channels,
+        )
+        self.act = act
+        if self.act is not None:
+            self._act = Activation(act, inplace=True)
+    def forward(self, inputs):
+        """Apply (pool ->) conv -> batch norm (-> activation)."""
+        if self.is_vd_mode:
+            inputs = self._pool2d_avg(inputs)
+        y = self._batch_norm(self._conv(inputs))
+        if self.act is not None:
+            y = self._act(y)
+        return y
+class BottleneckBlock(nn.Module):
+    """Residual block: 1x1 conv -> 3x3 conv -> 1x1 conv with 4x output channels.
+    Args:
+        in_channels (int): channels of the input.
+        out_channels (int): channels of the two inner convolutions; the block
+            outputs ``out_channels * 4`` channels.
+        stride (int): stride of the 3x3 convolution.
+        shortcut (bool): True adds the input unchanged; False routes it
+            through a 1x1 ``ConvBNLayer`` first.
+        if_first (bool): when True the shortcut layer is built with
+            ``is_vd_mode=False``, otherwise with ``is_vd_mode=True``.
+        name (str): prefix for the sub-layer names; None is treated as ''.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride,
+                 shortcut=True,
+                 if_first=False,
+                 name=None):
+        super(BottleneckBlock, self).__init__()
+        # The default name=None used to raise TypeError on `name + "..."`.
+        prefix = "" if name is None else name
+        self.conv0 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            act='relu',
+            name=prefix + "_branch2a")
+        self.conv1 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=stride,
+            act='relu',
+            name=prefix + "_branch2b")
+        self.conv2 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=out_channels * 4,
+            kernel_size=1,
+            act=None,
+            name=prefix + "_branch2c")
+        if not shortcut:
+            self.short = ConvBNLayer(
+                in_channels=in_channels,
+                out_channels=out_channels * 4,
+                kernel_size=1,
+                stride=1,
+                is_vd_mode=not if_first,
+                name=prefix + "_branch1")
+        self.shortcut = shortcut
+    def forward(self, inputs):
+        """Return ``relu(shortcut(inputs) + conv2(conv1(conv0(inputs))))``."""
+        y = self.conv0(inputs)
+        conv1 = self.conv1(y)
+        conv2 = self.conv2(conv1)
+        short = inputs if self.shortcut else self.short(inputs)
+        return F.relu(torch.add(short, conv2))
+class BasicBlock(nn.Module):
+    """Residual block made of two 3x3 convolutions.
+    Args:
+        in_channels (int): channels of the input.
+        out_channels (int): channels of both convolutions and of the output.
+        stride (int): stride of the first convolution.
+        shortcut (bool): True adds the input unchanged; False routes it
+            through a 1x1 ``ConvBNLayer`` first.
+        if_first (bool): when True the shortcut layer is built with
+            ``is_vd_mode=False``, otherwise with ``is_vd_mode=True``.
+        name (str): prefix for the sub-layer names; None is treated as ''.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride,
+                 shortcut=True,
+                 if_first=False,
+                 name=None):
+        super(BasicBlock, self).__init__()
+        # The default name=None used to raise TypeError on `name + "..."`.
+        prefix = "" if name is None else name
+        self.stride = stride
+        self.conv0 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=stride,
+            act='relu',
+            name=prefix + "_branch2a")
+        self.conv1 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            act=None,
+            name=prefix + "_branch2b")
+        if not shortcut:
+            self.short = ConvBNLayer(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+                stride=1,
+                is_vd_mode=not if_first,
+                name=prefix + "_branch1")
+        self.shortcut = shortcut
+    def forward(self, inputs):
+        """Return ``relu(shortcut(inputs) + conv1(conv0(inputs)))``."""
+        y = self.conv0(inputs)
+        conv1 = self.conv1(y)
+        short = inputs if self.shortcut else self.short(inputs)
+        return F.relu(torch.add(short, conv1))
+class ResNet(nn.Module):
+    """ResNet backbone with a three-convolution stem and four stages.
+    ``forward`` returns the output of every stage; ``self.out_channels``
+    lists the channel count of each of them.
+    Args:
+        in_channels (int): channels of the input.
+        layers (int): one of 18, 34, 50, 101, 152, 200. Values of 50 and
+            above use ``BottleneckBlock``, smaller ones ``BasicBlock``.
+        **kwargs: accepted and ignored.
+    """
+    def __init__(self, in_channels=3, layers=50, **kwargs):
+        super(ResNet, self).__init__()
+        self.layers = layers
+        supported_layers = [18, 34, 50, 101, 152, 200]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(
+                supported_layers, layers)
+        # Number of residual blocks in each of the four stages.
+        depth = {
+            18: [2, 2, 2, 2],
+            34: [3, 4, 6, 3],
+            50: [3, 4, 6, 3],
+            101: [3, 4, 23, 3],
+            152: [3, 8, 36, 3],
+            200: [3, 12, 48, 3],
+        }[layers]
+        use_bottleneck = layers >= 50
+        block_cls = BottleneckBlock if use_bottleneck else BasicBlock
+        # Bottleneck blocks output four times their nominal width.
+        expansion = 4 if use_bottleneck else 1
+        num_channels = [64, 256, 512,
+                        1024] if use_bottleneck else [64, 64, 128, 256]
+        num_filters = [64, 128, 256, 512]
+        # Stem: three 3x3 convolutions, the first one with stride 2.
+        stem_specs = (
+            ("conv1_1", in_channels, 32, 2),
+            ("conv1_2", 32, 32, 1),
+            ("conv1_3", 32, 64, 1),
+        )
+        for stem_name, stem_in, stem_out, stem_stride in stem_specs:
+            setattr(self, stem_name, ConvBNLayer(
+                in_channels=stem_in,
+                out_channels=stem_out,
+                kernel_size=3,
+                stride=stem_stride,
+                act='relu',
+                name=stem_name))
+        self.pool2d_max = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.stages = nn.ModuleList()
+        self.out_channels = []
+        for block, num_blocks in enumerate(depth):
+            stage = nn.Sequential()
+            width = num_filters[block]
+            for i in range(num_blocks):
+                if layers in [101, 152] and block == 2:
+                    suffix = "a" if i == 0 else "b" + str(i)
+                else:
+                    suffix = chr(97 + i)
+                conv_name = "res" + str(block + 2) + suffix
+                unit = block_cls(
+                    in_channels=num_channels[block]
+                    if i == 0 else width * expansion,
+                    out_channels=width,
+                    # Every stage but the first down-samples in its first block.
+                    stride=2 if i == 0 and block != 0 else 1,
+                    # Only the first block of a stage projects its shortcut.
+                    shortcut=i != 0,
+                    if_first=block == i == 0,
+                    name=conv_name)
+                stage.add_module('bb_%d_%d' % (block, i), unit)
+            self.out_channels.append(width * expansion)
+            self.stages.append(stage)
+    def forward(self, inputs):
+        """Return the list of outputs of the four stages, in order."""
+        y = self.conv1_3(self.conv1_2(self.conv1_1(inputs)))
+        y = self.pool2d_max(y)
+        out = []
+        for stage in self.stages:
+            y = stage(y)
+            out.append(y)
+        return out
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class Hswish(nn.Module):
+    """Hard swish activation: ``x * relu6(x + 3) / 6``.
+    ``inplace`` is forwarded to ``relu6``, which works on the temporary
+    ``x + 3``.
+    """
+    def __init__(self, inplace=True):
+        super(Hswish, self).__init__()
+        self.inplace = inplace
+    def forward(self, x):
+        gate = F.relu6(x + 3., inplace=self.inplace)
+        return x * gate / 6.
+# out = max(0, min(1, slope*x+offset))
+# paddle.fluid.layers.hard_sigmoid(x, slope=0.2, offset=0.5, name=None)
+class Hsigmoid(nn.Module):
+    """Hard sigmoid with slope 0.2: ``relu6(1.2 * x + 3) / 6``.
+    This equals ``clamp(0.2 * x + 0.5, 0, 1)``. The commonly used torch form
+    ``relu6(x + 3) / 6`` has a different slope and is deliberately not used.
+    """
+    def __init__(self, inplace=True):
+        super(Hsigmoid, self).__init__()
+        self.inplace = inplace
+    def forward(self, x):
+        shifted = 1.2 * x + 3.
+        return F.relu6(shifted, inplace=self.inplace) / 6.
+class GELU(nn.Module):
+    """GELU activation module.
+    ``inplace`` is stored for interface parity with the other activations
+    but has no effect on the computation.
+    """
+    def __init__(self, inplace=True):
+        super(GELU, self).__init__()
+        self.inplace = inplace
+    def forward(self, x):
+        return F.gelu(x)
+class Swish(nn.Module):
+    """Swish activation: ``x * sigmoid(x)``.
+    With ``inplace=True`` (the default) the input tensor itself is
+    overwritten with the result and returned.
+    """
+    def __init__(self, inplace=True):
+        super(Swish, self).__init__()
+        self.inplace = inplace
+    def forward(self, x):
+        gate = torch.sigmoid(x)
+        if not self.inplace:
+            return x * gate
+        # In-place variant: mutates the caller's tensor.
+        x.mul_(gate)
+        return x
+class Activation(nn.Module):
+    """Wrap the activation selected by a case-insensitive name.
+    Supported names: 'relu', 'relu6', 'hard_sigmoid', 'hard_swish',
+    'hswish', 'leakyrelu', 'gelu', 'swish'.
+    Args:
+        act_type (str): activation name.
+        inplace (bool): forwarded to the selected activation.
+    Raises:
+        NotImplementedError: for 'sigmoid' and for any unknown name.
+    """
+    def __init__(self, act_type, inplace=True):
+        super(Activation, self).__init__()
+        key = act_type.lower()
+        # Factories are lazy so only the requested activation is built.
+        builders = {
+            'relu': lambda: nn.ReLU(inplace=inplace),
+            'relu6': lambda: nn.ReLU6(inplace=inplace),
+            'hard_sigmoid': lambda: Hsigmoid(inplace),
+            'hard_swish': lambda: Hswish(inplace=inplace),
+            'hswish': lambda: Hswish(inplace=inplace),
+            'leakyrelu': lambda: nn.LeakyReLU(inplace=inplace),
+            'gelu': lambda: GELU(inplace=inplace),
+            'swish': lambda: Swish(inplace=inplace),
+        }
+        if key not in builders:
+            # Covers 'sigmoid' too, which has no implementation here.
+            raise NotImplementedError
+        self.act = builders[key]()
+    def forward(self, inputs):
+        return self.act(inputs)
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+__all__ = ['build_head']
+def build_head(config, **kwargs):
+    """Instantiate the head class named by ``config['name']``.
+    Args:
+        config (dict): must contain 'name'; that key is popped (the dict is
+            modified in place) and the remaining items are passed to the
+            head constructor as keyword arguments.
+        **kwargs: extra keyword arguments for the head constructor.
+    Returns:
+        The constructed head module.
+    Raises:
+        KeyError: if ``config`` has no 'name' entry.
+        AssertionError: if the name is not a supported head.
+    """
+    # det head
+    from .det_db_head import DBHead, PFHeadLocal
+    from .det_east_head import EASTHead
+    from .det_sast_head import SASTHead
+    from .det_pse_head import PSEHead
+    from .det_fce_head import FCEHead
+    from .e2e_pg_head import PGHead
+    # rec head
+    from .rec_ctc_head import CTCHead
+    from .rec_att_head import AttentionHead
+    from .rec_srn_head import SRNHead
+    from .rec_nrtr_head import Transformer
+    from .rec_sar_head import SARHead
+    from .rec_can_head import CANHead
+    from .rec_multi_head import MultiHead
+    # cls head
+    from .cls_head import ClsHead
+    # table head
+    from .table_att_head import TableAttentionHead
+    # Explicit name -> class table: replaces eval() on a config-supplied
+    # string, so only these classes can ever be instantiated.
+    head_classes = {
+        'DBHead': DBHead,
+        'PSEHead': PSEHead,
+        'EASTHead': EASTHead,
+        'SASTHead': SASTHead,
+        'CTCHead': CTCHead,
+        'ClsHead': ClsHead,
+        'AttentionHead': AttentionHead,
+        'SRNHead': SRNHead,
+        'PGHead': PGHead,
+        'Transformer': Transformer,
+        'TableAttentionHead': TableAttentionHead,
+        'SARHead': SARHead,
+        'FCEHead': FCEHead,
+        'CANHead': CANHead,
+        'MultiHead': MultiHead,
+        'PFHeadLocal': PFHeadLocal,
+    }
+    support_dict = list(head_classes)
+    module_name = config.pop('name')
+    if module_name not in support_dict:
+        # Raised explicitly (rather than via `assert`) so the check also
+        # runs under `python -O`; the exception type is unchanged.
+        raise AssertionError('head only support {}'.format(support_dict))
+    module_class = head_classes[module_name](**config, **kwargs)
+    return module_class
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py
+import os, sys
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class ClsHead(nn.Module):
+    """
+    Classification head: global average pool followed by a linear layer.
+    The module sets ``training = False`` on construction, so softmax is
+    applied to the output unless ``train()`` is called.
+    Args:
+        in_channels(int): channels of the input feature map.
+        class_dim(int): number of output classes.
+        **kwargs: accepted and ignored.
+    """
+    def __init__(self, in_channels, class_dim, **kwargs):
+        super(ClsHead, self).__init__()
+        # Start in inference mode.
+        self.training = False
+        self.pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Linear(in_channels, class_dim, bias=True)
+    def forward(self, x):
+        pooled = self.pool(x)
+        # Drop the pooled 1x1 spatial dimensions.
+        flat = torch.reshape(pooled, shape=[pooled.shape[0], pooled.shape[1]])
+        logits = self.fc(flat)
+        if self.training:
+            return logits
+        return F.softmax(logits, dim=1)
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py
+import os, sys
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+from pytorchocr.modeling.backbones.det_mobilenet_v3 import ConvBNLayer
+class Head(nn.Module):
+    """3x3 conv followed by two stride-2 transposed convs and a sigmoid.
+    The first two layers are each followed by batch norm and ReLU and work
+    with ``in_channels // 4`` channels; the last layer outputs one channel.
+    Args:
+        in_channels (int): channels of the input feature map.
+        **kwargs: accepted and ignored.
+    """
+    def __init__(self, in_channels, **kwargs):
+        super(Head, self).__init__()
+        mid_channels = in_channels // 4
+        self.conv1 = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=mid_channels,
+            kernel_size=3,
+            padding=1,
+            bias=False)
+        self.conv_bn1 = nn.BatchNorm2d(mid_channels)
+        self.relu1 = Activation(act_type='relu')
+        self.conv2 = nn.ConvTranspose2d(
+            in_channels=mid_channels,
+            out_channels=mid_channels,
+            kernel_size=2,
+            stride=2)
+        self.conv_bn2 = nn.BatchNorm2d(mid_channels)
+        self.relu2 = Activation(act_type='relu')
+        self.conv3 = nn.ConvTranspose2d(
+            in_channels=mid_channels,
+            out_channels=1,
+            kernel_size=2,
+            stride=2)
+    def forward(self, x, return_f=False):
+        """Return the sigmoid map, plus the pre-``conv3`` features if asked.
+        Args:
+            x: input feature map.
+            return_f: only the literal ``True`` makes the method return
+                ``(map, features)`` instead of just ``map``.
+        """
+        feat = self.relu1(self.conv_bn1(self.conv1(x)))
+        feat = self.relu2(self.conv_bn2(self.conv2(feat)))
+        prob = torch.sigmoid(self.conv3(feat))
+        if return_f is True:
+            return prob, feat
+        return prob
+class DBHead(nn.Module):
+    """
+    Differentiable Binarization (DB) for text detection:
+        see https://arxiv.org/abs/1911.08947
+    args:
+        in_channels(int): channels of the input feature map.
+        k(number): steepness factor used by ``step_function``.
+        **kwargs: forwarded to both ``Head`` branches.
+    """
+    def __init__(self, in_channels, k=50, **kwargs):
+        super(DBHead, self).__init__()
+        self.k = k
+        # Two independent branches built with the same arguments.
+        self.binarize = Head(in_channels, **kwargs)
+        self.thresh = Head(in_channels, **kwargs)
+    def step_function(self, x, y):
+        """Return ``1 / (1 + exp(-k * (x - y)))``."""
+        scaled_diff = -self.k * (x - y)
+        return torch.reciprocal(1 + torch.exp(scaled_diff))
+    def forward(self, x):
+        """Return ``{'maps': ...}``.
+        In eval mode 'maps' is the output of the ``binarize`` branch. In
+        training mode it is the concatenation, along dim 1, of the binarize
+        output, the ``thresh`` output and ``step_function`` of the two.
+        """
+        shrink_maps = self.binarize(x)
+        if self.training:
+            threshold_maps = self.thresh(x)
+            binary_maps = self.step_function(shrink_maps, threshold_maps)
+            y = torch.cat([shrink_maps, threshold_maps, binary_maps], dim=1)
+            return {'maps': y}
+        return {'maps': shrink_maps}
+class LocalModule(nn.Module):
+    """Refine an initial map with a 3x3 ConvBNLayer followed by a 1x1 conv.
+    Args:
+        in_c (int): channels of the feature input ``x``; the first layer
+            takes ``in_c + 1`` channels because ``init_map`` is concatenated.
+        mid_c (int): channels between the two layers.
+        use_distance: accepted for interface compatibility; unused.
+    """
+    def __init__(self, in_c, mid_c, use_distance=True):
+        # Name the class explicitly: super(self.__class__, self) recurses
+        # forever as soon as LocalModule is subclassed.
+        super(LocalModule, self).__init__()
+        self.last_3 = ConvBNLayer(in_c + 1, mid_c, 3, 1, 1, act='relu')
+        self.last_1 = nn.Conv2d(mid_c, 1, 1, 1, 0)
+    def forward(self, x, init_map, distance_map):
+        """Return a one-channel map computed from ``init_map`` and ``x``.
+        ``distance_map`` is accepted for interface compatibility; unused.
+        """
+        outf = torch.cat([init_map, x], dim=1)
+        # last Conv
+        out = self.last_1(self.last_3(outf))
+        return out
+class PFHeadLocal(DBHead):
+    """DBHead variant that adds a ``LocalModule`` refinement branch.
+    Args:
+        in_channels (int): channels of the input feature map.
+        k: steepness factor passed to ``DBHead``.
+        mode (str): 'large' or 'small'; selects the hidden width of the
+            refinement branch (``in_channels // 4`` or ``in_channels // 8``).
+        **kwargs: forwarded to ``DBHead``.
+    Raises:
+        ValueError: if ``mode`` is neither 'large' nor 'small'.
+    """
+    def __init__(self, in_channels, k=50, mode='small', **kwargs):
+        super(PFHeadLocal, self).__init__(in_channels, k, **kwargs)
+        self.mode = mode
+        # torch.nn has no `interpolate`; nn.Upsample is the module form of
+        # F.interpolate and is what the call in forward() needs.
+        self.up_conv = nn.Upsample(scale_factor=2, mode="nearest")
+        if self.mode == 'large':
+            self.cbn_layer = LocalModule(in_channels // 4, in_channels // 4)
+        elif self.mode == 'small':
+            self.cbn_layer = LocalModule(in_channels // 4, in_channels // 8)
+        else:
+            # Fail here instead of with a missing `cbn_layer` in forward().
+            raise ValueError(
+                "mode must be 'large' or 'small', got {!r}".format(mode))
+    def forward(self, x, targets=None):
+        """Return the detection maps as a dict.
+        In eval mode: 'maps' is the mean of the base and refined maps and
+        'cbn_maps' is the refined map. In training mode: 'maps' is the
+        concatenation of refined, threshold and binary maps along dim 1,
+        with 'distance_maps' and 'cbn_maps' as separate entries.
+        ``targets`` is accepted for interface compatibility; unused.
+        """
+        shrink_maps, f = self.binarize(x, return_f=True)
+        base_maps = shrink_maps
+        # Upsample the features to the resolution of the shrink map.
+        cbn_maps = self.cbn_layer(self.up_conv(f), shrink_maps, None)
+        cbn_maps = torch.sigmoid(cbn_maps)
+        if not self.training:
+            return {'maps': 0.5 * (base_maps + cbn_maps), 'cbn_maps': cbn_maps}
+        threshold_maps = self.thresh(x)
+        binary_maps = self.step_function(shrink_maps, threshold_maps)
+        y = torch.cat([cbn_maps, threshold_maps, binary_maps], dim=1)
+        return {'maps': y, 'distance_maps': cbn_maps, 'cbn_maps': binary_maps}
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_east_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_east_head.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import math
+import os, sys
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+# import paddle
+# from paddle import nn
+# import paddle.nn.functional as F
+# from paddle import ParamAttr
+class ConvBNLayer(nn.Module):
+    """Bias-free Conv2d followed by BatchNorm2d and an optional activation.
+    Args:
+        in_channels, out_channels, kernel_size, stride, padding, groups:
+            forwarded to ``nn.Conv2d``.
+        if_act (bool): stored on the module; ``forward`` does not read it.
+        act (str): activation name handed to ``Activation``; None disables
+            the activation.
+        name: accepted for compatibility with the call sites; unused.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 groups=1,
+                 if_act=True,
+                 act=None,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+        self.if_act = if_act
+        self.conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias=False)
+        self.bn = nn.BatchNorm2d(out_channels)
+        self.act = act
+        if act is not None:
+            self._act = Activation(act)
+    def forward(self, x):
+        x = self.bn(self.conv(x))
+        # Whether to activate is decided by `act` alone.
+        return x if self.act is None else self._act(x)
+class EASTHead(nn.Module):
+    """
+    EAST detection head: two 3x3 ConvBNLayers followed by two 1x1 branches.
+    Args:
+        in_channels(int): channels of the input feature map.
+        model_name(str): "large" uses hidden widths 128 and 64; any other
+            value uses 64 and 32.
+        **kwargs: accepted and ignored.
+    """
+    def __init__(self, in_channels, model_name, **kwargs):
+        super(EASTHead, self).__init__()
+        self.model_name = model_name
+        # Widths of conv1, conv2, the score branch and the geometry branch.
+        if self.model_name == "large":
+            width1, width2, score_ch, geo_ch = 128, 64, 1, 8
+        else:
+            width1, width2, score_ch, geo_ch = 64, 32, 1, 8
+        self.det_conv1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=width1,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            if_act=True,
+            act='relu',
+            name="det_head1")
+        self.det_conv2 = ConvBNLayer(
+            in_channels=width1,
+            out_channels=width2,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            if_act=True,
+            act='relu',
+            name="det_head2")
+        self.score_conv = ConvBNLayer(
+            in_channels=width2,
+            out_channels=score_ch,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            if_act=False,
+            act=None,
+            name="f_score")
+        self.geo_conv = ConvBNLayer(
+            in_channels=width2,
+            out_channels=geo_ch,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            if_act=False,
+            act=None,
+            name="f_geo")
+    def forward(self, x):
+        """Return ``{'f_score': ..., 'f_geo': ...}``.
+        'f_score' is a sigmoid map; 'f_geo' is a sigmoid output rescaled
+        from (0, 1) to (-800, 800).
+        """
+        shared = self.det_conv2(self.det_conv1(x))
+        f_score = torch.sigmoid(self.score_conv(shared))
+        f_geo = (torch.sigmoid(self.geo_conv(shared)) - 0.5) * 2 * 800
+        return {'f_score': f_score, 'f_geo': f_geo}
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_fce_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_fce_head.py
+"""
+This code is adapted from:
+https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/dense_heads/fce_head.py
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# from paddle import nn
+# from paddle import ParamAttr
+# import paddle.nn.functional as F
+# from paddle.nn.initializer import Normal
+# import paddle
+from functools import partial
+def multi_apply(func, *args, **kwargs):
+    """Map ``func`` over ``args`` and regroup the per-call results by position.
+
+    ``func`` is called once per element (in lockstep across all iterables in
+    ``args``), with ``kwargs`` bound to every call. Each call is expected to
+    return a sequence; the i-th entries of all returns are gathered into the
+    i-th list of the result.
+
+    Returns:
+        A tuple of lists, one list per position in ``func``'s return value.
+        The tuple is empty when ``func`` is never called.
+    """
+    if kwargs:
+        func = partial(func, **kwargs)
+    per_call = map(func, *args)
+    return tuple(list(column) for column in zip(*per_call))
+class FCEHead(nn.Module):
+    """Head for FCENet.
+
+    FCENet (CVPR 2021): Fourier Contour Embedding for Arbitrary-shaped Text
+    Detection. [https://arxiv.org/abs/2104.10442]
+
+    Two 3x3 convolutions are applied to every input feature level: a
+    4-channel classification branch and a regression branch with
+    ``(2 * fourier_degree + 1) * 2`` channels.
+
+    Args:
+        in_channels (int): The number of input channels.
+        fourier_degree (int): The maximum Fourier transform degree k.
+    """
+    def __init__(self, in_channels, fourier_degree=5):
+        super().__init__()
+        assert isinstance(in_channels, int)
+        self.downsample_ratio = 1.0
+        self.in_channels = in_channels
+        self.fourier_degree = fourier_degree
+        self.out_channels_cls = 4
+        self.out_channels_reg = (2 * self.fourier_degree + 1) * 2
+        # Both branches share the same 3x3 / stride 1 / padding 1 geometry.
+        conv_cfg = dict(kernel_size=3, stride=1, padding=1, groups=1, bias=True)
+        self.out_conv_cls = nn.Conv2d(
+            self.in_channels, self.out_channels_cls, **conv_cfg)
+        self.out_conv_reg = nn.Conv2d(
+            self.in_channels, self.out_channels_reg, **conv_cfg)
+    def forward(self, feats, targets=None):
+        """Apply the head to every feature level in ``feats``.
+
+        In training mode the result is ``{'levels': [[cls, reg], ...]}`` with
+        the raw branch outputs. Otherwise it holds one ``'level_<i>'`` entry
+        per level: the channel-wise concatenation of softmax over cls
+        channels 0:2, softmax over cls channels 2:, and the raw regression
+        map. ``targets`` is not used.
+        """
+        cls_res, reg_res = multi_apply(self.forward_single, feats)
+        if self.training:
+            return {'levels': [[c, r] for c, r in zip(cls_res, reg_res)]}
+        outs = {}
+        for idx, (cls_map, reg_map) in enumerate(zip(cls_res, reg_res)):
+            tr_pred = F.softmax(cls_map[:, 0:2, :, :], dim=1)
+            tcl_pred = F.softmax(cls_map[:, 2:, :, :], dim=1)
+            outs['level_{}'.format(idx)] = torch.cat(
+                [tr_pred, tcl_pred, reg_map], dim=1)
+        return outs
+    def forward_single(self, x):
+        """Return ``(cls_predict, reg_predict)`` for a single feature map."""
+        return self.out_conv_cls(x), self.out_conv_reg(x)
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_pse_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_pse_head.py
+"""
+This code is adapted from:
+https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py
+"""
+# from paddle import nn
+from torch import nn
+class PSEHead(nn.Module):
+    """Two-convolution head returning its output under the ``'maps'`` key.
+
+    Pipeline: 3x3 conv -> batch norm -> ReLU -> 1x1 conv.
+
+    Args:
+        in_channels: Channel count of the incoming feature map.
+        hidden_dim: Channel count after the 3x3 convolution.
+        out_channels: Channel count of the returned map.
+        **kwargs: Accepted and ignored.
+    """
+    def __init__(self, in_channels, hidden_dim=256, out_channels=7, **kwargs):
+        super().__init__()
+        self.conv1 = nn.Conv2d(in_channels, hidden_dim, 3, stride=1, padding=1)
+        self.bn1 = nn.BatchNorm2d(hidden_dim)
+        self.relu1 = nn.ReLU()
+        self.conv2 = nn.Conv2d(hidden_dim, out_channels, 1, stride=1, padding=0)
+    def forward(self, x, **kwargs):
+        """Return ``{'maps': tensor}`` for feature map ``x``; extra kwargs are ignored."""
+        hidden = self.relu1(self.bn1(self.conv1(x)))
+        return {'maps': self.conv2(hidden)}
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_sast_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_sast_head.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import math
+import os, sys
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorchocr.modeling.common import Activation
+# import paddle
+# from paddle import nn
+# import paddle.nn.functional as F
+# from paddle import ParamAttr
+class ConvBNLayer(nn.Module):
+    """Bias-free ``nn.Conv2d`` followed by ``nn.BatchNorm2d`` and an optional activation.
+
+    The convolution padding is derived from the kernel as
+    ``(kernel_size - 1) // 2``.
+
+    Args:
+        in_channels: Input channel count passed to the convolution.
+        out_channels: Output channel count of the convolution and width of
+            the batch norm.
+        kernel_size: Convolution kernel size (an integer, since the padding
+            is computed from it arithmetically).
+        stride: Convolution stride.
+        groups: Convolution group count.
+        if_act: Stored on the instance as ``self.if_act``; ``forward`` does
+            not read it.
+        act: Activation spec handed to ``Activation``; ``None`` disables the
+            activation step.
+        name: Accepted but unused.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, stride,
+                 groups=1, if_act=True, act=None, name=None):
+        super().__init__()
+        self.if_act = if_act
+        half_pad = (kernel_size - 1) // 2
+        self.conv = nn.Conv2d(
+            in_channels, out_channels, kernel_size,
+            stride=stride, padding=half_pad, groups=groups, bias=False)
+        self.bn = nn.BatchNorm2d(out_channels)
+        self.act = act
+        # The activation submodule only exists when one was requested.
+        if self.act is not None:
+            self._act = Activation(act)
+    def forward(self, x):
+        """Run conv -> batch norm, then the activation if ``act`` was given."""
+        normed = self.bn(self.conv(x))
+        return normed if self.act is None else self._act(normed)
+class SAST_Header1(nn.Module):
+    """SAST head branch producing a 1-channel score map and a 4-channel border map.
+
+    Each output comes from its own stack of four ``ConvBNLayer`` modules
+    (kernel sizes 1, 3, 1, 3; widths 64, 64, 128, then the output width).
+
+    Args:
+        in_channels: Channel count of the incoming feature map.
+        **kwargs: Accepted and ignored.
+    """
+    def __init__(self, in_channels, **kwargs):
+        super().__init__()
+        mid_a, mid_b, mid_c = 64, 64, 128
+        score_layers = [
+            ConvBNLayer(in_channels, mid_a, 1, 1, act='relu', name='f_score1'),
+            ConvBNLayer(mid_a, mid_b, 3, 1, act='relu', name='f_score2'),
+            ConvBNLayer(mid_b, mid_c, 1, 1, act='relu', name='f_score3'),
+            ConvBNLayer(mid_c, 1, 3, 1, act=None, name='f_score4'),
+        ]
+        self.score_conv = nn.Sequential(*score_layers)
+        border_layers = [
+            ConvBNLayer(in_channels, mid_a, 1, 1, act='relu', name='f_border1'),
+            ConvBNLayer(mid_a, mid_b, 3, 1, act='relu', name='f_border2'),
+            ConvBNLayer(mid_b, mid_c, 1, 1, act='relu', name='f_border3'),
+            ConvBNLayer(mid_c, 4, 3, 1, act=None, name='f_border4'),
+        ]
+        self.border_conv = nn.Sequential(*border_layers)
+    def forward(self, x):
+        """Return ``(f_score, f_border)``; only the score map goes through a sigmoid."""
+        score_map = torch.sigmoid(self.score_conv(x))
+        border_map = self.border_conv(x)
+        return score_map, border_map
+class SAST_Header2(nn.Module):
+    """SAST head branch producing an 8-channel ``tvo`` map and a 2-channel ``tco`` map.
+
+    Each output comes from its own stack of four ``ConvBNLayer`` modules
+    (kernel sizes 1, 3, 1, 3; widths 64, 64, 128, then the output width).
+    No activation is applied to either output.
+
+    Args:
+        in_channels: Channel count of the incoming feature map.
+        **kwargs: Accepted and ignored.
+    """
+    def __init__(self, in_channels, **kwargs):
+        super().__init__()
+        mid_a, mid_b, mid_c = 64, 64, 128
+        tvo_layers = [
+            ConvBNLayer(in_channels, mid_a, 1, 1, act='relu', name='f_tvo1'),
+            ConvBNLayer(mid_a, mid_b, 3, 1, act='relu', name='f_tvo2'),
+            ConvBNLayer(mid_b, mid_c, 1, 1, act='relu', name='f_tvo3'),
+            ConvBNLayer(mid_c, 8, 3, 1, act=None, name='f_tvo4'),
+        ]
+        self.tvo_conv = nn.Sequential(*tvo_layers)
+        tco_layers = [
+            ConvBNLayer(in_channels, mid_a, 1, 1, act='relu', name='f_tco1'),
+            ConvBNLayer(mid_a, mid_b, 3, 1, act='relu', name='f_tco2'),
+            ConvBNLayer(mid_b, mid_c, 1, 1, act='relu', name='f_tco3'),
+            ConvBNLayer(mid_c, 2, 3, 1, act=None, name='f_tco4'),
+        ]
+        self.tco_conv = nn.Sequential(*tco_layers)
+    def forward(self, x):
+        """Return ``(f_tvo, f_tco)`` computed independently from ``x``."""
+        tvo_map = self.tvo_conv(x)
+        tco_map = self.tco_conv(x)
+        return tvo_map, tco_map
+class SASTHead(nn.Module):
+    """Combine ``SAST_Header1`` and ``SAST_Header2`` into one prediction dict.
+
+    Args:
+        in_channels: Channel count of the incoming feature map, forwarded to
+            both sub-heads.
+        **kwargs: Accepted and ignored.
+    """
+    def __init__(self, in_channels, **kwargs):
+        super().__init__()
+        self.head1 = SAST_Header1(in_channels)
+        self.head2 = SAST_Header2(in_channels)
+    def forward(self, x):
+        """Return a dict with keys ``f_score``, ``f_border``, ``f_tvo`` and ``f_tco``."""
+        score_map, border_map = self.head1(x)
+        tvo_map, tco_map = self.head2(x)
+        return {
+            'f_score': score_map,
+            'f_border': border_map,
+            'f_tvo': tvo_map,
+            'f_tco': tco_map,
+        }
\ No newline at end of file