"torchvision/csrc/ops/cuda/nms_kernel.cu" did not exist on "5066d7153373d9aae902c32bce1298944505edc8"
Commit 62b7582f authored by myhloli's avatar myhloli
Browse files

Merge remote-tracking branch 'origin/dev' into dev

parents 978ef41c 41f1fb8a
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function
import torch
import torch.nn.functional as F
from torch import nn
from ..common import Activation
# Per-stage block specs for PP-LCNetV3.
# Each entry: [kernel_size, in_channels, out_channels, stride, use_se].
# Detection variant: integer strides downsample height and width equally.
NET_CONFIG_det = {
    "blocks2":
    # k, in_c, out_c, s, use_se
    [[3, 16, 32, 1, False]],
    "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]],
    "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]],
    "blocks5": [
        [3, 128, 256, 2, False],
        [5, 256, 256, 1, False],
        [5, 256, 256, 1, False],
        [5, 256, 256, 1, False],
        [5, 256, 256, 1, False],
    ],
    "blocks6": [
        [5, 256, 512, 2, True],
        [5, 512, 512, 1, True],
        [5, 512, 512, 1, False],
        [5, 512, 512, 1, False],
    ],
}
# Recognition variant: tuple strides (h, w) downsample height while keeping
# horizontal resolution for the character sequence.
NET_CONFIG_rec = {
    "blocks2":
    # k, in_c, out_c, s, use_se
    [[3, 16, 32, 1, False]],
    "blocks3": [[3, 32, 64, 1, False], [3, 64, 64, 1, False]],
    "blocks4": [[3, 64, 128, (2, 1), False], [3, 128, 128, 1, False]],
    "blocks5": [
        [3, 128, 256, (1, 2), False],
        [5, 256, 256, 1, False],
        [5, 256, 256, 1, False],
        [5, 256, 256, 1, False],
        [5, 256, 256, 1, False],
    ],
    "blocks6": [
        [5, 256, 512, (2, 1), True],
        [5, 512, 512, 1, True],
        [5, 512, 512, (2, 1), False],
        [5, 512, 512, 1, False],
    ],
}
def make_divisible(v, divisor=16, min_value=None):
    """Round *v* to the nearest multiple of *divisor*.

    The result never drops below ``min_value`` (defaults to ``divisor``) and
    never rounds down by more than 10% of the original value.
    """
    floor = divisor if min_value is None else min_value
    rounded = int(v + divisor / 2) // divisor * divisor
    result = max(floor, rounded)
    # Guard against shrinking the channel count by more than 10%.
    if result < 0.9 * v:
        result += divisor
    return result
class LearnableAffineBlock(nn.Module):
    """Learnable scalar affine transform ``y = scale * x + bias``.

    ``lr_mult`` and ``lab_lr`` are accepted for config compatibility with the
    paddle original but are not used here.
    """

    def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.1):
        super().__init__()
        self.scale = nn.Parameter(torch.Tensor([scale_value]))
        self.bias = nn.Parameter(torch.Tensor([bias_value]))

    def forward(self, x):
        return x * self.scale + self.bias
class ConvBNLayer(nn.Module):
    """Bias-free Conv2d followed by BatchNorm2d ('same' padding for odd kernels).

    ``lr_mult`` is accepted for config compatibility but unused in the torch port.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, stride, groups=1, lr_mult=1.0
    ):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        return self.bn(self.conv(x))
class Act(nn.Module):
    """Activation ('hswish' or 'relu') followed by a LearnableAffineBlock."""

    def __init__(self, act="hswish", lr_mult=1.0, lab_lr=0.1):
        super().__init__()
        if act == "hswish":
            chosen = nn.Hardswish(inplace=True)
        else:
            # Only the two activations used by PP-LCNetV3 are supported.
            assert act == "relu"
            chosen = Activation(act)
        self.act = chosen
        self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr)

    def forward(self, x):
        activated = self.act(x)
        return self.lab(activated)
class LearnableRepLayer(nn.Module):
    """Re-parameterizable conv block from PP-LCNetV3.

    At train time it runs parallel branches: ``num_conv_branches`` kxk
    Conv+BN branches, an optional 1x1 Conv+BN branch (when kernel_size > 1),
    and an optional pure-BN identity branch (when shapes permit a residual).
    ``rep()`` fuses all branches into a single conv for inference/export.

    Fix vs. the paddle port: ``nn.BatchNorm2d`` exposes ``running_mean`` /
    ``running_var`` / ``eps`` — the paddle attribute names ``_mean`` /
    ``_variance`` / ``_epsilon`` raised AttributeError in ``_fuse_bn_tensor``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        groups=1,
        num_conv_branches=1,
        lr_mult=1.0,
        lab_lr=0.1,
    ):
        super().__init__()
        self.is_repped = False
        self.groups = groups
        self.stride = stride
        self.kernel_size = kernel_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_conv_branches = num_conv_branches
        self.padding = (kernel_size - 1) // 2
        # Identity (pure-BN) branch exists only when a residual is shape-legal.
        self.identity = (
            nn.BatchNorm2d(num_features=in_channels)
            if out_channels == in_channels and stride == 1
            else None
        )
        self.conv_kxk = nn.ModuleList(
            [
                ConvBNLayer(
                    in_channels,
                    out_channels,
                    kernel_size,
                    stride,
                    groups=groups,
                    lr_mult=lr_mult,
                )
                for _ in range(self.num_conv_branches)
            ]
        )
        # A separate 1x1 branch is redundant when the main kernel is 1x1.
        self.conv_1x1 = (
            ConvBNLayer(
                in_channels, out_channels, 1, stride, groups=groups, lr_mult=lr_mult
            )
            if kernel_size > 1
            else None
        )
        self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr)
        self.act = Act(lr_mult=lr_mult, lab_lr=lab_lr)

    def forward(self, x):
        # Fast path after rep(): a single fused conv (for export/inference).
        if self.is_repped:
            out = self.lab(self.reparam_conv(x))
            if self.stride != 2:
                out = self.act(out)
            return out
        out = 0
        if self.identity is not None:
            out += self.identity(x)
        if self.conv_1x1 is not None:
            out += self.conv_1x1(x)
        for conv in self.conv_kxk:
            out += conv(x)
        out = self.lab(out)
        # Down-sampling blocks skip the activation (matches the paddle model).
        if self.stride != 2:
            out = self.act(out)
        return out

    def rep(self):
        """Fuse all branches into ``self.reparam_conv``; idempotent."""
        if self.is_repped:
            return
        kernel, bias = self._get_kernel_bias()
        self.reparam_conv = nn.Conv2d(
            in_channels=self.in_channels,
            out_channels=self.out_channels,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
            groups=self.groups,
        )
        self.reparam_conv.weight.data = kernel
        self.reparam_conv.bias.data = bias
        self.is_repped = True

    def _pad_kernel_1x1_to_kxk(self, kernel1x1, pad):
        # A non-tensor (the scalar 0 placeholder) means "branch absent".
        if not isinstance(kernel1x1, torch.Tensor):
            return 0
        return nn.functional.pad(kernel1x1, [pad, pad, pad, pad])

    def _get_kernel_bias(self):
        """Sum the fused (kernel, bias) contributions of every branch."""
        kernel_conv_1x1, bias_conv_1x1 = self._fuse_bn_tensor(self.conv_1x1)
        kernel_conv_1x1 = self._pad_kernel_1x1_to_kxk(
            kernel_conv_1x1, self.kernel_size // 2
        )
        kernel_identity, bias_identity = self._fuse_bn_tensor(self.identity)
        kernel_conv_kxk = 0
        bias_conv_kxk = 0
        for conv in self.conv_kxk:
            kernel, bias = self._fuse_bn_tensor(conv)
            kernel_conv_kxk += kernel
            bias_conv_kxk += bias
        kernel_reparam = kernel_conv_kxk + kernel_conv_1x1 + kernel_identity
        bias_reparam = bias_conv_kxk + bias_conv_1x1 + bias_identity
        return kernel_reparam, bias_reparam

    def _fuse_bn_tensor(self, branch):
        """Fold a Conv+BN branch (or identity BN) into equivalent conv weights.

        Returns ``(0, 0)`` for an absent branch so sums stay well-defined.
        """
        if not branch:
            return 0, 0
        elif isinstance(branch, ConvBNLayer):
            kernel = branch.conv.weight
            # torch attribute names (paddle used _mean/_variance/_epsilon).
            running_mean = branch.bn.running_mean
            running_var = branch.bn.running_var
            gamma = branch.bn.weight
            beta = branch.bn.bias
            eps = branch.bn.eps
        else:
            assert isinstance(branch, nn.BatchNorm2d)
            if not hasattr(self, "id_tensor"):
                # Build a one-hot "identity" kernel once, on the BN's device.
                input_dim = self.in_channels // self.groups
                kernel_value = torch.zeros(
                    (self.in_channels, input_dim, self.kernel_size, self.kernel_size),
                    dtype=branch.weight.dtype,
                    device=branch.weight.device,
                )
                for i in range(self.in_channels):
                    kernel_value[
                        i, i % input_dim, self.kernel_size // 2, self.kernel_size // 2
                    ] = 1
                self.id_tensor = kernel_value
            kernel = self.id_tensor
            running_mean = branch.running_mean
            running_var = branch.running_var
            gamma = branch.weight
            beta = branch.bias
            eps = branch.eps
        # Standard BN folding: W' = W * gamma/std, b' = beta - mean*gamma/std.
        std = (running_var + eps).sqrt()
        t = (gamma / std).reshape((-1, 1, 1, 1))
        return kernel * t, beta - running_mean * gamma / std
class SELayer(nn.Module):
    """Squeeze-and-excitation channel gate (GAP -> 1x1 -> ReLU -> 1x1 -> hardsigmoid).

    ``lr_mult`` is accepted for config compatibility but unused here.
    """

    def __init__(self, channel, reduction=4, lr_mult=1.0):
        super().__init__()
        squeezed = channel // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(
            in_channels=channel,
            out_channels=squeezed,
            kernel_size=1,
            stride=1,
            padding=0,
        )
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(
            in_channels=squeezed,
            out_channels=channel,
            kernel_size=1,
            stride=1,
            padding=0,
        )
        self.hardsigmoid = nn.Hardsigmoid(inplace=True)

    def forward(self, x):
        gate = self.avg_pool(x)
        gate = self.relu(self.conv1(gate))
        gate = self.hardsigmoid(self.conv2(gate))
        # Per-channel rescaling of the input.
        return x * gate
class LCNetV3Block(nn.Module):
    """Depthwise LearnableRepLayer (optionally SE-gated) + pointwise LearnableRepLayer."""

    def __init__(
        self,
        in_channels,
        out_channels,
        stride,
        dw_size,
        use_se=False,
        conv_kxk_num=4,
        lr_mult=1.0,
        lab_lr=0.1,
    ):
        super().__init__()
        self.use_se = use_se
        # Depthwise: groups == channels, spatial kernel dw_size.
        self.dw_conv = LearnableRepLayer(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=dw_size,
            stride=stride,
            groups=in_channels,
            num_conv_branches=conv_kxk_num,
            lr_mult=lr_mult,
            lab_lr=lab_lr,
        )
        if use_se:
            self.se = SELayer(in_channels, lr_mult=lr_mult)
        # Pointwise: 1x1 projection to out_channels.
        self.pw_conv = LearnableRepLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            num_conv_branches=conv_kxk_num,
            lr_mult=lr_mult,
            lab_lr=lab_lr,
        )

    def forward(self, x):
        y = self.dw_conv(x)
        if self.use_se:
            y = self.se(y)
        return self.pw_conv(y)
class PPLCNetV3(nn.Module):
    """PP-LCNetV3 backbone (det and rec variants share the code path).

    Args:
        scale: channel-width multiplier applied through make_divisible.
        conv_kxk_num: number of parallel kxk branches per LearnableRepLayer.
        lr_mult_list: six per-stage lr multipliers; None means all 1.0.
            (Changed from a mutable list default to None — same effective value.)
        lab_lr: LearnableAffineBlock lr factor (config passthrough).
        det: True selects NET_CONFIG_det and multi-scale outputs.
    """

    def __init__(
        self,
        scale=1.0,
        conv_kxk_num=4,
        lr_mult_list=None,
        lab_lr=0.1,
        det=False,
        **kwargs
    ):
        super().__init__()
        self.scale = scale
        # Avoid the shared-mutable-default pitfall; None means "all stages 1.0".
        self.lr_mult_list = [1.0] * 6 if lr_mult_list is None else lr_mult_list
        self.det = det
        self.net_config = NET_CONFIG_det if self.det else NET_CONFIG_rec
        assert isinstance(
            self.lr_mult_list, (list, tuple)
        ), "lr_mult_list should be in (list, tuple) but got {}".format(
            type(self.lr_mult_list)
        )
        assert (
            len(self.lr_mult_list) == 6
        ), "lr_mult_list length should be 6 but got {}".format(len(self.lr_mult_list))
        self.conv1 = ConvBNLayer(
            in_channels=3,
            out_channels=make_divisible(16 * scale),
            kernel_size=3,
            stride=2,
            lr_mult=self.lr_mult_list[0],
        )
        # The five stages differ only in their config table and lr multiplier.
        self.blocks2 = self._make_stage("blocks2", conv_kxk_num, self.lr_mult_list[1], lab_lr)
        self.blocks3 = self._make_stage("blocks3", conv_kxk_num, self.lr_mult_list[2], lab_lr)
        self.blocks4 = self._make_stage("blocks4", conv_kxk_num, self.lr_mult_list[3], lab_lr)
        self.blocks5 = self._make_stage("blocks5", conv_kxk_num, self.lr_mult_list[4], lab_lr)
        self.blocks6 = self._make_stage("blocks6", conv_kxk_num, self.lr_mult_list[5], lab_lr)
        self.out_channels = make_divisible(512 * scale)
        if self.det:
            mv_c = [16, 24, 56, 480]
            self.out_channels = [
                make_divisible(self.net_config["blocks3"][-1][2] * scale),
                make_divisible(self.net_config["blocks4"][-1][2] * scale),
                make_divisible(self.net_config["blocks5"][-1][2] * scale),
                make_divisible(self.net_config["blocks6"][-1][2] * scale),
            ]
            # 1x1 projections shrinking each tap to the detector's widths.
            self.layer_list = nn.ModuleList(
                [
                    nn.Conv2d(in_c, int(c * scale), 1, 1, 0)
                    for in_c, c in zip(self.out_channels, mv_c)
                ]
            )
            self.out_channels = [int(c * scale) for c in mv_c]

    def _make_stage(self, name, conv_kxk_num, lr_mult, lab_lr):
        """Build one nn.Sequential stage from self.net_config[name]."""
        return nn.Sequential(
            *[
                LCNetV3Block(
                    in_channels=make_divisible(in_c * self.scale),
                    out_channels=make_divisible(out_c * self.scale),
                    dw_size=k,
                    stride=s,
                    use_se=se,
                    conv_kxk_num=conv_kxk_num,
                    lr_mult=lr_mult,
                    lab_lr=lab_lr,
                )
                for k, in_c, out_c, s, se in self.net_config[name]
            ]
        )

    def forward(self, x):
        out_list = []
        x = self.conv1(x)
        x = self.blocks2(x)
        x = self.blocks3(x)
        out_list.append(x)
        x = self.blocks4(x)
        out_list.append(x)
        x = self.blocks5(x)
        out_list.append(x)
        x = self.blocks6(x)
        out_list.append(x)
        if self.det:
            # Project the four taps for the detection FPN.
            out_list[0] = self.layer_list[0](out_list[0])
            out_list[1] = self.layer_list[1](out_list[1])
            out_list[2] = self.layer_list[2](out_list[2])
            out_list[3] = self.layer_list[3](out_list[3])
            return out_list
        if self.training:
            x = F.adaptive_avg_pool2d(x, [1, 40])
        else:
            x = F.avg_pool2d(x, [3, 2])
        return x
from torch import nn
from .det_mobilenet_v3 import ConvBNLayer, ResidualUnit, make_divisible
class MobileNetV3(nn.Module):
    """MobileNetV3 backbone for text recognition.

    Strides in the per-block config are tuples (h, 1) so width (the character
    axis) is preserved while height is reduced.
    """

    def __init__(
        self,
        in_channels=3,
        model_name="small",
        scale=0.5,
        large_stride=None,
        small_stride=None,
        **kwargs
    ):
        super(MobileNetV3, self).__init__()
        small_stride = [2, 2, 2, 2] if small_stride is None else small_stride
        large_stride = [1, 2, 2, 2] if large_stride is None else large_stride
        assert isinstance(large_stride, list), (
            "large_stride type must be list but got {}".format(type(large_stride))
        )
        assert isinstance(small_stride, list), (
            "small_stride type must be list but got {}".format(type(small_stride))
        )
        assert len(large_stride) == 4, (
            "large_stride length must be 4 but got {}".format(len(large_stride))
        )
        assert len(small_stride) == 4, (
            "small_stride length must be 4 but got {}".format(len(small_stride))
        )
        if model_name == "large":
            cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, False, "relu", large_stride[0]],
                [3, 64, 24, False, "relu", (large_stride[1], 1)],
                [3, 72, 24, False, "relu", 1],
                [5, 72, 40, True, "relu", (large_stride[2], 1)],
                [5, 120, 40, True, "relu", 1],
                [5, 120, 40, True, "relu", 1],
                [3, 240, 80, False, "hard_swish", 1],
                [3, 200, 80, False, "hard_swish", 1],
                [3, 184, 80, False, "hard_swish", 1],
                [3, 184, 80, False, "hard_swish", 1],
                [3, 480, 112, True, "hard_swish", 1],
                [3, 672, 112, True, "hard_swish", 1],
                [5, 672, 160, True, "hard_swish", (large_stride[3], 1)],
                [5, 960, 160, True, "hard_swish", 1],
                [5, 960, 160, True, "hard_swish", 1],
            ]
            cls_ch_squeeze = 960
        elif model_name == "small":
            cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, True, "relu", (small_stride[0], 1)],
                [3, 72, 24, False, "relu", (small_stride[1], 1)],
                [3, 88, 24, False, "relu", 1],
                [5, 96, 40, True, "hard_swish", (small_stride[2], 1)],
                [5, 240, 40, True, "hard_swish", 1],
                [5, 240, 40, True, "hard_swish", 1],
                [5, 120, 48, True, "hard_swish", 1],
                [5, 144, 48, True, "hard_swish", 1],
                [5, 288, 96, True, "hard_swish", (small_stride[3], 1)],
                [5, 576, 96, True, "hard_swish", 1],
                [5, 576, 96, True, "hard_swish", 1],
            ]
            cls_ch_squeeze = 576
        else:
            raise NotImplementedError(
                "mode[" + model_name + "_model] is not implemented!"
            )
        supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
        assert scale in supported_scale, (
            "supported scales are {} but input scale is {}".format(
                supported_scale, scale
            )
        )
        # Stem conv.
        inplanes = make_divisible(16 * scale)
        self.conv1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=inplanes,
            kernel_size=3,
            stride=2,
            padding=1,
            groups=1,
            if_act=True,
            act="hard_swish",
            name="conv1",
        )
        blocks = []
        for idx, (k, exp, c, se, nl, s) in enumerate(cfg):
            blocks.append(
                ResidualUnit(
                    in_channels=inplanes,
                    mid_channels=make_divisible(scale * exp),
                    out_channels=make_divisible(scale * c),
                    kernel_size=k,
                    stride=s,
                    use_se=se,
                    act=nl,
                    name="conv" + str(idx + 2),
                )
            )
            inplanes = make_divisible(scale * c)
        self.blocks = nn.Sequential(*blocks)
        self.conv2 = ConvBNLayer(
            in_channels=inplanes,
            out_channels=make_divisible(scale * cls_ch_squeeze),
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            if_act=True,
            act="hard_swish",
            name="conv_last",
        )
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.out_channels = make_divisible(scale * cls_ch_squeeze)

    def forward(self, x):
        x = self.conv1(x)
        x = self.blocks(x)
        x = self.conv2(x)
        return self.pool(x)
import os, sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from ..common import Activation
class ConvBNLayer(nn.Module):
    """Conv2d -> BatchNorm2d -> optional activation (paddle-style argument names).

    ``channels`` is accepted for config compatibility but unused.
    """

    def __init__(self,
                 num_channels,
                 filter_size,
                 num_filters,
                 stride,
                 padding,
                 channels=None,
                 num_groups=1,
                 act='hard_swish'):
        super(ConvBNLayer, self).__init__()
        self.act = act
        self._conv = nn.Conv2d(
            num_channels,
            num_filters,
            filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            bias=False)
        self._batch_norm = nn.BatchNorm2d(num_filters)
        if act is not None:
            self._act = Activation(act_type=act, inplace=True)

    def forward(self, inputs):
        out = self._batch_norm(self._conv(inputs))
        if self.act is not None:
            out = self._act(out)
        return out
class DepthwiseSeparable(nn.Module):
    """Depthwise conv (optionally SE-gated) followed by a 1x1 pointwise conv."""

    def __init__(self,
                 num_channels,
                 num_filters1,
                 num_filters2,
                 num_groups,
                 stride,
                 scale,
                 dw_size=3,
                 padding=1,
                 use_se=False):
        super(DepthwiseSeparable, self).__init__()
        self.use_se = use_se
        dw_out = int(num_filters1 * scale)
        self._depthwise_conv = ConvBNLayer(
            num_channels=num_channels,
            num_filters=dw_out,
            filter_size=dw_size,
            stride=stride,
            padding=padding,
            num_groups=int(num_groups * scale))
        if use_se:
            self._se = SEModule(dw_out)
        self._pointwise_conv = ConvBNLayer(
            num_channels=dw_out,
            filter_size=1,
            num_filters=int(num_filters2 * scale),
            stride=1,
            padding=0)

    def forward(self, inputs):
        out = self._depthwise_conv(inputs)
        if self.use_se:
            out = self._se(out)
        return self._pointwise_conv(out)
class MobileNetV1Enhance(nn.Module):
    """Enhanced MobileNetV1 backbone for recognition.

    Later stages use (2, 1) strides so only height is reduced; the last two
    stages are SE-gated 5x5 depthwise blocks.
    """

    def __init__(self,
                 in_channels=3,
                 scale=0.5,
                 last_conv_stride=1,
                 last_pool_type='max',
                 **kwargs):
        super().__init__()
        self.scale = scale
        self.conv1 = ConvBNLayer(
            num_channels=in_channels,
            filter_size=3,
            channels=3,
            num_filters=int(32 * scale),
            stride=2,
            padding=1)
        # (in_c, filters1, filters2, groups, stride, dw_size, padding, use_se)
        stage_cfg = [
            (32, 32, 64, 32, 1, 3, 1, False),
            (64, 64, 128, 64, 1, 3, 1, False),
            (128, 128, 128, 128, 1, 3, 1, False),
            (128, 128, 256, 128, (2, 1), 3, 1, False),
            (256, 256, 256, 256, 1, 3, 1, False),
            (256, 256, 512, 256, (2, 1), 3, 1, False),
        ]
        stage_cfg += [(512, 512, 512, 512, 1, 5, 2, False)] * 5
        stage_cfg += [
            (512, 512, 1024, 512, (2, 1), 5, 2, True),
            (1024, 1024, 1024, 1024, last_conv_stride, 5, 2, True),
        ]
        blocks = []
        for in_c, f1, f2, groups, stride, dw, pad, se in stage_cfg:
            blocks.append(
                DepthwiseSeparable(
                    num_channels=int(in_c * scale),
                    num_filters1=f1,
                    num_filters2=f2,
                    num_groups=groups,
                    stride=stride,
                    dw_size=dw,
                    padding=pad,
                    scale=scale,
                    use_se=se))
        self.block_list = nn.Sequential(*blocks)
        if last_pool_type == 'avg':
            self.pool = nn.AvgPool2d(kernel_size=2, stride=2, padding=0)
        else:
            self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.out_channels = int(1024 * scale)

    def forward(self, inputs):
        out = self.conv1(inputs)
        out = self.block_list(out)
        return self.pool(out)
def hardsigmoid(x):
    """Piecewise-linear sigmoid: clamp(x + 3, 0, 6) / 6 (same values as relu6 form)."""
    return torch.clamp(x + 3.0, min=0.0, max=6.0) / 6.0
class SEModule(nn.Module):
    """Squeeze-and-excitation gate using the module-level hardsigmoid()."""

    def __init__(self, channel, reduction=4):
        super(SEModule, self).__init__()
        mid = channel // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(
            in_channels=channel,
            out_channels=mid,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)
        self.conv2 = nn.Conv2d(
            in_channels=mid,
            out_channels=channel,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

    def forward(self, inputs):
        gate = self.avg_pool(inputs)
        gate = F.relu(self.conv1(gate))
        gate = hardsigmoid(self.conv2(gate))
        # Channel-wise rescaling of the input.
        return torch.mul(inputs, gate)
import torch
import torch.nn.functional as F
from torch import nn
class Hswish(nn.Module):
    """Hard-swish: x * relu6(x + 3) / 6."""

    def __init__(self, inplace=True):
        super(Hswish, self).__init__()
        self.inplace = inplace

    def forward(self, x):
        gate = F.relu6(x + 3.0, inplace=self.inplace) / 6.0
        return x * gate
# out = max(0, min(1, slop*x+offset))
# paddle.fluid.layers.hard_sigmoid(x, slope=0.2, offset=0.5, name=None)
class Hsigmoid(nn.Module):
    """Paddle-flavored hard sigmoid: relu6(1.2 * x + 3) / 6.

    Note the 1.2 slope factor — this deliberately matches paddle's
    hard_sigmoid, not torch's nn.Hardsigmoid.
    """

    def __init__(self, inplace=True):
        super(Hsigmoid, self).__init__()
        self.inplace = inplace

    def forward(self, x):
        return F.relu6(1.2 * x + 3.0, inplace=self.inplace) / 6.0
class GELU(nn.Module):
    """GELU wrapper; ``inplace`` is accepted for API symmetry but has no effect."""

    def __init__(self, inplace=True):
        super(GELU, self).__init__()
        self.inplace = inplace

    def forward(self, x):
        return F.gelu(x)
class Swish(nn.Module):
    """Swish / SiLU: x * sigmoid(x).

    With ``inplace=True`` the input tensor is mutated in place and returned.
    """

    def __init__(self, inplace=True):
        super(Swish, self).__init__()
        self.inplace = inplace

    def forward(self, x):
        if not self.inplace:
            return x * torch.sigmoid(x)
        x.mul_(torch.sigmoid(x))
        return x
class Activation(nn.Module):
    """Factory wrapper mapping a lowercase act-type string to a module.

    'sigmoid' is intentionally unsupported (matches the paddle port); unknown
    names also raise NotImplementedError.
    """

    def __init__(self, act_type, inplace=True):
        super(Activation, self).__init__()
        act_type = act_type.lower()
        factories = {
            "relu": lambda: nn.ReLU(inplace=inplace),
            "relu6": lambda: nn.ReLU6(inplace=inplace),
            # Hsigmoid kept over nn.Hardsigmoid for paddle-compatible slope.
            "hard_sigmoid": lambda: Hsigmoid(inplace),
            "hard_swish": lambda: Hswish(inplace=inplace),
            "hswish": lambda: Hswish(inplace=inplace),
            "leakyrelu": lambda: nn.LeakyReLU(inplace=inplace),
            "gelu": lambda: GELU(inplace=inplace),
            "swish": lambda: Swish(inplace=inplace),
        }
        if act_type == "sigmoid" or act_type not in factories:
            raise NotImplementedError
        self.act = factories[act_type]()

    def forward(self, inputs):
        return self.act(inputs)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ["build_head"]
def build_head(config, **kwargs):
# det head
from .det_db_head import DBHead, PFHeadLocal
# rec head
from .rec_ctc_head import CTCHead
from .rec_multi_head import MultiHead
# cls head
from .cls_head import ClsHead
support_dict = [
"DBHead",
"CTCHead",
"ClsHead",
"MultiHead",
"PFHeadLocal",
]
module_name = config.pop("name")
char_num = config.pop("char_num", 6625)
assert module_name in support_dict, Exception(
"head only support {}".format(support_dict)
)
module_class = eval(module_name)(**config, **kwargs)
return module_class
import torch
import torch.nn.functional as F
from torch import nn
class ClsHead(nn.Module):
    """
    Class orientation
    Args:
        params(dict): super parameters for build Class network
    """

    def __init__(self, in_channels, class_dim, **kwargs):
        super(ClsHead, self).__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(in_channels, class_dim, bias=True)

    def forward(self, x):
        # Global average pool then flatten to (B, C).
        pooled = self.pool(x)
        flat = torch.reshape(pooled, shape=[pooled.shape[0], pooled.shape[1]])
        logits = self.fc(flat)
        return F.softmax(logits, dim=1)
import torch
import torch.nn as nn
import torch.nn.functional as F
from ..common import Activation
from ..backbones.det_mobilenet_v3 import ConvBNLayer
class Head(nn.Module):
    """DB prediction tower: conv-BN-ReLU, two stride-2 deconv-BN-ReLU blocks,
    then a final deconv + sigmoid producing a single-channel map.

    With ``return_f=True`` the pre-final feature map is returned as well.
    """

    def __init__(self, in_channels, **kwargs):
        super(Head, self).__init__()
        mid = in_channels // 4
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=mid,
            kernel_size=3,
            padding=1,
            bias=False)
        self.conv_bn1 = nn.BatchNorm2d(mid)
        self.relu1 = Activation(act_type='relu')
        self.conv2 = nn.ConvTranspose2d(
            in_channels=mid,
            out_channels=mid,
            kernel_size=2,
            stride=2)
        self.conv_bn2 = nn.BatchNorm2d(mid)
        self.relu2 = Activation(act_type='relu')
        self.conv3 = nn.ConvTranspose2d(
            in_channels=mid,
            out_channels=1,
            kernel_size=2,
            stride=2)

    def forward(self, x, return_f=False):
        x = self.relu1(self.conv_bn1(self.conv1(x)))
        x = self.relu2(self.conv_bn2(self.conv2(x)))
        feat = x if return_f is True else None
        x = torch.sigmoid(self.conv3(x))
        if return_f is True:
            return x, feat
        return x
class DBHead(nn.Module):
    """
    Differentiable Binarization (DB) for text detection:
    see https://arxiv.org/abs/1911.08947
    args:
        params(dict): super parameters for build DB network
    """

    def __init__(self, in_channels, k=50, **kwargs):
        super(DBHead, self).__init__()
        self.k = k
        # NOTE: the paddle parameter-name lists (conv2d_56, batch_norm_47, ...)
        # were dead locals in the port and have been removed.
        self.binarize = Head(in_channels, **kwargs)
        self.thresh = Head(in_channels, **kwargs)

    def step_function(self, x, y):
        # Differentiable step approximation: 1 / (1 + exp(-k * (x - y))).
        return torch.reciprocal(1 + torch.exp(-self.k * (x - y)))

    def forward(self, x):
        # Inference path only needs the shrink (probability) map.
        shrink_maps = self.binarize(x)
        return {'maps': shrink_maps}
class LocalModule(nn.Module):
    """Refinement branch that fuses an upsampled feature map with the initial
    shrink map via conv-BN-ReLU + 1x1 conv.

    ``use_distance`` and the forward's ``distance_map`` are accepted for
    interface compatibility but are unused here.
    """

    def __init__(self, in_c, mid_c, use_distance=True):
        # Zero-argument super() is subclass-safe; the original
        # super(self.__class__, self) recurses infinitely when subclassed.
        super().__init__()
        self.last_3 = ConvBNLayer(in_c + 1, mid_c, 3, 1, 1, act='relu')
        self.last_1 = nn.Conv2d(mid_c, 1, 1, 1, 0)

    def forward(self, x, init_map, distance_map):
        # Concatenate the probability map as an extra input channel.
        fused = torch.cat([init_map, x], dim=1)
        return self.last_1(self.last_3(fused))
class PFHeadLocal(DBHead):
    """DB head variant that refines the shrink map with a local (CBN) branch
    and averages the two maps.
    """

    def __init__(self, in_channels, k=50, mode='small', **kwargs):
        super(PFHeadLocal, self).__init__(in_channels, k, **kwargs)
        self.mode = mode
        self.up_conv = nn.Upsample(scale_factor=2, mode="nearest")
        if self.mode == 'large':
            self.cbn_layer = LocalModule(in_channels // 4, in_channels // 4)
        elif self.mode == 'small':
            self.cbn_layer = LocalModule(in_channels // 4, in_channels // 8)

    def forward(self, x, targets=None):
        shrink_maps, f = self.binarize(x, return_f=True)
        base_maps = shrink_maps
        cbn_maps = self.cbn_layer(self.up_conv(f), shrink_maps, None)
        # torch.sigmoid replaces the deprecated F.sigmoid alias (same math).
        cbn_maps = torch.sigmoid(cbn_maps)
        return {'maps': 0.5 * (base_maps + cbn_maps), 'cbn_maps': cbn_maps}
import torch.nn.functional as F
from torch import nn
class CTCHead(nn.Module):
    """CTC classification head: one FC (or FC->FC via ``mid_channels``).

    In eval mode the logits are softmaxed and returned alone; in train mode
    ``return_feats=True`` yields ``(features, logits)``.
    ``fc_decay`` is accepted for config compatibility but unused here.
    """

    def __init__(
        self,
        in_channels,
        out_channels=6625,
        fc_decay=0.0004,
        mid_channels=None,
        return_feats=False,
        **kwargs
    ):
        super(CTCHead, self).__init__()
        if mid_channels is None:
            self.fc = nn.Linear(in_channels, out_channels, bias=True)
        else:
            self.fc1 = nn.Linear(in_channels, mid_channels, bias=True)
            self.fc2 = nn.Linear(mid_channels, out_channels, bias=True)
        self.out_channels = out_channels
        self.mid_channels = mid_channels
        self.return_feats = return_feats

    def forward(self, x, labels=None):
        if self.mid_channels is None:
            predicts = self.fc(x)
        else:
            x = self.fc1(x)
            predicts = self.fc2(x)
        result = (x, predicts) if self.return_feats else predicts
        if not self.training:
            # Inference: probabilities only (features are dropped).
            predicts = F.softmax(predicts, dim=2)
            result = predicts
        return result
from torch import nn
from ..necks.rnn import Im2Seq, SequenceEncoder
from .rec_ctc_head import CTCHead
class FCTranspose(nn.Module):
    """Permute (B, C, T) -> (B, T, C), optionally followed by a bias-free Linear."""

    def __init__(self, in_channels, out_channels, only_transpose=False):
        super().__init__()
        self.only_transpose = only_transpose
        if not self.only_transpose:
            self.fc = nn.Linear(in_channels, out_channels, bias=False)

    def forward(self, x):
        transposed = x.permute([0, 2, 1])
        return transposed if self.only_transpose else self.fc(transposed)
class MultiHead(nn.Module):
    """CTC-centric multi-head wrapper.

    Only the CTC branch is materialized; SAR/NRTR entries in ``head_list``
    are tolerated but not instantiated, and forward runs the CTC path only.
    """

    def __init__(self, in_channels, out_channels_list, **kwargs):
        super().__init__()
        self.head_list = kwargs.pop("head_list")
        self.gtc_head = "sar"
        assert len(self.head_list) >= 2
        for idx, head_cfg in enumerate(self.head_list):
            name = list(head_cfg)[0]
            if name in ("SARHead", "NRTRHead"):
                # GTC branches appear in configs but are not built here.
                continue
            if name != "CTCHead":
                raise NotImplementedError(f"{name} is not supported in MultiHead yet")
            # --- CTC neck ---
            self.encoder_reshape = Im2Seq(in_channels)
            neck_args = self.head_list[idx][name]["Neck"]
            encoder_type = neck_args.pop("name")
            self.ctc_encoder = SequenceEncoder(
                in_channels=in_channels, encoder_type=encoder_type, **neck_args
            )
            # --- CTC head ---
            head_args = self.head_list[idx][name].get("Head", {}) or {}
            self.ctc_head = CTCHead(
                in_channels=self.ctc_encoder.out_channels,
                out_channels=out_channels_list["CTCLabelDecode"],
                **head_args,
            )

    def forward(self, x, data=None):
        encoded = self.ctc_encoder(x)
        return self.ctc_head(encoded)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ["build_neck"]
def build_neck(config):
from .db_fpn import DBFPN, LKPAN, RSEFPN
from .rnn import SequenceEncoder
support_dict = ["DBFPN", "SequenceEncoder", "RSEFPN", "LKPAN"]
module_name = config.pop("name")
assert module_name in support_dict, Exception(
"neck only support {}".format(support_dict)
)
module_class = eval(module_name)(**config)
return module_class
from torch import nn
class IntraCLBlock(nn.Module):
    """Residual refinement block: channels are reduced by ``reduce_factor``,
    passed through cascaded 7/5/3 stages — each summing a square (kxk),
    vertical (kx1) and horizontal (1xk) conv — then restored and added back.
    """

    def __init__(self, in_channels=96, reduce_factor=4):
        super(IntraCLBlock, self).__init__()
        self.channels = in_channels
        self.rf = reduce_factor
        reduced = self.channels // self.rf

        def conv(kernel_size, padding):
            # All inner convs share the reduced width and unit stride.
            return nn.Conv2d(
                reduced,
                reduced,
                kernel_size=kernel_size,
                stride=(1, 1),
                padding=padding,
            )

        self.conv1x1_reduce_channel = nn.Conv2d(
            self.channels, reduced, kernel_size=1, stride=1, padding=0
        )
        self.conv1x1_return_channel = nn.Conv2d(
            reduced, self.channels, kernel_size=1, stride=1, padding=0
        )
        # Vertical (kx1) branches.
        self.v_layer_7x1 = conv((7, 1), (3, 0))
        self.v_layer_5x1 = conv((5, 1), (2, 0))
        self.v_layer_3x1 = conv((3, 1), (1, 0))
        # Horizontal (1xk) branches.
        self.q_layer_1x7 = conv((1, 7), (0, 3))
        self.q_layer_1x5 = conv((1, 5), (0, 2))
        self.q_layer_1x3 = conv((1, 3), (0, 1))
        # Square (kxk) base branches.
        self.c_layer_7x7 = conv((7, 7), (3, 3))
        self.c_layer_5x5 = conv((5, 5), (2, 2))
        self.c_layer_3x3 = conv((3, 3), (1, 1))
        self.bn = nn.BatchNorm2d(self.channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        reduced = self.conv1x1_reduce_channel(x)
        y = self.c_layer_7x7(reduced) + self.v_layer_7x1(reduced) + self.q_layer_1x7(reduced)
        y = self.c_layer_5x5(y) + self.v_layer_5x1(y) + self.q_layer_1x5(y)
        y = self.c_layer_3x3(y) + self.v_layer_3x1(y) + self.q_layer_1x3(y)
        relation = self.relu(self.bn(self.conv1x1_return_channel(y)))
        # Residual connection back onto the input.
        return x + relation
def build_intraclblock_list(num_block):
    """Return an nn.ModuleList of *num_block* default-configured IntraCLBlocks."""
    return nn.ModuleList(IntraCLBlock() for _ in range(num_block))
import torch
from torch import nn
from ..backbones.rec_svtrnet import Block, ConvBNLayer
class Im2Seq(nn.Module):
    """Collapse a height-1 feature map (B, C, 1, W) into a sequence (B, W, C)."""

    def __init__(self, in_channels, **kwargs):
        super().__init__()
        self.out_channels = in_channels

    def forward(self, x):
        # (B, C, 1, W) -> (B, C, W) -> (B, W, C); dim=2 only squeezes when H == 1.
        return x.squeeze(dim=2).permute(0, 2, 1)
class EncoderWithRNN_(nn.Module):
    """Bidirectional-by-hand encoder: two unidirectional 2-layer LSTMs, the
    second consuming (and re-flipping) the time-reversed sequence.
    """

    def __init__(self, in_channels, hidden_size):
        super(EncoderWithRNN_, self).__init__()
        self.out_channels = hidden_size * 2

        def make_lstm():
            return nn.LSTM(
                in_channels,
                hidden_size,
                bidirectional=False,
                batch_first=True,
                num_layers=2,
            )

        self.rnn1 = make_lstm()
        self.rnn2 = make_lstm()

    def forward(self, x):
        self.rnn1.flatten_parameters()
        self.rnn2.flatten_parameters()
        fwd, _ = self.rnn1(x)
        bwd, _ = self.rnn2(torch.flip(x, [1]))
        # Re-align the backward pass before concatenating along channels.
        return torch.cat([fwd, torch.flip(bwd, [1])], 2)
class EncoderWithRNN(nn.Module):
    """Sequence encoder: a 2-layer bidirectional, batch-first LSTM."""

    def __init__(self, in_channels, hidden_size):
        super(EncoderWithRNN, self).__init__()
        # Bidirectional output concatenates both directions per step.
        self.out_channels = hidden_size * 2
        self.lstm = nn.LSTM(
            in_channels,
            hidden_size,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x):
        encoded, _state = self.lstm(x)
        return encoded
class EncoderWithFC(nn.Module):
    """Sequence encoder: a single fully-connected projection per time step."""

    def __init__(self, in_channels, hidden_size):
        super(EncoderWithFC, self).__init__()
        self.out_channels = hidden_size
        # nn.Linear applies to the last axis, i.e. per time step on (B, T, C).
        self.fc = nn.Linear(in_channels, hidden_size, bias=True)

    def forward(self, x):
        return self.fc(x)
class EncoderWithSVTR(nn.Module):
    """Encode a 4-D feature map with a small SVTR global-attention stack.

    The map is channel-reduced (conv1/conv2), run through ``depth``
    global-mixing transformer ``Block``s on the flattened spatial axis,
    projected back (conv3), concatenated with a shortcut of the input,
    and squeezed to ``dims`` channels (conv4 + conv1x1).

    Args:
        in_channels: channels of the incoming feature map.
        dims: final output channel count (exposed as ``out_channels``).
        depth: number of SVTR mixing blocks.
        hidden_dims: embedding width inside the mixing blocks.
        use_guide: when True, the attention branch is detached from the
            autograd graph so gradients only flow through the shortcut.
        num_heads, qkv_bias, mlp_ratio, drop_rate, attn_drop_rate,
        drop_path, qk_scale: forwarded to each ``Block``.
        kernel_size: kernel of the first reduction convolution.
    """

    def __init__(
        self,
        in_channels,
        dims=64,  # XS
        depth=2,
        hidden_dims=120,
        use_guide=False,
        num_heads=8,
        qkv_bias=True,
        mlp_ratio=2.0,
        drop_rate=0.1,
        kernel_size=[3, 3],
        attn_drop_rate=0.1,
        drop_path=0.0,
        qk_scale=None,
    ):
        super(EncoderWithSVTR, self).__init__()
        self.depth = depth
        self.use_guide = use_guide
        # Reduce channels before the (quadratic-cost) attention blocks.
        self.conv1 = ConvBNLayer(
            in_channels,
            in_channels // 8,
            kernel_size=kernel_size,
            padding=[kernel_size[0] // 2, kernel_size[1] // 2],
            act="swish",
        )
        self.conv2 = ConvBNLayer(
            in_channels // 8, hidden_dims, kernel_size=1, act="swish"
        )
        self.svtr_block = nn.ModuleList(
            [
                Block(
                    dim=hidden_dims,
                    num_heads=num_heads,
                    mixer="Global",
                    HW=None,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    act_layer="swish",
                    attn_drop=attn_drop_rate,
                    drop_path=drop_path,
                    norm_layer="nn.LayerNorm",
                    epsilon=1e-05,
                    prenorm=False,
                )
                for _ in range(depth)
            ]
        )
        self.norm = nn.LayerNorm(hidden_dims, eps=1e-6)
        self.conv3 = ConvBNLayer(hidden_dims, in_channels, kernel_size=1, act="swish")
        # last conv-nxn, the input is concat of input tensor and conv3 output tensor
        self.conv4 = ConvBNLayer(
            2 * in_channels, in_channels // 8, padding=1, act="swish"
        )
        self.conv1x1 = ConvBNLayer(in_channels // 8, dims, kernel_size=1, act="swish")
        self.out_channels = dims
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Kaiming for convs, N(0, 0.01) for linears, ones/zeros for norms."""
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode="fan_out")
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, 0, 0.01)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.ConvTranspose2d):
            nn.init.kaiming_normal_(m.weight, mode="fan_out")
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)

    def forward(self, x):
        # BUGFIX: the original port set ``z.stop_gradient = True`` — that is
        # Paddle API; on a torch tensor it merely attaches an inert
        # attribute, so gradients were NOT blocked. ``detach()`` is the
        # torch equivalent.
        if self.use_guide:
            z = x.clone().detach()
        else:
            z = x
        # Shortcut keeps the full-channel input for the late concat.
        h = z
        # Reduce dim, then embed to hidden_dims.
        z = self.conv1(z)
        z = self.conv2(z)
        # SVTR global block: flatten spatial dims into a token sequence.
        B, C, H, W = z.shape
        z = z.flatten(2).permute(0, 2, 1)
        for blk in self.svtr_block:
            z = blk(z)
        z = self.norm(z)
        # Back to NCHW for the closing convolutions.
        z = z.reshape([-1, H, W, C]).permute(0, 3, 1, 2)
        z = self.conv3(z)
        # Fuse with the shortcut, then squeeze to ``dims`` channels.
        z = torch.cat((h, z), dim=1)
        z = self.conv1x1(self.conv4(z))
        return z
class SequenceEncoder(nn.Module):
    """Turn a backbone feature map into an encoded (B, T, C) sequence.

    ``encoder_type`` selects the encoder: "reshape" (no trainable encoder,
    just ``Im2Seq``), "fc", "rnn", or "svtr". The SVTR encoder runs on the
    4-D map first and is reshaped afterwards; all others run on the
    already-reshaped sequence.
    """

    def __init__(self, in_channels, encoder_type, hidden_size=48, **kwargs):
        super(SequenceEncoder, self).__init__()
        self.encoder_reshape = Im2Seq(in_channels)
        self.out_channels = self.encoder_reshape.out_channels
        self.encoder_type = encoder_type
        if encoder_type == "reshape":
            # Pure reshape: no trainable encoder follows.
            self.only_reshape = True
        else:
            support_encoder_dict = {
                "reshape": Im2Seq,
                "fc": EncoderWithFC,
                "rnn": EncoderWithRNN,
                "svtr": EncoderWithSVTR,
            }
            assert encoder_type in support_encoder_dict, "{} must in {}".format(
                encoder_type, support_encoder_dict.keys()
            )
            encoder_cls = support_encoder_dict[encoder_type]
            # SVTR takes its own keyword configuration; the rest only need
            # a hidden size.
            if encoder_type == "svtr":
                self.encoder = encoder_cls(
                    self.encoder_reshape.out_channels, **kwargs
                )
            else:
                self.encoder = encoder_cls(
                    self.encoder_reshape.out_channels, hidden_size
                )
            self.out_channels = self.encoder.out_channels
            self.only_reshape = False

    def forward(self, x):
        if self.encoder_type == "svtr":
            # SVTR consumes the 4-D map directly; flatten afterwards.
            encoded = self.encoder(x)
            return self.encoder_reshape(encoded)
        sequence = self.encoder_reshape(x)
        if self.only_reshape:
            return sequence
        return self.encoder(sequence)
import torch
class ClsPostProcess(object):
    """Convert classification logits to (label, score) pairs.

    ``label_list`` maps a class index to its human-readable label
    (e.g. rotation angles for a direction classifier).
    """

    def __init__(self, label_list, **kwargs):
        super(ClsPostProcess, self).__init__()
        self.label_list = label_list

    def __call__(self, preds, label=None, *args, **kwargs):
        # Accept either a torch tensor or an ndarray-like input.
        if isinstance(preds, torch.Tensor):
            preds = preds.cpu().numpy()
        best = preds.argmax(axis=1)
        # Pair each sample's best label with its raw score.
        decode_out = [
            (self.label_list[cls_idx], preds[row, cls_idx])
            for row, cls_idx in enumerate(best)
        ]
        if label is None:
            return decode_out
        # Ground-truth labels get a fixed confidence of 1.0.
        gt_out = [(self.label_list[cls_idx], 1.0) for cls_idx in label]
        return decode_out, gt_out
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment