refactor(ocr): remove unused code and simplify model architecture

- Remove unused imports and code
- Simplify model architecture by removing unnecessary components
- Update initialization and forward pass logic
- Rename variables for consistency

refactor(ocr): remove unused code and simplify model architecture
- Remove unused imports and code
- Simplify model architecture by removing unnecessary components
- Update initialization and forward pass logic
- Rename variables for consistency
b3d6785d · myhloli · 3cb156f5 · b3d6785d · 3cb156f5 · 3cb156f5
Commit b3d6785d authored Apr 01, 2025 by myhloli
20 changed files
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py
@@ -2,7 +2,8 @@ import os, sys
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from pytorchocr.modeling.common import Activation
+
+from ..common import Activation


 class ConvBNLayer(nn.Module):

--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_nrtr_mtb.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_nrtr_mtb.py
-
-import torch
-from torch import nn
-
-
-class MTB(nn.Module):
-    def __init__(self, cnn_num, in_channels):
-        super(MTB, self).__init__()
-        self.block = nn.Sequential()
-        self.out_channels = in_channels
-        self.cnn_num = cnn_num
-        if self.cnn_num == 2:
-            for i in range(self.cnn_num):
-                self.block.add_module(
-                    'conv_{}'.format(i),
-                    nn.Conv2d(
-                        in_channels=in_channels
-                        if i == 0 else 32 * (2**(i - 1)),
-                        out_channels=32 * (2**i),
-                        kernel_size=3,
-                        stride=2,
-                        padding=1))
-                self.block.add_module('relu_{}'.format(i), nn.ReLU())
-                self.block.add_module('bn_{}'.format(i),
-                                        nn.BatchNorm2d(32 * (2**i)))
-
-
-    def forward(self, images):
-        x = self.block(images)
-        if self.cnn_num == 2:
-            # (b, w, h, c)
-            x = x.permute(0, 3, 2, 1)
-            x_shape = x.shape
-            x = torch.reshape(
-                x, (x_shape[0], x_shape[1], x_shape[2] * x_shape[3]))
-        return x
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_resnet_31.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_resnet_31.py
-"""
-This code is refer from:
-https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/layers/conv_layer.py
-https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/backbones/resnet31_ocr.py
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-# import paddle
-# from paddle import ParamAttr
-# import paddle.nn as nn
-# import paddle.nn.functional as F
-
-
-__all__ = ["ResNet31"]
-
-
-def conv3x3(in_channel, out_channel, stride=1):
-    return nn.Conv2d(
-        in_channel,
-        out_channel,
-        kernel_size=3,
-        stride=stride,
-        padding=1,
-        bias=False)
-
-
-class BasicBlock(nn.Module):
-    expansion = 1
-
-    def __init__(self, in_channels, channels, stride=1, downsample=False):
-        super().__init__()
-        self.conv1 = conv3x3(in_channels, channels, stride)
-        self.bn1 = nn.BatchNorm2d(channels)
-        self.relu = nn.ReLU()
-        self.conv2 = conv3x3(channels, channels)
-        self.bn2 = nn.BatchNorm2d(channels)
-        self.downsample = downsample
-        if downsample:
-            self.downsample = nn.Sequential(
-                nn.Conv2d(
-                    in_channels,
-                    channels * self.expansion,
-                    1,
-                    stride,
-                    bias=False),
-                nn.BatchNorm2d(channels * self.expansion), )
-        else:
-            self.downsample = nn.Sequential()
-        self.stride = stride
-
-    def forward(self, x):
-        residual = x
-
-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.relu(out)
-
-        out = self.conv2(out)
-        out = self.bn2(out)
-
-        if self.downsample:
-            residual = self.downsample(x)
-
-        out += residual
-        out = self.relu(out)
-
-        return out
-
-
-class ResNet31(nn.Module):
-    '''
-    Args:
-        in_channels (int): Number of channels of input image tensor.
-        layers (list[int]): List of BasicBlock number for each stage.
-        channels (list[int]): List of out_channels of Conv2d layer.
-        out_indices (None | Sequence[int]): Indices of output stages.
-        last_stage_pool (bool): If True, add `MaxPool2d` layer to last stage.
-    '''
-
-    def __init__(self,
-                 in_channels=3,
-                 layers=[1, 2, 5, 3],
-                 channels=[64, 128, 256, 256, 512, 512, 512],
-                 out_indices=None,
-                 last_stage_pool=False):
-        super(ResNet31, self).__init__()
-        assert isinstance(in_channels, int)
-        assert isinstance(last_stage_pool, bool)
-
-        self.out_indices = out_indices
-        self.last_stage_pool = last_stage_pool
-
-        # conv 1 (Conv Conv)
-        self.conv1_1 = nn.Conv2d(
-            in_channels, channels[0], kernel_size=3, stride=1, padding=1)
-        self.bn1_1 = nn.BatchNorm2d(channels[0])
-        self.relu1_1 = nn.ReLU(inplace=True)
-
-        self.conv1_2 = nn.Conv2d(
-            channels[0], channels[1], kernel_size=3, stride=1, padding=1)
-        self.bn1_2 = nn.BatchNorm2d(channels[1])
-        self.relu1_2 = nn.ReLU(inplace=True)
-
-        # conv 2 (Max-pooling, Residual block, Conv)
-        self.pool2 = nn.MaxPool2d(
-            kernel_size=2, stride=2, padding=0, ceil_mode=True)
-        self.block2 = self._make_layer(channels[1], channels[2], layers[0])
-        self.conv2 = nn.Conv2d(
-            channels[2], channels[2], kernel_size=3, stride=1, padding=1)
-        self.bn2 = nn.BatchNorm2d(channels[2])
-        self.relu2 = nn.ReLU(inplace=True)
-
-        # conv 3 (Max-pooling, Residual block, Conv)
-        self.pool3 = nn.MaxPool2d(
-            kernel_size=2, stride=2, padding=0, ceil_mode=True)
-        self.block3 = self._make_layer(channels[2], channels[3], layers[1])
-        self.conv3 = nn.Conv2d(
-            channels[3], channels[3], kernel_size=3, stride=1, padding=1)
-        self.bn3 = nn.BatchNorm2d(channels[3])
-        self.relu3 = nn.ReLU(inplace=True)
-
-        # conv 4 (Max-pooling, Residual block, Conv)
-        self.pool4 = nn.MaxPool2d(
-            kernel_size=(2, 1), stride=(2, 1), padding=0, ceil_mode=True)
-        self.block4 = self._make_layer(channels[3], channels[4], layers[2])
-        self.conv4 = nn.Conv2d(
-            channels[4], channels[4], kernel_size=3, stride=1, padding=1)
-        self.bn4 = nn.BatchNorm2d(channels[4])
-        self.relu4 = nn.ReLU(inplace=True)
-
-        # conv 5 ((Max-pooling), Residual block, Conv)
-        self.pool5 = None
-        if self.last_stage_pool:
-            self.pool5 = nn.MaxPool2d(
-                kernel_size=2, stride=2, padding=0, ceil_mode=True)
-        self.block5 = self._make_layer(channels[4], channels[5], layers[3])
-        self.conv5 = nn.Conv2d(
-            channels[5], channels[5], kernel_size=3, stride=1, padding=1)
-        self.bn5 = nn.BatchNorm2d(channels[5])
-        self.relu5 = nn.ReLU(inplace=True)
-
-        self.out_channels = channels[-1]
-
-    def _make_layer(self, input_channels, output_channels, blocks):
-        layers = []
-        for _ in range(blocks):
-            downsample = None
-            if input_channels != output_channels:
-                downsample = nn.Sequential(
-                    nn.Conv2d(
-                        input_channels,
-                        output_channels,
-                        kernel_size=1,
-                        stride=1,
-                        bias=False),
-                    nn.BatchNorm2d(output_channels), )
-
-            layers.append(
-                BasicBlock(
-                    input_channels, output_channels, downsample=downsample))
-            input_channels = output_channels
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        x = self.conv1_1(x)
-        x = self.bn1_1(x)
-        x = self.relu1_1(x)
-
-        x = self.conv1_2(x)
-        x = self.bn1_2(x)
-        x = self.relu1_2(x)
-
-        outs = []
-        for i in range(4):
-            layer_index = i + 2
-            pool_layer = getattr(self, 'pool{}'.format(layer_index))
-            block_layer = getattr(self, 'block{}'.format(layer_index))
-            conv_layer = getattr(self, 'conv{}'.format(layer_index))
-            bn_layer = getattr(self, 'bn{}'.format(layer_index))
-            relu_layer = getattr(self, 'relu{}'.format(layer_index))
-
-            if pool_layer is not None:
-                x = pool_layer(x)
-            x = block_layer(x)
-            x = conv_layer(x)
-            x = bn_layer(x)
-            x = relu_layer(x)
-
-            outs.append(x)
-
-        if self.out_indices is not None:
-            return tuple([outs[i] for i in self.out_indices])
-
-        return x
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_resnet_fpn.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_resnet_fpn.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os, sys
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from pytorchocr.modeling.common import Activation
-
-__all__ = ["ResNetFPN"]
-
-class ResNetFPN(nn.Module):
-    def __init__(self, in_channels=1, layers=50, **kwargs):
-        super(ResNetFPN, self).__init__()
-        supported_layers = {
-            18: {
-                'depth': [2, 2, 2, 2],
-                'block_class': BasicBlock
-            },
-            34: {
-                'depth': [3, 4, 6, 3],
-                'block_class': BasicBlock
-            },
-            50: {
-                'depth': [3, 4, 6, 3],
-                'block_class': BottleneckBlock
-            },
-            101: {
-                'depth': [3, 4, 23, 3],
-                'block_class': BottleneckBlock
-            },
-            152: {
-                'depth': [3, 8, 36, 3],
-                'block_class': BottleneckBlock
-            }
-        }
-        stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)]
-        num_filters = [64, 128, 256, 512]
-        self.depth = supported_layers[layers]['depth']
-        self.conv = ConvBNLayer(
-            in_channels=in_channels,
-            out_channels=64,
-            kernel_size=7,
-            stride=2,
-            act="relu",
-            name="conv1")
-        self.block_list = nn.ModuleList()
-        in_ch = 64
-        if layers >= 50:
-            for block in range(len(self.depth)):
-                for i in range(self.depth[block]):
-                    if layers in [101, 152] and block == 2:
-                        if i == 0:
-                            conv_name = "res" + str(block + 2) + "a"
-                        else:
-                            conv_name = "res" + str(block + 2) + "b" + str(i)
-                    else:
-                        conv_name = "res" + str(block + 2) + chr(97 + i)
-
-                    bottlenectBlock = BottleneckBlock(
-                        in_channels=in_ch,
-                        out_channels=num_filters[block],
-                        stride=stride_list[block] if i == 0 else 1,
-                        name=conv_name)
-
-                    in_ch = num_filters[block] * 4
-                    self.block_list.add_module("bottleneckBlock_{}_{}".format(block, i), bottlenectBlock)
-
-        else:
-            for block in range(len(self.depth)):
-                for i in range(self.depth[block]):
-                    conv_name = "res" + str(block + 2) + chr(97 + i)
-                    if i == 0 and block != 0:
-                        stride = (2, 1)
-                    else:
-                        stride = (1, 1)
-
-                    basicBlock = BasicBlock(
-                            in_channels=in_ch,
-                            out_channels=num_filters[block],
-                            stride=stride_list[block] if i == 0 else 1,
-                            is_first=block == i == 0,
-                            name=conv_name)
-                    in_ch = basicBlock.out_channels
-                    self.block_list.add_module(conv_name, basicBlock)
-        out_ch_list = [in_ch // 4, in_ch // 2, in_ch]
-        self.base_block = nn.ModuleList()
-        self.conv_trans = []
-        self.bn_block = []
-        for i in [-2, -3]:
-            in_channels = out_ch_list[i + 1] + out_ch_list[i]
-            bb_0 = nn.Conv2d(
-                        in_channels=in_channels,
-                        out_channels=out_ch_list[i],
-                        kernel_size=1,
-                        bias=True)
-            self.base_block.add_module("F_{}_base_block_0".format(i), bb_0)
-            bb_1 = nn.Conv2d(
-                        in_channels=out_ch_list[i],
-                        out_channels=out_ch_list[i],
-                        kernel_size=3,
-                        padding=1,
-                        bias=True)
-            self.base_block.add_module("F_{}_base_block_1".format(i), bb_1)
-            bb_2 = nn.Sequential(
-                nn.BatchNorm2d(out_ch_list[i]),
-                Activation("relu")
-            )
-            self.base_block.add_module("F_{}_base_block_2".format(i), bb_2)
-
-        bb_3 = nn.Conv2d(
-                    in_channels=out_ch_list[i],
-                    out_channels=512,
-                    kernel_size=1,
-                    bias=True)
-        self.base_block.add_module("F_{}_base_block_3".format(i), bb_3)
-        self.out_channels = 512
-
-    def __call__(self, x):
-        x = self.conv(x)
-        fpn_list = []
-        F = []
-        for i in range(len(self.depth)):
-            fpn_list.append(np.sum(self.depth[:i + 1]))
-
-        for i, block in enumerate(self.block_list):
-            x = block(x)
-            for number in fpn_list:
-                if i + 1 == number:
-                    F.append(x)
-        base = F[-1]
-
-        j = 0
-        for i, block in enumerate(self.base_block):
-            if i % 3 == 0 and i < 6:
-                j = j + 1
-                b, c, w, h = F[-j - 1].shape
-                if [w, h] == list(base.shape[2:]):
-                    base = base
-                else:
-                    base = self.conv_trans[j - 1](base)
-                    base = self.bn_block[j - 1](base)
-                base = torch.cat([base, F[-j - 1]], dim=1)
-            base = block(base)
-        return base
-
-
-class ConvBNLayer(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 groups=1,
-                 act=None,
-                 name=None):
-        super(ConvBNLayer, self).__init__()
-        self.conv = nn.Conv2d(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=2 if stride == (1, 1) else kernel_size,
-            dilation=2 if stride == (1, 1) else 1,
-            stride=stride,
-            padding=(kernel_size - 1) // 2,
-            groups=groups,
-            bias=False, )
-
-        if name == "conv1":
-            bn_name = "bn_" + name
-        else:
-            bn_name = "bn" + name[3:]
-
-        self.bn = nn.BatchNorm2d(out_channels)
-        self.act = act
-        if self.act is not None:
-            self._act = Activation(act_type=self.act, inplace=True)
-
-    def __call__(self, x):
-        x = self.conv(x)
-        x = self.bn(x)
-        if self.act is not None:
-            x = self._act(x)
-        return x
-
-
-class ShortCut(nn.Module):
-    def __init__(self, in_channels, out_channels, stride, name, is_first=False):
-        super(ShortCut, self).__init__()
-        self.use_conv = True
-
-        if in_channels != out_channels or stride != 1 or is_first == True:
-            if stride == (1, 1):
-                self.conv = ConvBNLayer(
-                    in_channels, out_channels, 1, 1, name=name)
-            else:  # stride==(2,2)
-                self.conv = ConvBNLayer(
-                    in_channels, out_channels, 1, stride, name=name)
-        else:
-            self.use_conv = False
-
-    def forward(self, x):
-        if self.use_conv:
-            x = self.conv(x)
-        return x
-
-
-class BottleneckBlock(nn.Module):
-    def __init__(self, in_channels, out_channels, stride, name):
-        super(BottleneckBlock, self).__init__()
-        self.conv0 = ConvBNLayer(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=1,
-            act='relu',
-            name=name + "_branch2a")
-        self.conv1 = ConvBNLayer(
-            in_channels=out_channels,
-            out_channels=out_channels,
-            kernel_size=3,
-            stride=stride,
-            act='relu',
-            name=name + "_branch2b")
-
-        self.conv2 = ConvBNLayer(
-            in_channels=out_channels,
-            out_channels=out_channels * 4,
-            kernel_size=1,
-            act=None,
-            name=name + "_branch2c")
-
-        self.short = ShortCut(
-            in_channels=in_channels,
-            out_channels=out_channels * 4,
-            stride=stride,
-            is_first=False,
-            name=name + "_branch1")
-        self.out_channels = out_channels * 4
-
-    def forward(self, x):
-        y = self.conv0(x)
-        y = self.conv1(y)
-        y = self.conv2(y)
-        y = y + self.short(x)
-        y = F.relu(y)
-        return y
-
-
-class BasicBlock(nn.Module):
-    def __init__(self, in_channels, out_channels, stride, name, is_first):
-        super(BasicBlock, self).__init__()
-        self.conv0 = ConvBNLayer(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=3,
-            act='relu',
-            stride=stride,
-            name=name + "_branch2a")
-        self.conv1 = ConvBNLayer(
-            in_channels=out_channels,
-            out_channels=out_channels,
-            kernel_size=3,
-            act=None,
-            name=name + "_branch2b")
-        self.short = ShortCut(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            stride=stride,
-            is_first=is_first,
-            name=name + "_branch1")
-        self.out_channels = out_channels
-
-    def forward(self, x):
-        y = self.conv0(x)
-        y = self.conv1(y)
-        y = y + self.short(x)
-        return F.relu(y)
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_resnet_vd.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_resnet_vd.py
-import os, sys
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from pytorchocr.modeling.common import Activation
-
-class ConvBNLayer(nn.Module):
-    def __init__(
-            self,
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride=1,
-            groups=1,
-            is_vd_mode=False,
-            act=None,
-            name=None, ):
-        super(ConvBNLayer, self).__init__()
-        self.act = act
-        self.is_vd_mode = is_vd_mode
-        self._pool2d_avg = nn.AvgPool2d(
-            kernel_size=stride, stride=stride, padding=0, ceil_mode=True)
-
-        self._conv = nn.Conv2d(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=1 if is_vd_mode else stride,
-            padding=(kernel_size - 1) // 2,
-            groups=groups,
-            bias=False)
-
-        self._batch_norm = nn.BatchNorm2d(
-            out_channels,)
-        if self.act is not None:
-            self._act = Activation(act_type=act, inplace=True)
-
-    def forward(self, inputs):
-        if self.is_vd_mode:
-            inputs = self._pool2d_avg(inputs)
-        y = self._conv(inputs)
-        y = self._batch_norm(y)
-        if self.act is not None:
-            y = self._act(y)
-        return y
-
-
-class BottleneckBlock(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 stride,
-                 shortcut=True,
-                 if_first=False,
-                 name=None):
-        super(BottleneckBlock, self).__init__()
-
-        self.conv0 = ConvBNLayer(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=1,
-            act='relu',
-            name=name + "_branch2a")
-        self.conv1 = ConvBNLayer(
-            in_channels=out_channels,
-            out_channels=out_channels,
-            kernel_size=3,
-            stride=stride,
-            act='relu',
-            name=name + "_branch2b")
-        self.conv2 = ConvBNLayer(
-            in_channels=out_channels,
-            out_channels=out_channels * 4,
-            kernel_size=1,
-            act=None,
-            name=name + "_branch2c")
-
-        if not shortcut:
-            self.short = ConvBNLayer(
-                in_channels=in_channels,
-                out_channels=out_channels * 4,
-                kernel_size=1,
-                stride=stride,
-                is_vd_mode=not if_first and stride[0] != 1,
-                name=name + "_branch1")
-
-        self.shortcut = shortcut
-
-    def forward(self, inputs):
-        y = self.conv0(inputs)
-
-        conv1 = self.conv1(y)
-        conv2 = self.conv2(conv1)
-
-        if self.shortcut:
-            short = inputs
-        else:
-            short = self.short(inputs)
-        y = short + conv2
-        y = F.relu(y)
-        return y
-
-
-class BasicBlock(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 stride,
-                 shortcut=True,
-                 if_first=False,
-                 name=None):
-        super(BasicBlock, self).__init__()
-        self.stride = stride
-        self.conv0 = ConvBNLayer(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=3,
-            stride=stride,
-            act='relu',
-            name=name + "_branch2a")
-        self.conv1 = ConvBNLayer(
-            in_channels=out_channels,
-            out_channels=out_channels,
-            kernel_size=3,
-            act=None,
-            name=name + "_branch2b")
-
-        if not shortcut:
-            self.short = ConvBNLayer(
-                in_channels=in_channels,
-                out_channels=out_channels,
-                kernel_size=1,
-                stride=stride,
-                is_vd_mode=not if_first and stride[0] != 1,
-                name=name + "_branch1")
-
-        self.shortcut = shortcut
-
-    def forward(self, inputs):
-        y = self.conv0(inputs)
-        conv1 = self.conv1(y)
-
-        if self.shortcut:
-            short = inputs
-        else:
-            short = self.short(inputs)
-        y = short + conv1
-        y = F.relu(y)
-        return y
-
-
-class ResNet(nn.Module):
-    def __init__(self, in_channels=3, layers=50, **kwargs):
-        super(ResNet, self).__init__()
-
-        self.layers = layers
-        supported_layers = [18, 34, 50, 101, 152, 200]
-        assert layers in supported_layers, \
-            "supported layers are {} but input layer is {}".format(
-                supported_layers, layers)
-
-        if layers == 18:
-            depth = [2, 2, 2, 2]
-        elif layers == 34 or layers == 50:
-            depth = [3, 4, 6, 3]
-        elif layers == 101:
-            depth = [3, 4, 23, 3]
-        elif layers == 152:
-            depth = [3, 8, 36, 3]
-        elif layers == 200:
-            depth = [3, 12, 48, 3]
-        num_channels = [64, 256, 512,
-                        1024] if layers >= 50 else [64, 64, 128, 256]
-        num_filters = [64, 128, 256, 512]
-
-        self.conv1_1 = ConvBNLayer(
-            in_channels=in_channels,
-            out_channels=32,
-            kernel_size=3,
-            stride=1,
-            act='relu',
-            name="conv1_1")
-        self.conv1_2 = ConvBNLayer(
-            in_channels=32,
-            out_channels=32,
-            kernel_size=3,
-            stride=1,
-            act='relu',
-            name="conv1_2")
-        self.conv1_3 = ConvBNLayer(
-            in_channels=32,
-            out_channels=64,
-            kernel_size=3,
-            stride=1,
-            act='relu',
-            name="conv1_3")
-        self.pool2d_max = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
-
-        # self.block_list = list()
-        self.block_list = nn.Sequential()
-        if layers >= 50:
-            for block in range(len(depth)):
-                shortcut = False
-                for i in range(depth[block]):
-                    if layers in [101, 152, 200] and block == 2:
-                        if i == 0:
-                            conv_name = "res" + str(block + 2) + "a"
-                        else:
-                            conv_name = "res" + str(block + 2) + "b" + str(i)
-                    else:
-                        conv_name = "res" + str(block + 2) + chr(97 + i)
-
-                    if i == 0 and block != 0:
-                        stride = (2, 1)
-                    else:
-                        stride = (1, 1)
-
-                    bottleneck_block = BottleneckBlock(in_channels=num_channels[block] if i == 0 else num_filters[block] * 4,
-                                                       out_channels=num_filters[block],
-                                                       stride=stride,
-                                                       shortcut=shortcut,
-                                                       if_first=block == i == 0,
-                                                       name=conv_name)
-                    shortcut = True
-                    # self.block_list.append(bottleneck_block)
-                    self.block_list.add_module('bb_%d_%d' % (block, i), bottleneck_block)
-                self.out_channels = num_filters[block]
-        else:
-            for block in range(len(depth)):
-                shortcut = False
-                for i in range(depth[block]):
-                    conv_name = "res" + str(block + 2) + chr(97 + i)
-                    if i == 0 and block != 0:
-                        stride = (2, 1)
-                    else:
-                        stride = (1, 1)
-
-                    basic_block = BasicBlock(in_channels=num_channels[block] if i == 0 else num_filters[block],
-                                             out_channels=num_filters[block],
-                                             stride=stride,
-                                             shortcut=shortcut,
-                                             if_first=block == i == 0,
-                                             name=conv_name)
-
-                    shortcut = True
-                    # self.block_list.append(basic_block)
-                    self.block_list.add_module('bb_%d_%d' % (block, i), basic_block)
-                self.out_channels = num_filters[block]
-        self.out_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
-
-    def forward(self, inputs):
-        y = self.conv1_1(inputs)
-        y = self.conv1_2(y)
-        y = self.conv1_3(y)
-        y = self.pool2d_max(y)
-        for block in self.block_list:
-            y = block(y)
-        y = self.out_pool(y)
-
-        return y
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py
-import torch
-import torch.nn as nn
-from pytorchocr.modeling.common import Activation
 import numpy as np
+import torch
+from torch import nn
+
+from ..common import Activation

-def drop_path(x, drop_prob=0., training=False):
+
+def drop_path(x, drop_prob=0.0, training=False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
    """
-    if drop_prob == 0. or not training:
+    if drop_prob == 0.0 or not training:
        return x
    keep_prob = torch.as_tensor(1 - drop_prob)
-    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
+    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype)
    random_tensor = torch.floor(random_tensor)  # binarize
    output = x.divide(keep_prob) * random_tensor
@@ -19,15 +21,17 @@ def drop_path(x, drop_prob=0., training=False):


 class ConvBNLayer(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size=3,
-                 stride=1,
-                 padding=0,
-                 bias_attr=False,
-                 groups=1,
-                 act='gelu'):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=3,
+        stride=1,
+        padding=0,
+        bias_attr=False,
+        groups=1,
+        act="gelu",
+    ):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
@@ -36,7 +40,8 @@ class ConvBNLayer(nn.Module):
            stride=stride,
            padding=padding,
            groups=groups,
-            bias=bias_attr)
+            bias=bias_attr,
+        )
        self.norm = nn.BatchNorm2d(out_channels)
        self.act = Activation(act_type=act, inplace=True)

@@ -48,8 +53,7 @@ class ConvBNLayer(nn.Module):


 class DropPath(nn.Module):
-    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
-    """
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
@@ -68,12 +72,14 @@ class Identity(nn.Module):


 class Mlp(nn.Module):
-    def __init__(self,
-                 in_features,
-                 hidden_features=None,
-                 out_features=None,
-                 act_layer='gelu',
-                 drop=0.):
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer="gelu",
+        drop=0.0,
+    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
@@ -93,11 +99,12 @@ class Mlp(nn.Module):

 class ConvMixer(nn.Module):
    def __init__(
-            self,
-            dim,
-            num_heads=8,
-            HW=[8, 25],
-            local_k=[3, 3], ):
+        self,
+        dim,
+        num_heads=8,
+        HW=[8, 25],
+        local_k=[3, 3],
+    ):
        super().__init__()
        self.HW = HW
        self.dim = dim
@@ -105,9 +112,10 @@ class ConvMixer(nn.Module):
            dim,
            dim,
            local_k,
-            1, [local_k[0] // 2, local_k[1] // 2],
+            1,
+            [local_k[0] // 2, local_k[1] // 2],
            groups=num_heads,
-            )
+        )

    def forward(self, x):
        h = self.HW[0]
@@ -119,16 +127,18 @@ class ConvMixer(nn.Module):


 class Attention(nn.Module):
-    def __init__(self,
-                 dim,
-                 num_heads=8,
-                 mixer='Global',
-                 HW=[8, 25],
-                 local_k=[7, 11],
-                 qkv_bias=False,
-                 qk_scale=None,
-                 attn_drop=0.,
-                 proj_drop=0.):
+    def __init__(
+        self,
+        dim,
+        num_heads=8,
+        mixer="Global",
+        HW=[8, 25],
+        local_k=[7, 11],
+        qkv_bias=False,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
@@ -143,16 +153,19 @@ class Attention(nn.Module):
            W = HW[1]
            self.N = H * W
            self.C = dim
-        if mixer == 'Local' and HW is not None:
+        if mixer == "Local" and HW is not None:
            hk = local_k[0]
            wk = local_k[1]
            mask = torch.ones(H * W, H + hk - 1, W + wk - 1, dtype=torch.float32)
            for h in range(0, H):
                for w in range(0, W):
-                    mask[h * W + w, h:h + hk, w:w + wk] = 0.
-            mask_paddle = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk //
-                               2].flatten(1)
-            mask_inf = torch.full([H * W, H * W], fill_value=float("-Inf"), dtype=torch.float32)
+                    mask[h * W + w, h : h + hk, w : w + wk] = 0.0
+            mask_paddle = mask[:, hk // 2 : H + hk // 2, wk // 2 : W + wk // 2].flatten(
+                1
+            )
+            mask_inf = torch.full(
+                [H * W, H * W], fill_value=float("-Inf"), dtype=torch.float32
+            )
            mask = torch.where(mask_paddle < 1, mask_paddle, mask_inf)
            self.mask = mask.unsqueeze(0).unsqueeze(1)
            # self.mask = mask[None, None, :]
@@ -165,11 +178,13 @@ class Attention(nn.Module):
        else:
            _, N, C = x.shape
        qkv = self.qkv(x)
-        qkv = qkv.reshape((-1, N, 3, self.num_heads, C // self.num_heads)).permute(2, 0, 3, 1, 4)
+        qkv = qkv.reshape((-1, N, 3, self.num_heads, C // self.num_heads)).permute(
+            2, 0, 3, 1, 4
+        )
        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]

-        attn = (q.matmul(k.permute(0, 1, 3, 2)))
-        if self.mixer == 'Local':
+        attn = q.matmul(k.permute(0, 1, 3, 2))
+        if self.mixer == "Local":
            attn += self.mask
        attn = nn.functional.softmax(attn, dim=-1)
        attn = self.attn_drop(attn)
@@ -181,28 +196,30 @@ class Attention(nn.Module):


 class Block(nn.Module):
-    def __init__(self,
-                 dim,
-                 num_heads,
-                 mixer='Global',
-                 local_mixer=[7, 11],
-                 HW=None,
-                 mlp_ratio=4.,
-                 qkv_bias=False,
-                 qk_scale=None,
-                 drop=0.,
-                 attn_drop=0.,
-                 drop_path=0.,
-                 act_layer='gelu',
-                 norm_layer='nn.LayerNorm',
-                 epsilon=1e-6,
-                 prenorm=True):
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mixer="Global",
+        local_mixer=[7, 11],
+        HW=None,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        qk_scale=None,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer="gelu",
+        norm_layer="nn.LayerNorm",
+        epsilon=1e-6,
+        prenorm=True,
+    ):
        super().__init__()
        if isinstance(norm_layer, str):
            self.norm1 = eval(norm_layer)(dim, eps=epsilon)
        else:
            self.norm1 = norm_layer(dim)
-        if mixer == 'Global' or mixer == 'Local':
+        if mixer == "Global" or mixer == "Local":
            self.mixer = Attention(
                dim,
                num_heads=num_heads,
@@ -212,24 +229,26 @@ class Block(nn.Module):
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                attn_drop=attn_drop,
-                proj_drop=drop)
-        elif mixer == 'Conv':
-            self.mixer = ConvMixer(
-                dim, num_heads=num_heads, HW=HW, local_k=local_mixer)
+                proj_drop=drop,
+            )
+        elif mixer == "Conv":
+            self.mixer = ConvMixer(dim, num_heads=num_heads, HW=HW, local_k=local_mixer)
        else:
            raise TypeError("The mixer must be one of [Global, Local, Conv]")

-        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
        if isinstance(norm_layer, str):
            self.norm2 = eval(norm_layer)(dim, eps=epsilon)
        else:
            self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp_ratio = mlp_ratio
-        self.mlp = Mlp(in_features=dim,
-                       hidden_features=mlp_hidden_dim,
-                       act_layer=act_layer,
-                       drop=drop)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop,
+        )
        self.prenorm = prenorm

    def forward(self, x):
@@ -243,25 +262,24 @@ class Block(nn.Module):


 class PatchEmbed(nn.Module):
-    """ Image to Patch Embedding
-    """
+    """Image to Patch Embedding"""

-    def __init__(self,
-                 img_size=[32, 100],
-                 in_channels=3,
-                 embed_dim=768,
-                 sub_num=2,
-                 patch_size=[4, 4],
-                 mode='pope',
-                 ):
+    def __init__(
+        self,
+        img_size=[32, 100],
+        in_channels=3,
+        embed_dim=768,
+        sub_num=2,
+        patch_size=[4, 4],
+        mode="pope",
+    ):
        super().__init__()
-        num_patches = (img_size[1] // (2 ** sub_num)) * \
-                      (img_size[0] // (2 ** sub_num))
+        num_patches = (img_size[1] // (2**sub_num)) * (img_size[0] // (2**sub_num))
        self.img_size = img_size
        self.num_patches = num_patches
        self.embed_dim = embed_dim
        self.norm = None
-        if mode == 'pope':
+        if mode == "pope":
            if sub_num == 2:
                self.proj = nn.Sequential(
                    ConvBNLayer(
@@ -270,16 +288,19 @@ class PatchEmbed(nn.Module):
                        kernel_size=3,
                        stride=2,
                        padding=1,
-                        act='gelu',
-                        bias_attr=True),
+                        act="gelu",
+                        bias_attr=True,
+                    ),
                    ConvBNLayer(
                        in_channels=embed_dim // 2,
                        out_channels=embed_dim,
                        kernel_size=3,
                        stride=2,
                        padding=1,
-                        act='gelu',
-                        bias_attr=True))
+                        act="gelu",
+                        bias_attr=True,
+                    ),
+                )
            if sub_num == 3:
                self.proj = nn.Sequential(
                    ConvBNLayer(
@@ -288,55 +309,66 @@ class PatchEmbed(nn.Module):
                        kernel_size=3,
                        stride=2,
                        padding=1,
-                        act='gelu',
-                        bias_attr=True),
+                        act="gelu",
+                        bias_attr=True,
+                    ),
                    ConvBNLayer(
                        in_channels=embed_dim // 4,
                        out_channels=embed_dim // 2,
                        kernel_size=3,
                        stride=2,
                        padding=1,
-                        act='gelu',
-                        bias_attr=True),
+                        act="gelu",
+                        bias_attr=True,
+                    ),
                    ConvBNLayer(
                        in_channels=embed_dim // 2,
                        out_channels=embed_dim,
                        kernel_size=3,
                        stride=2,
                        padding=1,
-                        act='gelu',
-                        bias_attr=True))
-        elif mode == 'linear':
+                        act="gelu",
+                        bias_attr=True,
+                    ),
+                )
+        elif mode == "linear":
            self.proj = nn.Conv2d(
-                1, embed_dim, kernel_size=patch_size, stride=patch_size)
-            self.num_patches = img_size[0] // patch_size[0] * img_size[
-                1] // patch_size[1]
+                1, embed_dim, kernel_size=patch_size, stride=patch_size
+            )
+            self.num_patches = (
+                img_size[0] // patch_size[0] * img_size[1] // patch_size[1]
+            )

    def forward(self, x):
        B, C, H, W = x.shape
-        assert H == self.img_size[0] and W == self.img_size[1], \
-            "Input image size ({}*{}) doesn't match model ({}*{}).".format(
-                H,W,self.img_size[0],self.img_size[1]
-            )
+        assert (
+            H == self.img_size[0] and W == self.img_size[1]
+        ), "Input image size ({}*{}) doesn't match model ({}*{}).".format(
+            H, W, self.img_size[0], self.img_size[1]
+        )
        x = self.proj(x).flatten(2).permute(0, 2, 1)
        return x


 class SubSample(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 types='Pool',
-                 stride=[2, 1],
-                 sub_norm='nn.LayerNorm',
-                 act=None):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        types="Pool",
+        stride=[2, 1],
+        sub_norm="nn.LayerNorm",
+        act=None,
+    ):
        super().__init__()
        self.types = types
-        if types == 'Pool':
+        if types == "Pool":
            self.avgpool = nn.AvgPool2d(
-                kernel_size=[3, 5], stride=stride, padding=[1, 2])
+                kernel_size=[3, 5], stride=stride, padding=[1, 2]
+            )
            self.maxpool = nn.MaxPool2d(
-                kernel_size=[3, 5], stride=stride, padding=[1, 2])
+                kernel_size=[3, 5], stride=stride, padding=[1, 2]
+            )
            self.proj = nn.Linear(in_channels, out_channels)
        else:
            self.conv = nn.Conv2d(
@@ -345,7 +377,7 @@ class SubSample(nn.Module):
                kernel_size=3,
                stride=stride,
                padding=1,
-                )
+            )
        self.norm = eval(sub_norm)(out_channels)
        if act is not None:
            self.act = act()
@@ -353,8 +385,7 @@ class SubSample(nn.Module):
            self.act = None

    def forward(self, x):
-
-        if self.types == 'Pool':
+        if self.types == "Pool":
            x1 = self.avgpool(x)
            x2 = self.maxpool(x)
            x = (x1 + x2) * 0.5
@@ -371,46 +402,51 @@ class SubSample(nn.Module):

 class SVTRNet(nn.Module):
    def __init__(
-            self,
-            img_size=[32, 100],
-            in_channels=3,
-            embed_dim=[64, 128, 256],
-            depth=[3, 6, 3],
-            num_heads=[2, 4, 8],
-            mixer=['Local'] * 6 + ['Global'] *
-            6,  # Local atten, Global atten, Conv
-            local_mixer=[[7, 11], [7, 11], [7, 11]],
-            patch_merging='Conv',  # Conv, Pool, None
-            mlp_ratio=4,
-            qkv_bias=True,
-            qk_scale=None,
-            drop_rate=0.,
-            last_drop=0.0,
-            attn_drop_rate=0.,
-            drop_path_rate=0.1,
-            norm_layer='nn.LayerNorm',
-            sub_norm='nn.LayerNorm',
-            epsilon=1e-6,
-            out_channels=192,
-            out_char_num=25,
-            block_unit='Block',
-            act='gelu',
-            last_stage=True,
-            sub_num=2,
-            prenorm=True,
-            use_lenhead=False,
-            **kwargs):
+        self,
+        img_size=[32, 100],
+        in_channels=3,
+        embed_dim=[64, 128, 256],
+        depth=[3, 6, 3],
+        num_heads=[2, 4, 8],
+        mixer=["Local"] * 6 + ["Global"] * 6,  # Local atten, Global atten, Conv
+        local_mixer=[[7, 11], [7, 11], [7, 11]],
+        patch_merging="Conv",  # Conv, Pool, None
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        last_drop=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.1,
+        norm_layer="nn.LayerNorm",
+        sub_norm="nn.LayerNorm",
+        epsilon=1e-6,
+        out_channels=192,
+        out_char_num=25,
+        block_unit="Block",
+        act="gelu",
+        last_stage=True,
+        sub_num=2,
+        prenorm=True,
+        use_lenhead=False,
+        **kwargs
+    ):
        super().__init__()
        self.img_size = img_size
        self.embed_dim = embed_dim
        self.out_channels = out_channels
        self.prenorm = prenorm
-        patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging
+        patch_merging = (
+            None
+            if patch_merging != "Conv" and patch_merging != "Pool"
+            else patch_merging
+        )
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            in_channels=in_channels,
            embed_dim=embed_dim[0],
-            sub_num=sub_num)
+            sub_num=sub_num,
+        )
        num_patches = self.patch_embed.num_patches
        self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)]
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim[0]))
@@ -418,81 +454,95 @@ class SVTRNet(nn.Module):
        Block_unit = eval(block_unit)

        dpr = np.linspace(0, drop_path_rate, sum(depth))
-        self.blocks1 = nn.ModuleList([
-            Block_unit(
-                dim=embed_dim[0],
-                num_heads=num_heads[0],
-                mixer=mixer[0:depth[0]][i],
-                HW=self.HW,
-                local_mixer=local_mixer[0],
-                mlp_ratio=mlp_ratio,
-                qkv_bias=qkv_bias,
-                qk_scale=qk_scale,
-                drop=drop_rate,
-                act_layer=act,
-                attn_drop=attn_drop_rate,
-                drop_path=dpr[0:depth[0]][i],
-                norm_layer=norm_layer,
-                epsilon=epsilon,
-                prenorm=prenorm) for i in range(depth[0])
-        ])
+        self.blocks1 = nn.ModuleList(
+            [
+                Block_unit(
+                    dim=embed_dim[0],
+                    num_heads=num_heads[0],
+                    mixer=mixer[0 : depth[0]][i],
+                    HW=self.HW,
+                    local_mixer=local_mixer[0],
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop_rate,
+                    act_layer=act,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[0 : depth[0]][i],
+                    norm_layer=norm_layer,
+                    epsilon=epsilon,
+                    prenorm=prenorm,
+                )
+                for i in range(depth[0])
+            ]
+        )
        if patch_merging is not None:
            self.sub_sample1 = SubSample(
                embed_dim[0],
                embed_dim[1],
                sub_norm=sub_norm,
                stride=[2, 1],
-                types=patch_merging)
+                types=patch_merging,
+            )
            HW = [self.HW[0] // 2, self.HW[1]]
        else:
            HW = self.HW
        self.patch_merging = patch_merging
-        self.blocks2 = nn.ModuleList([
-            Block_unit(
-                dim=embed_dim[1],
-                num_heads=num_heads[1],
-                mixer=mixer[depth[0]:depth[0] + depth[1]][i],
-                HW=HW,
-                local_mixer=local_mixer[1],
-                mlp_ratio=mlp_ratio,
-                qkv_bias=qkv_bias,
-                qk_scale=qk_scale,
-                drop=drop_rate,
-                act_layer=act,
-                attn_drop=attn_drop_rate,
-                drop_path=dpr[depth[0]:depth[0] + depth[1]][i],
-                norm_layer=norm_layer,
-                epsilon=epsilon,
-                prenorm=prenorm) for i in range(depth[1])
-        ])
+        self.blocks2 = nn.ModuleList(
+            [
+                Block_unit(
+                    dim=embed_dim[1],
+                    num_heads=num_heads[1],
+                    mixer=mixer[depth[0] : depth[0] + depth[1]][i],
+                    HW=HW,
+                    local_mixer=local_mixer[1],
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop_rate,
+                    act_layer=act,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[depth[0] : depth[0] + depth[1]][i],
+                    norm_layer=norm_layer,
+                    epsilon=epsilon,
+                    prenorm=prenorm,
+                )
+                for i in range(depth[1])
+            ]
+        )
        if patch_merging is not None:
            self.sub_sample2 = SubSample(
                embed_dim[1],
                embed_dim[2],
                sub_norm=sub_norm,
                stride=[2, 1],
-                types=patch_merging)
+                types=patch_merging,
+            )
            HW = [self.HW[0] // 4, self.HW[1]]
        else:
            HW = self.HW
-        self.blocks3 = nn.ModuleList([
-            Block_unit(
-                dim=embed_dim[2],
-                num_heads=num_heads[2],
-                mixer=mixer[depth[0] + depth[1]:][i],
-                HW=HW,
-                local_mixer=local_mixer[2],
-                mlp_ratio=mlp_ratio,
-                qkv_bias=qkv_bias,
-                qk_scale=qk_scale,
-                drop=drop_rate,
-                act_layer=act,
-                attn_drop=attn_drop_rate,
-                drop_path=dpr[depth[0] + depth[1]:][i],
-                norm_layer=norm_layer,
-                epsilon=epsilon,
-                prenorm=prenorm) for i in range(depth[2])
-        ])
+        self.blocks3 = nn.ModuleList(
+            [
+                Block_unit(
+                    dim=embed_dim[2],
+                    num_heads=num_heads[2],
+                    mixer=mixer[depth[0] + depth[1] :][i],
+                    HW=HW,
+                    local_mixer=local_mixer[2],
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop_rate,
+                    act_layer=act,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[depth[0] + depth[1] :][i],
+                    norm_layer=norm_layer,
+                    epsilon=epsilon,
+                    prenorm=prenorm,
+                )
+                for i in range(depth[2])
+            ]
+        )
        self.last_stage = last_stage
        if last_stage:
            self.avg_pool = nn.AdaptiveAvgPool2d([1, out_char_num])
@@ -502,8 +552,9 @@ class SVTRNet(nn.Module):
                kernel_size=1,
                stride=1,
                padding=0,
-                bias=False)
-            self.hardswish = Activation('hard_swish', inplace=True) #nn.Hardswish()
+                bias=False,
+            )
+            self.hardswish = Activation("hard_swish", inplace=True)  # nn.Hardswish()
            # self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer")
            self.dropout = nn.Dropout(p=last_drop)
        if not prenorm:
@@ -511,9 +562,10 @@ class SVTRNet(nn.Module):
        self.use_lenhead = use_lenhead
        if use_lenhead:
            self.len_conv = nn.Linear(embed_dim[2], self.out_channels)
-            self.hardswish_len = Activation('hard_swish', inplace=True)# nn.Hardswish()
-            self.dropout_len = nn.Dropout(
-                p=last_drop)
+            self.hardswish_len = Activation(
+                "hard_swish", inplace=True
+            )  # nn.Hardswish()
+            self.dropout_len = nn.Dropout(p=last_drop)

        torch.nn.init.xavier_normal_(self.pos_embed)
        self.apply(self._init_weights)
@@ -521,7 +573,7 @@ class SVTRNet(nn.Module):
    def _init_weights(self, m):
        # weight initialization
        if isinstance(m, nn.Conv2d):
-            nn.init.kaiming_normal_(m.weight, mode='fan_out')
+            nn.init.kaiming_normal_(m.weight, mode="fan_out")
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.BatchNorm2d):
@@ -532,7 +584,7 @@ class SVTRNet(nn.Module):
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.ConvTranspose2d):
-            nn.init.kaiming_normal_(m.weight, mode='fan_out')
+            nn.init.kaiming_normal_(m.weight, mode="fan_out")
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
@@ -548,13 +600,17 @@ class SVTRNet(nn.Module):
        if self.patch_merging is not None:
            x = self.sub_sample1(
                x.permute(0, 2, 1).reshape(
-                    [-1, self.embed_dim[0], self.HW[0], self.HW[1]]))
+                    [-1, self.embed_dim[0], self.HW[0], self.HW[1]]
+                )
+            )
        for blk in self.blocks2:
            x = blk(x)
        if self.patch_merging is not None:
            x = self.sub_sample2(
                x.permute(0, 2, 1).reshape(
-                    [-1, self.embed_dim[1], self.HW[0] // 2, self.HW[1]]))
+                    [-1, self.embed_dim[1], self.HW[0] // 2, self.HW[1]]
+                )
+            )
        for blk in self.blocks3:
            x = blk(x)
        if not self.prenorm:
@@ -572,11 +628,11 @@ class SVTRNet(nn.Module):
            else:
                h = self.HW[0]
            x = self.avg_pool(
-                x.permute(0, 2, 1).reshape(
-                    [-1, self.embed_dim[2], h, self.HW[1]]))
+                x.permute(0, 2, 1).reshape([-1, self.embed_dim[2], h, self.HW[1]])
+            )
            x = self.last_conv(x)
            x = self.hardswish(x)
            x = self.dropout(x)
        if self.use_lenhead:
            return x, len_x
-        return x
\ No newline at end of file
+        return x
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_vitstr.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_vitstr.py
-"""
-This code is refer from:
-https://github.com/roatienza/deep-text-recognition-benchmark/blob/master/modules/vitstr.py
-"""
-
-import numpy as np
-import torch
-import torch.nn as nn
-from pytorchocr.modeling.backbones.rec_svtrnet import Block, PatchEmbed
-
-# import paddle
-# import paddle.nn as nn
-# from ppocr.modeling.backbones.rec_svtrnet import Block, PatchEmbed, zeros_, trunc_normal_, ones_
-
-scale_dim_heads = {'tiny': [192, 3], 'small': [384, 6], 'base': [768, 12]}
-
-
-class ViTSTR(nn.Module):
-    def __init__(self,
-                 img_size=[224, 224],
-                 in_channels=1,
-                 scale='tiny',
-                 seqlen=27,
-                 patch_size=[16, 16],
-                 embed_dim=None,
-                 depth=12,
-                 num_heads=None,
-                 mlp_ratio=4,
-                 qkv_bias=True,
-                 qk_scale=None,
-                 drop_path_rate=0.,
-                 drop_rate=0.,
-                 attn_drop_rate=0.,
-                 norm_layer='nn.LayerNorm',
-                 act_layer='gelu',
-                 epsilon=1e-6,
-                 out_channels=None,
-                 **kwargs):
-        super().__init__()
-        self.seqlen = seqlen
-        embed_dim = embed_dim if embed_dim is not None else scale_dim_heads[
-            scale][0]
-        num_heads = num_heads if num_heads is not None else scale_dim_heads[
-            scale][1]
-        out_channels = out_channels if out_channels is not None else embed_dim
-        self.patch_embed = PatchEmbed(
-            img_size=img_size,
-            in_channels=in_channels,
-            embed_dim=embed_dim,
-            patch_size=patch_size,
-            mode='linear')
-        num_patches = self.patch_embed.num_patches
-
-        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
-        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
-
-        self.pos_drop = nn.Dropout(p=drop_rate)
-
-        dpr = np.linspace(0, drop_path_rate, depth)
-        self.blocks = nn.ModuleList([
-            Block(
-                dim=embed_dim,
-                num_heads=num_heads,
-                mlp_ratio=mlp_ratio,
-                qkv_bias=qkv_bias,
-                qk_scale=qk_scale,
-                drop=drop_rate,
-                attn_drop=attn_drop_rate,
-                drop_path=dpr[i],
-                norm_layer=norm_layer,
-                act_layer=act_layer,
-                epsilon=epsilon,
-                prenorm=False) for i in range(depth)
-        ])
-        self.norm = eval(norm_layer)(embed_dim, eps=epsilon)
-
-        self.out_channels = out_channels
-
-        torch.nn.init.xavier_normal_(self.pos_embed)
-        torch.nn.init.xavier_normal_(self.cls_token)
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        # weight initialization
-        if isinstance(m, nn.Conv2d):
-            nn.init.kaiming_normal_(m.weight, mode='fan_out')
-            if m.bias is not None:
-                nn.init.zeros_(m.bias)
-        elif isinstance(m, nn.BatchNorm2d):
-            nn.init.ones_(m.weight)
-            nn.init.zeros_(m.bias)
-        elif isinstance(m, nn.Linear):
-            nn.init.normal_(m.weight, 0, 0.01)
-            if m.bias is not None:
-                nn.init.zeros_(m.bias)
-        elif isinstance(m, nn.ConvTranspose2d):
-            nn.init.kaiming_normal_(m.weight, mode='fan_out')
-            if m.bias is not None:
-                nn.init.zeros_(m.bias)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.ones_(m.weight)
-            nn.init.zeros_(m.bias)
-
-    def forward_features(self, x):
-        B = x.shape[0]
-        x = self.patch_embed(x)
-        # cls_tokens = paddle.tile(self.cls_token, repeat_times=[B, 1, 1])
-        cls_tokens = self.cls_token.repeat(B, 1, 1)
-        x = torch.cat((cls_tokens, x), dim=1)
-        x = x + self.pos_embed
-        x = self.pos_drop(x)
-        for blk in self.blocks:
-            x = blk(x)
-        x = self.norm(x)
-        return x
-
-    def forward(self, x):
-        x = self.forward_features(x)
-        x = x[:, :self.seqlen]
-        return x.permute(0, 2, 1).unsqueeze(2)
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/table_mobilenet_v3.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/table_mobilenet_v3.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-__all__ = ['MobileNetV3']
-
-
-def make_divisible(v, divisor=8, min_value=None):
-    if min_value is None:
-        min_value = divisor
-    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
-    if new_v < 0.9 * v:
-        new_v += divisor
-    return new_v
-
-def hard_sigmoid(x, slope=0.1666667, offset=0.5,):
-    return torch.clamp(slope * x + offset, 0., 1.)
-
-def hard_swish(x, inplace=True):
-    return x * F.relu6(x + 3., inplace=inplace) / 6.
-
-class MobileNetV3(nn.Module):
-    def __init__(self,
-                 in_channels=3,
-                 model_name='large',
-                 scale=0.5,
-                 disable_se=False,
-                 **kwargs):
-        """
-        the MobilenetV3 backbone network for detection module.
-        Args:
-            params(dict): the super parameters for build network
-        """
-        super(MobileNetV3, self).__init__()
-
-        self.disable_se = disable_se
-
-        if model_name == "large":
-            cfg = [
-                # k, exp, c,  se,     nl,  s,
-                [3, 16, 16, False, 'relu', 1],
-                [3, 64, 24, False, 'relu', 2],
-                [3, 72, 24, False, 'relu', 1],
-                [5, 72, 40, True, 'relu', 2],
-                [5, 120, 40, True, 'relu', 1],
-                [5, 120, 40, True, 'relu', 1],
-                [3, 240, 80, False, 'hardswish', 2],
-                [3, 200, 80, False, 'hardswish', 1],
-                [3, 184, 80, False, 'hardswish', 1],
-                [3, 184, 80, False, 'hardswish', 1],
-                [3, 480, 112, True, 'hardswish', 1],
-                [3, 672, 112, True, 'hardswish', 1],
-                [5, 672, 160, True, 'hardswish', 2],
-                [5, 960, 160, True, 'hardswish', 1],
-                [5, 960, 160, True, 'hardswish', 1],
-            ]
-            cls_ch_squeeze = 960
-        elif model_name == "small":
-            cfg = [
-                # k, exp, c,  se,     nl,  s,
-                [3, 16, 16, True, 'relu', 2],
-                [3, 72, 24, False, 'relu', 2],
-                [3, 88, 24, False, 'relu', 1],
-                [5, 96, 40, True, 'hardswish', 2],
-                [5, 240, 40, True, 'hardswish', 1],
-                [5, 240, 40, True, 'hardswish', 1],
-                [5, 120, 48, True, 'hardswish', 1],
-                [5, 144, 48, True, 'hardswish', 1],
-                [5, 288, 96, True, 'hardswish', 2],
-                [5, 576, 96, True, 'hardswish', 1],
-                [5, 576, 96, True, 'hardswish', 1],
-            ]
-            cls_ch_squeeze = 576
-        else:
-            raise NotImplementedError("mode[" + model_name +
-                                      "_model] is not implemented!")
-
-        supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
-        assert scale in supported_scale, \
-            "supported scale are {} but input scale is {}".format(supported_scale, scale)
-        inplanes = 16
-        # conv1
-        self.conv = ConvBNLayer(
-            in_channels=in_channels,
-            out_channels=make_divisible(inplanes * scale),
-            kernel_size=3,
-            stride=2,
-            padding=1,
-            groups=1,
-            if_act=True,
-            act='hardswish',
-            name='conv1')
-
-        self.stages = nn.ModuleList()
-        self.out_channels = []
-        block_list = []
-        i = 0
-        inplanes = make_divisible(inplanes * scale)
-        for (k, exp, c, se, nl, s) in cfg:
-            se = se and not self.disable_se
-            start_idx = 2 if model_name == 'large' else 0
-            if s == 2 and i > start_idx:
-                self.out_channels.append(inplanes)
-                self.stages.append(nn.Sequential(*block_list))
-                block_list = []
-            block_list.append(
-                ResidualUnit(
-                    in_channels=inplanes,
-                    mid_channels=make_divisible(scale * exp),
-                    out_channels=make_divisible(scale * c),
-                    kernel_size=k,
-                    stride=s,
-                    use_se=se,
-                    act=nl,
-                    name="conv" + str(i + 2)))
-            inplanes = make_divisible(scale * c)
-            i += 1
-        block_list.append(
-            ConvBNLayer(
-                in_channels=inplanes,
-                out_channels=make_divisible(scale * cls_ch_squeeze),
-                kernel_size=1,
-                stride=1,
-                padding=0,
-                groups=1,
-                if_act=True,
-                act='hardswish',
-                name='conv_last'))
-        self.stages.append(nn.Sequential(*block_list))
-        self.out_channels.append(make_divisible(scale * cls_ch_squeeze))
-        # for i, stage in enumerate(self.stages):
-        #     self.add_module(module=stage, name="stage{}".format(i))
-
-    def forward(self, x):
-        x = self.conv(x)
-        out_list = []
-        for stage in self.stages:
-            x = stage(x)
-            out_list.append(x)
-        return out_list
-
-
-class ConvBNLayer(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride,
-                 padding,
-                 groups=1,
-                 if_act=True,
-                 act=None,
-                 name=None):
-        super(ConvBNLayer, self).__init__()
-        self.if_act = if_act
-        self.act = act
-        self.conv = nn.Conv2d(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            groups=groups,
-            bias=False)
-
-        self.bn = nn.BatchNorm2d(
-            out_channels,
-        )
-
-    def forward(self, x):
-        x = self.conv(x)
-        x = self.bn(x)
-        if self.if_act:
-            if self.act == "relu":
-                x = F.relu(x)
-            elif self.act == "hardswish":
-                x = hard_swish(x)
-            else:
-                print("The activation function({}) is selected incorrectly.".
-                      format(self.act))
-                exit()
-        return x
-
-
-class ResidualUnit(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 mid_channels,
-                 out_channels,
-                 kernel_size,
-                 stride,
-                 use_se,
-                 act=None,
-                 name=''):
-        super(ResidualUnit, self).__init__()
-        self.if_shortcut = stride == 1 and in_channels == out_channels
-        self.if_se = use_se
-
-        self.expand_conv = ConvBNLayer(
-            in_channels=in_channels,
-            out_channels=mid_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            if_act=True,
-            act=act,
-            name=name + "_expand")
-        self.bottleneck_conv = ConvBNLayer(
-            in_channels=mid_channels,
-            out_channels=mid_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=int((kernel_size - 1) // 2),
-            groups=mid_channels,
-            if_act=True,
-            act=act,
-            name=name + "_depthwise")
-        if self.if_se:
-            self.mid_se = SEModule(mid_channels, name=name + "_se")
-        self.linear_conv = ConvBNLayer(
-            in_channels=mid_channels,
-            out_channels=out_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            if_act=False,
-            act=None,
-            name=name + "_linear")
-
-    def forward(self, inputs):
-        x = self.expand_conv(inputs)
-        x = self.bottleneck_conv(x)
-        if self.if_se:
-            x = self.mid_se(x)
-        x = self.linear_conv(x)
-        if self.if_shortcut:
-            x = torch.add(inputs, x)
-        return x
-
-
-class SEModule(nn.Module):
-    def __init__(self, in_channels, reduction=4, name=""):
-        super(SEModule, self).__init__()
-        self.avg_pool = nn.AdaptiveAvgPool2d(1)
-        self.conv1 = nn.Conv2d(
-            in_channels=in_channels,
-            out_channels=in_channels // reduction,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            bias=True)
-        self.conv2 = nn.Conv2d(
-            in_channels=in_channels // reduction,
-            out_channels=in_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            bias=True)
-
-    def forward(self, inputs):
-        outputs = self.avg_pool(inputs)
-        outputs = self.conv1(outputs)
-        outputs = F.relu(outputs)
-        outputs = self.conv2(outputs)
-        outputs = hard_sigmoid(outputs, slope=0.2, offset=0.5)
-        return inputs * outputs
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/table_resnet_vd.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/table_resnet_vd.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from pytorchocr.modeling.common import Activation
-
-__all__ = ["ResNet"]
-
-
-class ConvBNLayer(nn.Module):
-    def __init__(
-            self,
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride=1,
-            groups=1,
-            is_vd_mode=False,
-            act=None,
-            name=None, ):
-        super(ConvBNLayer, self).__init__()
-
-        self.is_vd_mode = is_vd_mode
-        self._pool2d_avg = nn.AvgPool2d(
-            kernel_size=2, stride=2, padding=0, ceil_mode=True)
-        self._conv = nn.Conv2d(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=(kernel_size - 1) // 2,
-            groups=groups,
-            bias=False)
-        if name == "conv1":
-            bn_name = "bn_" + name
-        else:
-            bn_name = "bn" + name[3:]
-        self._batch_norm = nn.BatchNorm2d(
-            out_channels,
-        )
-        self.act = act
-        if self.act is not None:
-            self._act = Activation(act, inplace=True)
-
-
-    def forward(self, inputs):
-        if self.is_vd_mode:
-            inputs = self._pool2d_avg(inputs)
-        y = self._conv(inputs)
-        y = self._batch_norm(y)
-        if self.act is not None:
-            y = self._act(y)
-        return y
-
-
-class BottleneckBlock(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 stride,
-                 shortcut=True,
-                 if_first=False,
-                 name=None):
-        super(BottleneckBlock, self).__init__()
-
-        self.conv0 = ConvBNLayer(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=1,
-            act='relu',
-            name=name + "_branch2a")
-        self.conv1 = ConvBNLayer(
-            in_channels=out_channels,
-            out_channels=out_channels,
-            kernel_size=3,
-            stride=stride,
-            act='relu',
-            name=name + "_branch2b")
-        self.conv2 = ConvBNLayer(
-            in_channels=out_channels,
-            out_channels=out_channels * 4,
-            kernel_size=1,
-            act=None,
-            name=name + "_branch2c")
-
-        if not shortcut:
-            self.short = ConvBNLayer(
-                in_channels=in_channels,
-                out_channels=out_channels * 4,
-                kernel_size=1,
-                stride=1,
-                is_vd_mode=False if if_first else True,
-                name=name + "_branch1")
-
-        self.shortcut = shortcut
-
-    def forward(self, inputs):
-        y = self.conv0(inputs)
-        conv1 = self.conv1(y)
-        conv2 = self.conv2(conv1)
-
-        if self.shortcut:
-            short = inputs
-        else:
-            short = self.short(inputs)
-        y = torch.add(short, conv2)
-        y = F.relu(y)
-        return y
-
-
-class BasicBlock(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 stride,
-                 shortcut=True,
-                 if_first=False,
-                 name=None):
-        super(BasicBlock, self).__init__()
-        self.stride = stride
-        self.conv0 = ConvBNLayer(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=3,
-            stride=stride,
-            act='relu',
-            name=name + "_branch2a")
-        self.conv1 = ConvBNLayer(
-            in_channels=out_channels,
-            out_channels=out_channels,
-            kernel_size=3,
-            act=None,
-            name=name + "_branch2b")
-
-        if not shortcut:
-            self.short = ConvBNLayer(
-                in_channels=in_channels,
-                out_channels=out_channels,
-                kernel_size=1,
-                stride=1,
-                is_vd_mode=False if if_first else True,
-                name=name + "_branch1")
-
-        self.shortcut = shortcut
-
-    def forward(self, inputs):
-        y = self.conv0(inputs)
-        conv1 = self.conv1(y)
-
-        if self.shortcut:
-            short = inputs
-        else:
-            short = self.short(inputs)
-        y = torch.add(short, conv1)
-        y = F.relu(y)
-        return y
-
-
-class ResNet(nn.Module):
-    def __init__(self, in_channels=3, layers=50, **kwargs):
-        super(ResNet, self).__init__()
-
-        self.layers = layers
-        supported_layers = [18, 34, 50, 101, 152, 200]
-        assert layers in supported_layers, \
-            "supported layers are {} but input layer is {}".format(
-                supported_layers, layers)
-
-        if layers == 18:
-            depth = [2, 2, 2, 2]
-        elif layers == 34 or layers == 50:
-            depth = [3, 4, 6, 3]
-        elif layers == 101:
-            depth = [3, 4, 23, 3]
-        elif layers == 152:
-            depth = [3, 8, 36, 3]
-        elif layers == 200:
-            depth = [3, 12, 48, 3]
-        num_channels = [64, 256, 512,
-                        1024] if layers >= 50 else [64, 64, 128, 256]
-        num_filters = [64, 128, 256, 512]
-
-        self.conv1_1 = ConvBNLayer(
-            in_channels=in_channels,
-            out_channels=32,
-            kernel_size=3,
-            stride=2,
-            act='relu',
-            name="conv1_1")
-        self.conv1_2 = ConvBNLayer(
-            in_channels=32,
-            out_channels=32,
-            kernel_size=3,
-            stride=1,
-            act='relu',
-            name="conv1_2")
-        self.conv1_3 = ConvBNLayer(
-            in_channels=32,
-            out_channels=64,
-            kernel_size=3,
-            stride=1,
-            act='relu',
-            name="conv1_3")
-        self.pool2d_max = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
-
-        self.stages = nn.ModuleList()
-        self.out_channels = []
-        if layers >= 50:
-            for block in range(len(depth)):
-                block_list = nn.Sequential()
-                shortcut = False
-                for i in range(depth[block]):
-                    if layers in [101, 152] and block == 2:
-                        if i == 0:
-                            conv_name = "res" + str(block + 2) + "a"
-                        else:
-                            conv_name = "res" + str(block + 2) + "b" + str(i)
-                    else:
-                        conv_name = "res" + str(block + 2) + chr(97 + i)
-                    bottleneck_block = BottleneckBlock(
-                            in_channels=num_channels[block]
-                            if i == 0 else num_filters[block] * 4,
-                            out_channels=num_filters[block],
-                            stride=2 if i == 0 and block != 0 else 1,
-                            shortcut=shortcut,
-                            if_first=block == i == 0,
-                            name=conv_name
-                    )
-                    shortcut = True
-                    # block_list.append(bottleneck_block)
-                    block_list.add_module('bb_%d_%d' % (block, i), bottleneck_block)
-                self.out_channels.append(num_filters[block] * 4)
-                # self.stages.append(nn.Sequential(*block_list))
-                self.stages.append(block_list)
-        else:
-            for block in range(len(depth)):
-                block_list = nn.Sequential()
-                shortcut = False
-                for i in range(depth[block]):
-                    conv_name = "res" + str(block + 2) + chr(97 + i)
-                    basic_block = BasicBlock(
-                            in_channels=num_channels[block]
-                            if i == 0 else num_filters[block],
-                            out_channels=num_filters[block],
-                            stride=2 if i == 0 and block != 0 else 1,
-                            shortcut=shortcut,
-                            if_first=block == i == 0,
-                            name=conv_name
-                    )
-                    shortcut = True
-                    # block_list.append(basic_block)
-                    block_list.add_module('bb_%d_%d' % (block, i), basic_block)
-                self.out_channels.append(num_filters[block])
-                # self.stages.append(nn.Sequential(*block_list))
-                self.stages.append(block_list)
-
-    def forward(self, inputs):
-        y = self.conv1_1(inputs)
-        y = self.conv1_2(y)
-        y = self.conv1_3(y)
-        y = self.pool2d_max(y)
-        out = []
-        for block in self.stages:
-            y = block(y)
-            out.append(y)
-        return out
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py
-
-
 import torch
-import torch.nn as nn
 import torch.nn.functional as F
+from torch import nn
+

 class Hswish(nn.Module):
    def __init__(self, inplace=True):
@@ -10,7 +9,8 @@ class Hswish(nn.Module):
        self.inplace = inplace

    def forward(self, x):
-        return x * F.relu6(x + 3., inplace=self.inplace) / 6.
+        return x * F.relu6(x + 3.0, inplace=self.inplace) / 6.0
+

 # out = max(0, min(1, slop*x+offset))
 # paddle.fluid.layers.hard_sigmoid(x, slope=0.2, offset=0.5, name=None)
@@ -22,7 +22,8 @@ class Hsigmoid(nn.Module):
    def forward(self, x):
        # torch: F.relu6(x + 3., inplace=self.inplace) / 6.
        # paddle: F.relu6(1.2 * x + 3., inplace=self.inplace) / 6.
-        return F.relu6(1.2 * x + 3., inplace=self.inplace) / 6.
+        return F.relu6(1.2 * x + 3.0, inplace=self.inplace) / 6.0
+

 class GELU(nn.Module):
    def __init__(self, inplace=True):
@@ -43,31 +44,33 @@ class Swish(nn.Module):
            x.mul_(torch.sigmoid(x))
            return x
        else:
-            return x*torch.sigmoid(x)
+            return x * torch.sigmoid(x)


 class Activation(nn.Module):
    def __init__(self, act_type, inplace=True):
        super(Activation, self).__init__()
        act_type = act_type.lower()
-        if act_type == 'relu':
+        if act_type == "relu":
            self.act = nn.ReLU(inplace=inplace)
-        elif act_type == 'relu6':
+        elif act_type == "relu6":
            self.act = nn.ReLU6(inplace=inplace)
-        elif act_type == 'sigmoid':
+        elif act_type == "sigmoid":
            raise NotImplementedError
-        elif act_type == 'hard_sigmoid':
-            self.act = Hsigmoid(inplace)#nn.Hardsigmoid(inplace=inplace)#Hsigmoid(inplace)#
-        elif act_type == 'hard_swish' or act_type == 'hswish':
+        elif act_type == "hard_sigmoid":
+            self.act = Hsigmoid(
+                inplace
+            )  # nn.Hardsigmoid(inplace=inplace)#Hsigmoid(inplace)#
+        elif act_type == "hard_swish" or act_type == "hswish":
            self.act = Hswish(inplace=inplace)
-        elif act_type == 'leakyrelu':
+        elif act_type == "leakyrelu":
            self.act = nn.LeakyReLU(inplace=inplace)
-        elif act_type == 'gelu':
+        elif act_type == "gelu":
            self.act = GELU(inplace=inplace)
-        elif act_type == 'swish':
+        elif act_type == "swish":
            self.act = Swish(inplace=inplace)
        else:
            raise NotImplementedError

    def forward(self, inputs):
-        return self.act(inputs)
\ No newline at end of file
+        return self.act(inputs)
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py
@@ -12,40 +12,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__all__ = ['build_head']
+__all__ = ["build_head"]


 def build_head(config, **kwargs):
    # det head
    from .det_db_head import DBHead, PFHeadLocal
-    from .det_east_head import EASTHead
-    from .det_sast_head import SASTHead
-    from .det_pse_head import PSEHead
-    from .det_fce_head import FCEHead
-    from .e2e_pg_head import PGHead

    # rec head
    from .rec_ctc_head import CTCHead
-    from .rec_att_head import AttentionHead
-    from .rec_srn_head import SRNHead
-    from .rec_nrtr_head import Transformer
-    from .rec_sar_head import SARHead
-    from .rec_can_head import CANHead
    from .rec_multi_head import MultiHead

    # cls head
    from .cls_head import ClsHead
-    support_dict = [
-        'DBHead', 'PSEHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead', 'AttentionHead',
-        'SRNHead', 'PGHead', 'Transformer', 'TableAttentionHead','SARHead', 'FCEHead',
-        'CANHead', 'MultiHead', 'PFHeadLocal',

+    support_dict = [
+        "DBHead",
+        "CTCHead",
+        "ClsHead",
+        "MultiHead",
+        "PFHeadLocal",
    ]

-    from .table_att_head import TableAttentionHead
-
-    module_name = config.pop('name')
-    assert module_name in support_dict, Exception('head only support {}'.format(
-        support_dict))
+    module_name = config.pop("name")
+    char_num = config.pop("char_num", 6625)
+    assert module_name in support_dict, Exception(
+        "head only support {}".format(support_dict)
+    )
    module_class = eval(module_name)(**config, **kwargs)
-    return module_class
\ No newline at end of file
+    return module_class
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py
-import os, sys
 import torch
-import torch.nn as nn
 import torch.nn.functional as F
+from torch import nn
+

 class ClsHead(nn.Module):
    """
@@ -12,17 +12,12 @@ class ClsHead(nn.Module):

    def __init__(self, in_channels, class_dim, **kwargs):
        super(ClsHead, self).__init__()
-        self.training = False
        self.pool = nn.AdaptiveAvgPool2d(1)
-        self.fc = nn.Linear(
-            in_channels,
-            class_dim,
-            bias=True)
+        self.fc = nn.Linear(in_channels, class_dim, bias=True)

    def forward(self, x):
        x = self.pool(x)
        x = torch.reshape(x, shape=[x.shape[0], x.shape[1]])
        x = self.fc(x)
-        if not self.training:
-            x = F.softmax(x, dim=1)
-        return x
\ No newline at end of file
+        x = F.softmax(x, dim=1)
+        return x
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py
-import os, sys
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from pytorchocr.modeling.common import Activation
-from pytorchocr.modeling.backbones.det_mobilenet_v3 import ConvBNLayer
+from ..common import Activation
+from ..backbones.det_mobilenet_v3 import ConvBNLayer

 class Head(nn.Module):
    def __init__(self, in_channels, **kwargs):
@@ -76,13 +75,8 @@ class DBHead(nn.Module):

    def forward(self, x):
        shrink_maps = self.binarize(x)
-        if not self.training:
-            return {'maps': shrink_maps}
+        return {'maps': shrink_maps}

-        threshold_maps = self.thresh(x)
-        binary_maps = self.step_function(shrink_maps, threshold_maps)
-        y = torch.cat([shrink_maps, threshold_maps, binary_maps], dim=1)
-        return {'maps': y}

 class LocalModule(nn.Module):
    def __init__(self, in_c, mid_c, use_distance=True):
@@ -101,7 +95,7 @@ class PFHeadLocal(DBHead):
        super(PFHeadLocal, self).__init__(in_channels, k, **kwargs)
        self.mode = mode

-        self.up_conv = nn.interpolate(scale_factor=2, mode="nearest")
+        self.up_conv = nn.Upsample(scale_factor=2, mode="nearest")
        if self.mode == 'large':
            self.cbn_layer = LocalModule(in_channels // 4, in_channels // 4)
        elif self.mode == 'small':
@@ -112,10 +106,4 @@ class PFHeadLocal(DBHead):
        base_maps = shrink_maps
        cbn_maps = self.cbn_layer(self.up_conv(f), shrink_maps, None)
        cbn_maps = F.sigmoid(cbn_maps)
-        if not self.training:
-            return {'maps': 0.5 * (base_maps + cbn_maps), 'cbn_maps': cbn_maps}
-
-        threshold_maps = self.thresh(x)
-        binary_maps = self.step_function(shrink_maps, threshold_maps)
-        y = torch.cat([cbn_maps, threshold_maps, binary_maps], dim=1)
-        return {'maps': y, 'distance_maps': cbn_maps, 'cbn_maps': binary_maps}
\ No newline at end of file
+        return {'maps': 0.5 * (base_maps + cbn_maps), 'cbn_maps': cbn_maps}
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_east_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_east_head.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import os, sys
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from pytorchocr.modeling.common import Activation
-# import paddle
-# from paddle import nn
-# import paddle.nn.functional as F
-# from paddle import ParamAttr
-
-
-class ConvBNLayer(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride,
-                 padding,
-                 groups=1,
-                 if_act=True,
-                 act=None,
-                 name=None):
-        super(ConvBNLayer, self).__init__()
-        self.if_act = if_act
-        self.act = act
-
-        self.conv = nn.Conv2d(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            groups=groups,
-            bias=False)
-
-        self.bn = nn.BatchNorm2d(
-            out_channels,)
-        self.act = act
-        if act is not None:
-            self._act = Activation(act)
-
-    def forward(self, x):
-        x = self.conv(x)
-        x = self.bn(x)
-        if self.act is not None:
-            x = self._act(x)
-        return x
-
-
-class EASTHead(nn.Module):
-    """
-    """
-    def __init__(self, in_channels, model_name, **kwargs):
-        super(EASTHead, self).__init__()
-        self.model_name = model_name
-        if self.model_name == "large":
-            num_outputs = [128, 64, 1, 8]
-        else:
-            num_outputs = [64, 32, 1, 8]
-
-        self.det_conv1 = ConvBNLayer(
-            in_channels=in_channels,
-            out_channels=num_outputs[0],
-            kernel_size=3,
-            stride=1,
-            padding=1,
-            if_act=True,
-            act='relu',
-            name="det_head1")
-        self.det_conv2 = ConvBNLayer(
-            in_channels=num_outputs[0],
-            out_channels=num_outputs[1],
-            kernel_size=3,
-            stride=1,
-            padding=1,
-            if_act=True,
-            act='relu',
-            name="det_head2")
-        self.score_conv = ConvBNLayer(
-            in_channels=num_outputs[1],
-            out_channels=num_outputs[2],
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            if_act=False,
-            act=None,
-            name="f_score")
-        self.geo_conv = ConvBNLayer(
-            in_channels=num_outputs[1],
-            out_channels=num_outputs[3],
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            if_act=False,
-            act=None,
-            name="f_geo")
-
-    def forward(self, x):
-        f_det = self.det_conv1(x)
-        f_det = self.det_conv2(f_det)
-        f_score = self.score_conv(f_det)
-        f_score = torch.sigmoid(f_score)
-        f_geo = self.geo_conv(f_det)
-        f_geo = (torch.sigmoid(f_geo) - 0.5) * 2 * 800
-
-        pred = {'f_score': f_score, 'f_geo': f_geo}
-        return pred
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_fce_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_fce_head.py
-"""
-This code is refer from:
-https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/dense_heads/fce_head.py
-"""
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-# from paddle import nn
-# from paddle import ParamAttr
-# import paddle.nn.functional as F
-# from paddle.nn.initializer import Normal
-# import paddle
-from functools import partial
-
-
-def multi_apply(func, *args, **kwargs):
-    pfunc = partial(func, **kwargs) if kwargs else func
-    map_results = map(pfunc, *args)
-    return tuple(map(list, zip(*map_results)))
-
-
-class FCEHead(nn.Module):
-    """The class for implementing FCENet head.
-    FCENet(CVPR2021): Fourier Contour Embedding for Arbitrary-shaped Text
-    Detection.
-
-    [https://arxiv.org/abs/2104.10442]
-
-    Args:
-        in_channels (int): The number of input channels.
-        scales (list[int]) : The scale of each layer.
-        fourier_degree (int) : The maximum Fourier transform degree k.
-    """
-
-    def __init__(self, in_channels, fourier_degree=5):
-        super().__init__()
-        assert isinstance(in_channels, int)
-
-        self.downsample_ratio = 1.0
-        self.in_channels = in_channels
-        self.fourier_degree = fourier_degree
-        self.out_channels_cls = 4
-        self.out_channels_reg = (2 * self.fourier_degree + 1) * 2
-
-        self.out_conv_cls = nn.Conv2d(
-            in_channels=self.in_channels,
-            out_channels=self.out_channels_cls,
-            kernel_size=3,
-            stride=1,
-            padding=1,
-            groups=1,
-            bias=True)
-        self.out_conv_reg = nn.Conv2d(
-            in_channels=self.in_channels,
-            out_channels=self.out_channels_reg,
-            kernel_size=3,
-            stride=1,
-            padding=1,
-            groups=1,
-            bias=True)
-
-    def forward(self, feats, targets=None):
-        cls_res, reg_res = multi_apply(self.forward_single, feats)
-        level_num = len(cls_res)
-        outs = {}
-        if not self.training:
-            for i in range(level_num):
-                tr_pred = F.softmax(cls_res[i][:, 0:2, :, :], dim=1)
-                tcl_pred = F.softmax(cls_res[i][:, 2:, :, :], dim=1)
-                outs['level_{}'.format(i)] = torch.cat(
-                    [tr_pred, tcl_pred, reg_res[i]], dim=1)
-        else:
-            preds = [[cls_res[i], reg_res[i]] for i in range(level_num)]
-            outs['levels'] = preds
-        return outs
-
-    def forward_single(self, x):
-        cls_predict = self.out_conv_cls(x)
-        reg_predict = self.out_conv_reg(x)
-        return cls_predict, reg_predict
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_pse_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_pse_head.py
-"""
-This code is refer from:
-https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py
-"""
-
-# from paddle import nn
-from torch import nn
-
-
-class PSEHead(nn.Module):
-    def __init__(self, in_channels, hidden_dim=256, out_channels=7, **kwargs):
-        super(PSEHead, self).__init__()
-        self.conv1 = nn.Conv2d(
-            in_channels, hidden_dim, kernel_size=3, stride=1, padding=1)
-        self.bn1 = nn.BatchNorm2d(hidden_dim)
-        self.relu1 = nn.ReLU()
-
-        self.conv2 = nn.Conv2d(
-            hidden_dim, out_channels, kernel_size=1, stride=1, padding=0)
-
-    def forward(self, x, **kwargs):
-        out = self.conv1(x)
-        out = self.relu1(self.bn1(out))
-        out = self.conv2(out)
-        return {'maps': out}
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_sast_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_sast_head.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import os, sys
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from pytorchocr.modeling.common import Activation
-# import paddle
-# from paddle import nn
-# import paddle.nn.functional as F
-# from paddle import ParamAttr
-
-
-class ConvBNLayer(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride,
-                 groups=1,
-                 if_act=True,
-                 act=None,
-                 name=None):
-        super(ConvBNLayer, self).__init__()
-        self.if_act = if_act
-
-        self.conv = nn.Conv2d(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=(kernel_size - 1) // 2,
-            groups=groups,
-            bias=False)
-
-        self.bn = nn.BatchNorm2d(
-            out_channels,)
-        self.act = act
-        if act is not None:
-            self._act = Activation(act)
-
-    def forward(self, x):
-        x = self.conv(x)
-        x = self.bn(x)
-        if self.act is not None:
-            x = self._act(x)
-        return x
-
-
-class SAST_Header1(nn.Module):
-    """Head producing a score map and a border map from one feature map.
-
-    Two parallel four-layer ``ConvBNLayer`` stacks consume the same input.
-    The score stack ends in 1 channel and is passed through a sigmoid in
-    ``forward``; the border stack ends in 4 channels and is returned as-is.
-
-    Args:
-        in_channels: channel count of the incoming feature map.
-        **kwargs: accepted but not used by this head.
-    """
-
-    def __init__(self, in_channels, **kwargs):
-        super().__init__()
-        widths = (64, 64, 128)
-
-        def make_branch(tag, last_channels):
-            # 1x1 -> 3x3 -> 1x1 (all ReLU), then a 3x3 projection without
-            # activation down to the requested number of output channels.
-            return nn.Sequential(
-                ConvBNLayer(in_channels, widths[0], 1, 1, act='relu', name=tag + '1'),
-                ConvBNLayer(widths[0], widths[1], 3, 1, act='relu', name=tag + '2'),
-                ConvBNLayer(widths[1], widths[2], 1, 1, act='relu', name=tag + '3'),
-                ConvBNLayer(widths[2], last_channels, 3, 1, act=None, name=tag + '4'),
-            )
-
-        # The score branch is created first, then the border branch; the
-        # creation order fixes the order of random weight initialisation.
-        self.score_conv = make_branch('f_score', 1)
-        self.border_conv = make_branch('f_border', 4)
-
-    def forward(self, x):
-        """Return the tuple ``(f_score, f_border)`` computed from ``x``."""
-        score = torch.sigmoid(self.score_conv(x))
-        border = self.border_conv(x)
-        return score, border
-
-
-class SAST_Header2(nn.Module):
-    """Head producing the ``f_tvo`` and ``f_tco`` maps from one feature map.
-
-    Two parallel four-layer ``ConvBNLayer`` stacks consume the same input.
-    The ``tvo`` stack ends in 8 channels and the ``tco`` stack in 2 channels;
-    neither output is passed through a final activation.
-
-    Args:
-        in_channels: channel count of the incoming feature map.
-        **kwargs: accepted but not used by this head.
-    """
-
-    def __init__(self, in_channels, **kwargs):
-        super().__init__()
-        widths = (64, 64, 128)
-
-        def make_branch(tag, last_channels):
-            # 1x1 -> 3x3 -> 1x1 (all ReLU), then a 3x3 projection without
-            # activation down to the requested number of output channels.
-            return nn.Sequential(
-                ConvBNLayer(in_channels, widths[0], 1, 1, act='relu', name=tag + '1'),
-                ConvBNLayer(widths[0], widths[1], 3, 1, act='relu', name=tag + '2'),
-                ConvBNLayer(widths[1], widths[2], 1, 1, act='relu', name=tag + '3'),
-                ConvBNLayer(widths[2], last_channels, 3, 1, act=None, name=tag + '4'),
-            )
-
-        # The tvo branch is created first, then the tco branch; the creation
-        # order fixes the order of random weight initialisation.
-        self.tvo_conv = make_branch('f_tvo', 8)
-        self.tco_conv = make_branch('f_tco', 2)
-
-    def forward(self, x):
-        """Return the tuple ``(f_tvo, f_tco)`` computed from ``x``."""
-        tvo = self.tvo_conv(x)
-        tco = self.tco_conv(x)
-        return tvo, tco
-
-
-class SASTHead(nn.Module):
-    """Combine ``SAST_Header1`` and ``SAST_Header2`` into one prediction head.
-
-    Both sub-heads receive the same input; their four outputs are returned in
-    a dict keyed ``'f_score'``, ``'f_border'``, ``'f_tvo'`` and ``'f_tco'``.
-
-    Args:
-        in_channels: channel count of the incoming feature map.
-        **kwargs: accepted but not used by this head.
-    """
-
-    def __init__(self, in_channels, **kwargs):
-        super().__init__()
-        self.head1 = SAST_Header1(in_channels)
-        self.head2 = SAST_Header2(in_channels)
-
-    def forward(self, x):
-        """Run both sub-heads on ``x`` and collect their outputs in a dict."""
-        score, border = self.head1(x)
-        tvo, tco = self.head2(x)
-        return {
-            'f_score': score,
-            'f_border': border,
-            'f_tvo': tvo,
-            'f_tco': tco,
-        }
\ No newline at end of file
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/e2e_pg_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/e2e_pg_head.py
-
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from pytorchocr.modeling.common import Activation
-
-
-class ConvBNLayer(nn.Module):
-    """Bias-free ``Conv2d`` followed by ``BatchNorm2d`` and an optional activation.
-
-    Args:
-        in_channels: input channels of the convolution.
-        out_channels: output channels of the convolution and batch norm.
-        kernel_size: convolution kernel size.
-        stride: convolution stride.
-        padding: convolution padding.
-        groups: convolution groups.
-        if_act: stored on the instance; ``forward`` does not consult it.
-        act: activation type handed to ``Activation``; ``None`` disables the
-            activation step.
-        name: accepted but not used.
-    """
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride,
-                 padding,
-                 groups=1,
-                 if_act=True,
-                 act=None,
-                 name=None):
-        super().__init__()
-        self.if_act = if_act
-        self.act = act
-
-        self.conv = nn.Conv2d(
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride=stride,
-            padding=padding,
-            groups=groups,
-            bias=False)
-        self.bn = nn.BatchNorm2d(out_channels)
-
-        # The activation module only exists when an activation was requested.
-        if act is not None:
-            self._act = Activation(act_type=act, inplace=True)
-
-    def forward(self, x):
-        """Apply conv, batch norm and (when configured) the activation."""
-        out = self.bn(self.conv(x))
-        if self.act is None:
-            return out
-        return self._act(out)
-
-
-class PGHead(nn.Module):
-    """Prediction head with four parallel branches over one feature map.
-
-    Each branch is a stack of ``ConvBNLayer`` blocks (alternating 1x1 and 3x3
-    kernels, ReLU) followed by a plain bias-free 3x3 ``nn.Conv2d``:
-
-    * score:     3 blocks -> ``conv1`` (1 channel), then sigmoid
-    * border:    3 blocks -> ``conv2`` (4 channels)
-    * character: 5 blocks -> ``conv3`` (37 channels)
-    * direction: 3 blocks -> ``conv4`` (2 channels)
-
-    Args:
-        in_channels: channel count of the incoming feature map.
-        **kwargs: accepted but not used by this head.
-    """
-
-    def __init__(self, in_channels, **kwargs):
-        super().__init__()
-
-        # (attribute prefix, ConvBNLayer output widths, final conv attribute,
-        #  final conv output channels). The "boder" spelling is part of the
-        # attribute names and therefore of the state_dict keys.
-        branch_specs = (
-            ('conv_f_score', (64, 64, 128), 'conv1', 1),
-            ('conv_f_boder', (64, 64, 128), 'conv2', 4),
-            ('conv_f_char', (128, 128, 256, 256, 256), 'conv3', 37),
-            ('conv_f_direc', (64, 64, 128), 'conv4', 2),
-        )
-
-        # Layers are registered branch by branch, block by block, so module
-        # order and random initialisation order follow the table above.
-        for prefix, widths, final_attr, final_channels in branch_specs:
-            channels = in_channels
-            for index, width in enumerate(widths, start=1):
-                # Odd-numbered blocks are 1x1 (padding 0), even ones 3x3
-                # (padding 1).
-                kernel = 1 if index % 2 else 3
-                layer_name = '{}{}'.format(prefix, index)
-                setattr(self, layer_name, ConvBNLayer(
-                    in_channels=channels,
-                    out_channels=width,
-                    kernel_size=kernel,
-                    stride=1,
-                    padding=kernel // 2,
-                    act='relu',
-                    name=layer_name))
-                channels = width
-            setattr(self, final_attr, nn.Conv2d(
-                in_channels=channels,
-                out_channels=final_channels,
-                kernel_size=3,
-                stride=1,
-                padding=1,
-                groups=1,
-                bias=False))
-
-    def forward(self, x):
-        """Run all four branches on ``x`` and return their outputs in a dict.
-
-        Keys: ``'f_score'`` (after sigmoid), ``'f_border'``, ``'f_char'`` and
-        ``'f_direction'``.
-        """
-        score = self.conv_f_score3(self.conv_f_score2(self.conv_f_score1(x)))
-        score = torch.sigmoid(self.conv1(score))
-
-        border = self.conv_f_boder3(self.conv_f_boder2(self.conv_f_boder1(x)))
-        border = self.conv2(border)
-
-        char = self.conv_f_char3(self.conv_f_char2(self.conv_f_char1(x)))
-        char = self.conv_f_char5(self.conv_f_char4(char))
-        char = self.conv3(char)
-
-        direction = self.conv_f_direc3(self.conv_f_direc2(self.conv_f_direc1(x)))
-        direction = self.conv4(direction)
-
-        return {
-            'f_score': score,
-            'f_border': border,
-            'f_char': char,
-            'f_direction': direction,
-        }
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/multiheadAttention.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/multiheadAttention.py
-import torch
-from torch import nn
-import torch.nn.functional as F
-from torch.nn import Linear
-from torch.nn.init import xavier_uniform_
-
-
-class MultiheadAttention(nn.Module):
-    r"""Allows the model to jointly attend to information
-    from different representation subspaces.
-    See reference: Attention Is All You Need
-
-    The query/key/value input projections are implemented as 1x1
-    convolutions (``conv1``/``conv2``/``conv3``); the output projection is a
-    ``Linear`` layer.
-
-    .. math::
-        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
-        \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
-
-    Args:
-        embed_dim: total dimension of the model
-        num_heads: parallel attention layers, or heads
-        dropout: dropout probability applied to the attention weights
-        bias: whether the output projection has a bias term
-        add_bias_kv: accepted but not used
-        add_zero_attn: accepted but not used
-
-    """
-
-    def __init__(self,
-                 embed_dim,
-                 num_heads,
-                 dropout=0.,
-                 bias=True,
-                 add_bias_kv=False,
-                 add_zero_attn=False):
-        super(MultiheadAttention, self).__init__()
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.dropout = dropout
-        self.head_dim = embed_dim // num_heads
-        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
-        self.scaling = self.head_dim**-0.5
-        self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
-        self._reset_parameters()
-        self.conv1 = torch.nn.Conv2d(
-            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
-        self.conv2 = torch.nn.Conv2d(
-            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
-        self.conv3 = torch.nn.Conv2d(
-            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
-
-    def _reset_parameters(self):
-        """Xavier-initialise the output projection weight."""
-        xavier_uniform_(self.out_proj.weight)
-
-    def forward(self,
-                query,
-                key,
-                value,
-                key_padding_mask=None,
-                incremental_state=None,
-                attn_mask=None):
-        """
-        Inputs of forward function
-            query: [target length, batch size, embed dim]
-            key: [sequence length, batch size, embed dim]
-            value: [sequence length, batch size, embed dim]
-            key_padding_mask: optional [batch size, sequence length] mask;
-                non-zero entries mark key positions to be ignored
-            incremental_state: accepted but not used
-            attn_mask: optional additive mask, broadcast over batch and heads
-
-        Outputs of forward function
-            attn_output: [target length, batch size, embed dim]
-        """
-        tgt_len, batch_size = query.shape[0], query.shape[1]
-        src_len = key.shape[0]
-
-        q = self._split_heads(
-            self._in_proj_q(query) * self.scaling, tgt_len, batch_size)
-        k = self._split_heads(self._in_proj_k(key), src_len, batch_size)
-        v = self._split_heads(self._in_proj_v(value), src_len, batch_size)
-
-        if key_padding_mask is not None:
-            assert key_padding_mask.shape[0] == batch_size
-            assert key_padding_mask.shape[1] == src_len
-
-        # [batch, heads, target length, sequence length]
-        attn_output_weights = torch.matmul(q, k.permute(0, 1, 3, 2))
-        if attn_mask is not None:
-            attn_output_weights += attn_mask.unsqueeze(0).unsqueeze(0)
-        if key_padding_mask is not None:
-            # Build the additive mask on the same device as the scores; a
-            # mask created on the default device fails once the model runs on
-            # an accelerator.
-            pad = key_padding_mask.unsqueeze(1).unsqueeze(2).to(
-                device=attn_output_weights.device, dtype=torch.float32)
-            neg_inf = torch.full_like(pad, float("-Inf"))
-            # 0 where the key is kept, -inf where it is padding.
-            attn_output_weights += torch.where(pad == 0., pad, neg_inf)
-
-        # Softmax is evaluated on float32 inputs; float16 scores yield a
-        # float32 result, other dtypes are converted back by ``dtype=``.
-        attn_output_weights = F.softmax(
-            attn_output_weights.type(torch.float32),
-            dim=-1,
-            dtype=torch.float32 if attn_output_weights.dtype == torch.float16
-            else attn_output_weights.dtype)
-        attn_output_weights = F.dropout(
-            attn_output_weights, p=self.dropout, training=self.training)
-
-        # Align dtypes before the matmul (no-op unless the scores were
-        # float16, in which case the softmax above returned float32).
-        attn_output = torch.matmul(attn_output_weights.to(v.dtype), v)
-        attn_output = torch.reshape(
-            attn_output.permute(2, 0, 1, 3),
-            [tgt_len, batch_size, self.embed_dim])
-        return self.out_proj(attn_output)
-
-    def _split_heads(self, x, length, batch_size):
-        """Reshape [length, batch, embed] to [batch, heads, length, head_dim]."""
-        x = torch.reshape(
-            x, (length, batch_size, self.num_heads, self.head_dim))
-        return x.permute(1, 2, 0, 3)
-
-    @staticmethod
-    def _conv_project(conv, x):
-        """Apply a 1x1 conv as a per-position linear map on [len, batch, embed]."""
-        # [len, batch, embed] -> [batch, embed, 1, len] for Conv2d, then back.
-        x = torch.unsqueeze(x.permute(1, 2, 0), dim=2)
-        res = torch.squeeze(conv(x), dim=2)
-        return res.permute(2, 0, 1)
-
-    def _in_proj_q(self, query):
-        """Project the query with ``conv1``."""
-        return self._conv_project(self.conv1, query)
-
-    def _in_proj_k(self, key):
-        """Project the key with ``conv2``."""
-        return self._conv_project(self.conv2, key)
-
-    def _in_proj_v(self, value):
-        """Project the value with ``conv3``."""
-        return self._conv_project(self.conv3, value)
--- a/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_att_head.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_att_head.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from pytorchocr.modeling.common import Activation
-
-
-class AttentionHead(nn.Module):
-    """Attention-based recognition head stepping an ``AttentionGRUCell``.
-
-    Args:
-        in_channels: feature size of each position in the input sequence.
-        out_channels: number of character classes (also the one-hot width).
-        hidden_size: hidden size of the recurrent cell.
-        **kwargs: accepted but not used.
-    """
-
-    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
-        super(AttentionHead, self).__init__()
-        self.input_size = in_channels
-        self.hidden_size = hidden_size
-        self.num_classes = out_channels
-
-        self.attention_cell = AttentionGRUCell(
-            in_channels, hidden_size, out_channels, use_gru=False)
-        self.generator = nn.Linear(hidden_size, out_channels)
-
-    def _char_to_onehot(self, input_char, onehot_dim):
-        """One-hot encode integer class ids into ``onehot_dim`` classes."""
-        return F.one_hot(input_char.type(torch.int64), onehot_dim)
-
-    def forward(self, inputs, targets=None, batch_max_length=25):
-        """Decode ``batch_max_length`` steps and return per-step class scores.
-
-        Args:
-            inputs: encoder features; the attention cell treats them as
-                [batch, sequence length, in_channels].
-            targets: optional [batch, >= batch_max_length] class ids. When
-                given, step ``i`` is fed ``targets[:, i]`` (teacher forcing);
-                otherwise each step is fed the argmax of the previous step,
-                starting from class 0.
-            batch_max_length: number of decoding steps.
-
-        Returns:
-            Tensor [batch, batch_max_length, out_channels] of generator
-            outputs, or ``None`` when decoding greedily for zero steps.
-        """
-        batch_size = inputs.size()[0]
-        num_steps = batch_max_length
-
-        # Allocate the recurrent state next to the features so the head also
-        # works when the model lives on a non-default device.
-        hidden = torch.zeros(
-            (batch_size, self.hidden_size),
-            dtype=inputs.dtype,
-            device=inputs.device)
-
-        if targets is not None:
-            output_hiddens = []
-            for i in range(num_steps):
-                char_onehots = self._char_to_onehot(
-                    targets[:, i], onehot_dim=self.num_classes)
-                (outputs, hidden), _ = self.attention_cell(hidden, inputs,
-                                                           char_onehots)
-                output_hiddens.append(torch.unsqueeze(outputs, dim=1))
-            output = torch.cat(output_hiddens, dim=1)
-            return self.generator(output)
-
-        targets = torch.zeros(
-            [batch_size], dtype=torch.int32, device=inputs.device)
-        step_probs = []
-        for _ in range(num_steps):
-            char_onehots = self._char_to_onehot(
-                targets, onehot_dim=self.num_classes)
-            (outputs, hidden), _ = self.attention_cell(hidden, inputs,
-                                                       char_onehots)
-            probs_step = self.generator(outputs)
-            step_probs.append(probs_step)
-            # Greedy decoding: feed the most likely class into the next step.
-            targets = probs_step.argmax(dim=1)
-
-        # Stack once at the end instead of re-concatenating the growing
-        # result on every step.
-        if not step_probs:
-            return None
-        return torch.stack(step_probs, dim=1)
-
-
-class AttentionGRUCell(nn.Module):
-    """One additive-attention step followed by a ``GRUCell`` update.
-
-    Args:
-        input_size: feature size of each position in ``batch_H``.
-        hidden_size: hidden size of the GRU cell and the attention projections.
-        num_embeddings: width of the one-hot character vector.
-        use_gru: accepted but not used; the cell is always an ``nn.GRUCell``.
-    """
-
-    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
-        super().__init__()
-        self.i2h = nn.Linear(input_size, hidden_size, bias=False)
-        self.h2h = nn.Linear(hidden_size, hidden_size)
-        self.score = nn.Linear(hidden_size, 1, bias=False)
-        self.rnn = nn.GRUCell(
-            input_size + num_embeddings, hidden_size, bias=True)
-        self.hidden_size = hidden_size
-
-    def forward(self, prev_hidden, batch_H, char_onehots):
-        """Attend over ``batch_H`` and advance the GRU by one step.
-
-        Returns ``((hidden, hidden), alpha)``: the new hidden state (twice)
-        and the attention weights with the sequence axis last.
-        """
-        # Broadcast the projected hidden state over the sequence axis.
-        hidden_term = self.h2h(prev_hidden).unsqueeze(1)
-        energy = self.score(torch.tanh(self.i2h(batch_H) + hidden_term))
-
-        # Normalise over the sequence axis, then move that axis last so the
-        # matmul below produces one context vector per batch item.
-        alpha = F.softmax(energy, dim=1).permute(0, 2, 1)
-        context = torch.matmul(alpha, batch_H).squeeze(1)
-
-        rnn_input = torch.cat((context, char_onehots.float()), dim=1)
-        new_hidden = self.rnn(rnn_input, prev_hidden)
-        return (new_hidden, new_hidden), alpha
-
-
-class AttentionLSTM(nn.Module):
-    """Attention-based recognition head stepping an ``AttentionLSTMCell``.
-
-    Args:
-        in_channels: feature size of each position in the input sequence.
-        out_channels: number of character classes (also the one-hot width).
-        hidden_size: hidden size of the recurrent cell.
-        **kwargs: accepted but not used.
-    """
-
-    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
-        super(AttentionLSTM, self).__init__()
-        self.input_size = in_channels
-        self.hidden_size = hidden_size
-        self.num_classes = out_channels
-
-        self.attention_cell = AttentionLSTMCell(
-            in_channels, hidden_size, out_channels, use_gru=False)
-        self.generator = nn.Linear(hidden_size, out_channels)
-
-    def _char_to_onehot(self, input_char, onehot_dim):
-        """One-hot encode integer class ids into ``onehot_dim`` classes."""
-        return F.one_hot(input_char.type(torch.int64), onehot_dim)
-
-    def _initial_state(self, inputs, batch_size):
-        """Return a zero ``(h, c)`` pair on the device/dtype of ``inputs``."""
-        shape = (batch_size, self.hidden_size)
-        return (
-            torch.zeros(shape, dtype=inputs.dtype, device=inputs.device),
-            torch.zeros(shape, dtype=inputs.dtype, device=inputs.device),
-        )
-
-    def forward(self, inputs, targets=None, batch_max_length=25):
-        """Decode ``batch_max_length`` steps and return per-step class scores.
-
-        Args:
-            inputs: encoder features; the attention cell treats them as
-                [batch, sequence length, in_channels].
-            targets: optional [batch, >= batch_max_length] class ids. When
-                given, step ``i`` is fed ``targets[:, i]`` (teacher forcing);
-                otherwise each step is fed the argmax of the previous step,
-                starting from class 0.
-            batch_max_length: number of decoding steps.
-
-        Returns:
-            Tensor [batch, batch_max_length, out_channels] of generator
-            outputs, or ``None`` when decoding greedily for zero steps.
-        """
-        batch_size = inputs.shape[0]
-        num_steps = batch_max_length
-
-        # ``hidden`` is always the (h, c) pair returned by the cell.
-        # torch.nn.LSTMCell returns exactly that pair, so it is fed back
-        # unchanged; indexing into it as if it were (output, (h, c)) would
-        # pick batch rows out of ``c`` and break the next step.
-        hidden = self._initial_state(inputs, batch_size)
-
-        if targets is not None:
-            output_hiddens = []
-            for i in range(num_steps):
-                # one-hot vectors for the i-th char
-                char_onehots = self._char_to_onehot(
-                    targets[:, i], onehot_dim=self.num_classes)
-                hidden, _ = self.attention_cell(hidden, inputs, char_onehots)
-                output_hiddens.append(torch.unsqueeze(hidden[0], dim=1))
-            output = torch.cat(output_hiddens, dim=1)
-            return self.generator(output)
-
-        targets = torch.zeros(
-            [batch_size], dtype=torch.int32, device=inputs.device)
-        step_probs = []
-        for _ in range(num_steps):
-            char_onehots = self._char_to_onehot(
-                targets, onehot_dim=self.num_classes)
-            hidden, _ = self.attention_cell(hidden, inputs, char_onehots)
-            probs_step = self.generator(hidden[0])
-            step_probs.append(probs_step)
-            # Greedy decoding: feed the most likely class into the next step.
-            targets = probs_step.argmax(dim=1)
-
-        # Stack once at the end instead of re-concatenating the growing
-        # result on every step.
-        if not step_probs:
-            return None
-        return torch.stack(step_probs, dim=1)
-
-
-class AttentionLSTMCell(nn.Module):
-    """One additive-attention step followed by a recurrent cell update.
-
-    Args:
-        input_size: feature size of each position in ``batch_H``.
-        hidden_size: hidden size of the cell and the attention projections.
-        num_embeddings: width of the one-hot character vector.
-        use_gru: build an ``nn.GRUCell`` instead of an ``nn.LSTMCell``.
-    """
-
-    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
-        super().__init__()
-        self.i2h = nn.Linear(input_size, hidden_size, bias=False)
-        self.h2h = nn.Linear(hidden_size, hidden_size)
-        self.score = nn.Linear(hidden_size, 1, bias=False)
-
-        cell_cls = nn.GRUCell if use_gru else nn.LSTMCell
-        self.rnn = cell_cls(
-            input_size=input_size + num_embeddings, hidden_size=hidden_size)
-
-        self.hidden_size = hidden_size
-
-    def forward(self, prev_hidden, batch_H, char_onehots):
-        """Attend over ``batch_H`` and advance the recurrent cell by one step.
-
-        ``prev_hidden[0]`` drives the attention; the whole ``prev_hidden`` is
-        passed to the cell as its previous state. Returns the cell's output
-        unchanged together with the attention weights (sequence axis last).
-        """
-        # Broadcast the projected hidden state over the sequence axis.
-        hidden_term = self.h2h(prev_hidden[0]).unsqueeze(1)
-        energy = self.score(torch.tanh(self.i2h(batch_H) + hidden_term))
-
-        # Normalise over the sequence axis, then move that axis last so the
-        # matmul below produces one context vector per batch item.
-        alpha = F.softmax(energy, dim=1).permute(0, 2, 1)
-        context = torch.matmul(alpha, batch_H).squeeze(1)
-
-        rnn_input = torch.cat((context, char_onehots.float()), dim=1)
-        return self.rnn(rnn_input, prev_hidden), alpha