user_api.py 4.11 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14

"""
用户输入:
    model数组,每个元素代表一个页面
    pdf在s3的路径
    截图保存的s3位置

然后:
    1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
    2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter

其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!

"""
15
16
import re

赵小蒙's avatar
赵小蒙 committed
17
18
from loguru import logger

赵小蒙's avatar
赵小蒙 committed
19
from magic_pdf.libs.version import __version__
kernel.h@qq.com's avatar
kernel.h@qq.com committed
20
from magic_pdf.rw import AbsReaderWriter
21
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
许瑞's avatar
许瑞 committed
22
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
赵小蒙's avatar
赵小蒙 committed
23

kernel.h@qq.com's avatar
kernel.h@qq.com committed
24
25
26
# Values stored under the "_parse_type" key of a parse result to record which
# pipeline produced it.
PARSE_TYPE_TXT = "txt"  # result came from parse_pdf_by_txt
PARSE_TYPE_OCR = "ocr"  # result came from parse_pdf_by_ocr

赵小蒙's avatar
赵小蒙 committed
27
28
29
30
31
32
33
34
35
36
37
38
39
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
                  **kwargs):
    """
    Parse a text-based PDF with the text pipeline (``parse_pdf_by_txt``).

    Args:
        pdf_bytes: raw bytes of the PDF, forwarded unchanged to the parser.
        pdf_models: model output, one element per page (per the module
            docstring), forwarded unchanged to the parser.
        imageWriter: writer object forwarded unchanged to the parser.
        is_debug: forwarded to the parser as ``debug_mode``.
        start_page: forwarded to the parser as ``start_page_id``.
        *args, **kwargs: accepted for call compatibility; not used here.

    Returns:
        The parser's result with "_parse_type" set to PARSE_TYPE_TXT and
        "_version_name" set to the package version.
    """
    parser_options = {
        "start_page_id": start_page,
        "debug_mode": is_debug,
    }
    result = parse_pdf_by_txt(pdf_bytes, pdf_models, imageWriter, **parser_options)

    # Tag the result with the pipeline that produced it and the package version.
    result["_parse_type"] = PARSE_TYPE_TXT
    result["_version_name"] = __version__
    return result


def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
                  **kwargs):
    """
    Parse an OCR-type PDF with the OCR pipeline (``parse_pdf_by_ocr``).

    Args:
        pdf_bytes: raw bytes of the PDF, forwarded unchanged to the parser.
        pdf_models: model output, one element per page (per the module
            docstring), forwarded unchanged to the parser.
        imageWriter: writer object forwarded unchanged to the parser.
        is_debug: forwarded to the parser as ``debug_mode``.
        start_page: forwarded to the parser as ``start_page_id``.
        *args, **kwargs: accepted for call compatibility; not used here.

    Returns:
        The parser's result with "_parse_type" set to PARSE_TYPE_OCR and
        "_version_name" set to the package version.
    """
    parser_options = {
        "start_page_id": start_page,
        "debug_mode": is_debug,
    }
    result = parse_pdf_by_ocr(pdf_bytes, pdf_models, imageWriter, **parser_options)

    # Tag the result with the pipeline that produced it and the package version.
    result["_parse_type"] = PARSE_TYPE_OCR
    result["_version_name"] = __version__
    return result


def _collect_paragraph_text(pdf_info_dict):
    """Join the 'content' of every span found in 'title' and 'text' paragraph blocks."""
    pieces = []
    for page_dict in pdf_info_dict['pdf_info']:
        for para_block in page_dict['para_blocks']:
            if para_block['type'] not in ('title', 'text'):
                continue
            for line in para_block['lines']:
                pieces.extend(span['content'] for span in line['spans'])
    return "".join(pieces)


def _not_printable_rate(text):
    """Return the share of characters in ``text`` for which ``str.isprintable()`` is false."""
    total = len(text)
    if total == 0:
        return 0  # avoid dividing by zero on empty text
    not_printable = sum(1 for c in text if not c.isprintable())
    return not_printable / total


def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
                    *args, **kwargs):
    """
    Parse a PDF that may mix OCR and text content: try the text pipeline first
    and fall back to the OCR pipeline when the text result is unusable.

    The text result is considered unusable when any of these holds:
      * ``parse_pdf_by_txt`` raised (the exception is logged),
      * the result carries a truthy "_need_drop" flag,
      * more than 10% of the characters extracted from its 'title'/'text'
        blocks are not printable (garbled text).

    Args:
        pdf_bytes: raw bytes of the PDF, forwarded unchanged to the parsers.
        pdf_models: model output, one element per page (per the module
            docstring), forwarded unchanged to the parsers.
        imageWriter: writer object forwarded unchanged to the parsers.
        is_debug: forwarded to the parsers as ``debug_mode``.
        start_page: forwarded to the parsers as ``start_page_id``.
        *args, **kwargs: accepted for call compatibility; not used here.

    Returns:
        The result of whichever pipeline was kept, with "_parse_type" set to
        PARSE_TYPE_TXT or PARSE_TYPE_OCR and "_version_name" set to the
        package version.

    Raises:
        Exception: if the OCR fallback is needed and it fails as well.
    """

    def parse_pdf(method):
        # Run one pipeline; a failure is logged and reported as None so the
        # caller can decide whether to fall back.
        try:
            return method(
                pdf_bytes,
                pdf_models,
                imageWriter,
                start_page_id=start_page,
                debug_mode=is_debug,
            )
        except Exception as e:
            logger.exception(e)
            return None

    pdf_info_dict = parse_pdf(parse_pdf_by_txt)

    # Check for a failed or dropped result BEFORE touching its contents:
    # a failed text parse yields None, which must trigger the OCR fallback
    # instead of crashing while the text is being collected.
    need_ocr = pdf_info_dict is None or pdf_info_dict.get("_need_drop", False)
    if not need_ocr:
        # On garbled test PDFs the rate of "not common" characters was > 0.95
        # and the not-printable rate was > 0.15. The "not common" character
        # heuristic can misfire on less common languages, while the
        # not-printable rate is friendlier to them, so only the latter is used.
        text_all = _collect_paragraph_text(pdf_info_dict)
        need_ocr = _not_printable_rate(text_all) > 0.1

    if need_ocr:
        logger.warning("parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
        if pdf_info_dict is None:
            raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
        pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
    else:
        pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT

    pdf_info_dict["_version_name"] = __version__

    return pdf_info_dict