user_api.py 2.6 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

"""
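User input:
    an array of models, one element per page
    the S3 path of the PDF
    the S3 location where screenshots are saved

Then:
    1) Based on the S3 path, call the Spark cluster API to obtain ak, sk and endpoint, and construct the s3PDFReader
    2) Based on the S3 address supplied by the user, call the Spark cluster API to obtain ak, sk and endpoint, and construct the s3ImageWriter

Everything else (constructing the s3cli, obtaining ak/sk) is implemented in code-clean. Do not introduce a reverse dependency!!!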

"""
from loguru import logger

kernel.h@qq.com's avatar
kernel.h@qq.com committed
17
from magic_pdf.rw import AbsReaderWriter
赵小蒙's avatar
赵小蒙 committed
18
19
20
21
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt


kernel.h@qq.com's avatar
kernel.h@qq.com committed
22
23
24
# Values stored under the "_parse_type" key of a parse result to record
# which parser (text or OCR) produced it.
PARSE_TYPE_TXT = "txt"
PARSE_TYPE_OCR = "ocr"

赵小蒙's avatar
赵小蒙 committed
25
26
27
28
29
30
31
32
33
34
35
36
37
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
                  **kwargs):
    """
    Parse a text-type PDF.

    Runs ``parse_pdf_by_txt`` on ``pdf_bytes``, ``pdf_models`` and
    ``imageWriter``, forwarding ``start_page`` as ``start_page_id`` and
    ``is_debug`` as ``debug_mode``. Extra positional and keyword arguments
    are accepted but not forwarded.

    Returns the parser's result with ``"_parse_type"`` set to
    ``PARSE_TYPE_TXT``.
    """
    parser_options = {"start_page_id": start_page, "debug_mode": is_debug}
    parsed = parse_pdf_by_txt(pdf_bytes, pdf_models, imageWriter, **parser_options)

    # Record which parser produced this result.
    parsed["_parse_type"] = PARSE_TYPE_TXT
    return parsed


def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
                  **kwargs):
    """
    Parse an OCR-type PDF.

    Runs ``parse_pdf_by_ocr`` on ``pdf_bytes``, ``pdf_models`` and
    ``imageWriter``, forwarding ``start_page`` as ``start_page_id`` and
    ``is_debug`` as ``debug_mode``. Extra positional and keyword arguments
    are accepted but not forwarded.

    Returns the parser's result with ``"_parse_type"`` set to
    ``PARSE_TYPE_OCR``.
    """
    parser_options = {"start_page_id": start_page, "debug_mode": is_debug}
    parsed = parse_pdf_by_ocr(pdf_bytes, pdf_models, imageWriter, **parser_options)

    # Record which parser produced this result.
    parsed["_parse_type"] = PARSE_TYPE_OCR
    return parsed


def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
                    *args, **kwargs):
    """
    Parse a PDF that mixes OCR and text content, parsing all of it.

    The text parser (``parse_pdf_by_txt``) is tried first. If it raises, or
    its result carries a truthy ``"_need_drop"`` entry, the OCR parser
    (``parse_pdf_by_ocr``) is tried instead.

    Args:
        pdf_bytes: Raw bytes of the PDF, passed to the parser unchanged.
        pdf_models: Model output list, passed to the parser unchanged.
        imageWriter: Writer object, passed to the parser unchanged.
        is_debug: Forwarded to the parser as ``debug_mode``.
        start_page: Forwarded to the parser as ``start_page_id``.
        *args, **kwargs: Accepted for call compatibility; not forwarded.

    Returns:
        The result of whichever parser was used, with ``"_parse_type"`` set
        to ``PARSE_TYPE_TXT`` or ``PARSE_TYPE_OCR`` accordingly.

    Raises:
        Exception: If the text parser failed or was dropped and the OCR
            parser also raised or returned ``None``. The most recent parser
            exception, if any, is attached as the cause.
    """
    last_error = None

    def parse_pdf(method):
        """Run one parser; log and remember any exception, returning None."""
        nonlocal last_error
        try:
            return method(
                pdf_bytes,
                pdf_models,
                imageWriter,
                start_page_id=start_page,
                debug_mode=is_debug,
            )
        except Exception as e:
            # logger.exception keeps the traceback, which logger.error dropped.
            logger.exception(f"{method.__name__} error: {e}")
            last_error = e
            return None

    pdf_info_dict = parse_pdf(parse_pdf_by_txt)

    # Text parsing succeeded and was not flagged for dropping: done.
    if pdf_info_dict is not None and not pdf_info_dict.get("_need_drop", False):
        pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
        return pdf_info_dict

    logger.warning("parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
    pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
    if pdf_info_dict is None:
        # Chain the underlying parser failure so callers can see the root cause.
        raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.") from last_error

    pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
    return pdf_info_dict