AbsPipe.py 3.84 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
from abc import ABC, abstractmethod

赵小蒙's avatar
赵小蒙 committed
3
from magic_pdf.dict2md.ocr_mkcontent import union_make
赵小蒙's avatar
赵小蒙 committed
4
5
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
6
from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode
kernel.h@qq.com's avatar
kernel.h@qq.com committed
7
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
赵小蒙's avatar
赵小蒙 committed
8
9
10
11
12
13
14
15
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor


class AbsPipe(ABC):
    """
    Abstract base class shared by the txt and ocr processing pipes.

    Subclasses implement the stateful ``pipe_*`` steps; the static helpers
    below work on explicit arguments and do not need an instance.
    """

    PIP_OCR = "ocr"
    PIP_TXT = "txt"

    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
        """
        Store the inputs of a pipe run.

        :param pdf_bytes: raw bytes of the PDF.
        :param model_list: model output list kept for the subclass steps.
        :param image_writer: writer object kept for the subclass steps.
        :param is_debug: debug flag kept on the instance.
        """
        self.pdf_bytes = pdf_bytes
        self.model_list = model_list
        self.image_writer = image_writer
        self.is_debug = is_debug
        # Intermediate data, kept uncompressed; starts out empty.
        self.pdf_mid_data = None

    def get_compress_pdf_mid_data(self):
        """Return ``self.pdf_mid_data`` passed through ``JsonCompressor.compress_json``."""
        uncompressed = self.pdf_mid_data
        return JsonCompressor.compress_json(uncompressed)

    @abstractmethod
    def pipe_classify(self):
        """
        Stateful classification step.
        """
        raise NotImplementedError

    @abstractmethod
    def pipe_analyze(self):
        """
        Stateful model-analysis step.
        """
        raise NotImplementedError

    @abstractmethod
    def pipe_parse(self):
        """
        Stateful parsing step.
        """
        raise NotImplementedError

    @abstractmethod
    def pipe_mk_uni_format(self, img_parent_path, drop_mode):
        """
        Stateful assembly of the unified format.
        """
        raise NotImplementedError

    @abstractmethod
    def pipe_mk_markdown(self, img_parent_path, drop_mode):
        """
        Stateful assembly of markdown.
        """
        raise NotImplementedError

    @staticmethod
    def classify(pdf_bytes: bytes) -> str:
        """
        Decide from the PDF's metadata whether it is a text PDF or an ocr PDF.

        :param pdf_bytes: raw bytes of the PDF.
        :return: ``AbsPipe.PIP_TXT`` or ``AbsPipe.PIP_OCR``.
        :raises Exception: when the metadata scan flags the PDF to be dropped,
            or when the PDF is encrypted / needs a password.
        """
        meta = pdf_meta_scan(pdf_bytes)

        # The scan itself asked for this PDF to be dropped.
        if meta.get("_need_drop", False):
            raise Exception(f"pdf meta_scan need_drop,reason is {meta['_drop_reason']}")

        # Encrypted or password-protected PDFs are not processed.
        # NOTE(review): the original comment also listed page-less PDFs here;
        # presumably the drop flag above covers them -- confirm.
        encrypted = meta["is_encrypted"]
        needs_password = meta["is_needs_password"]
        if encrypted or needs_password:
            raise Exception(f"pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}")

        # Metadata entries handed to the classifier, in positional order.
        classifier_fields = (
            "total_page",
            "page_width_pts",
            "page_height_pts",
            "image_info_per_page",
            "text_len_per_page",
            "imgs_per_page",
            "text_layout_per_page",
        )
        # The second element of the classifier's result is not used here.
        is_text_pdf, _ = classify(*(meta[field] for field in classifier_fields))
        return AbsPipe.PIP_TXT if is_text_pdf else AbsPipe.PIP_OCR

    @staticmethod
    def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
        """
        Build the unified-format content_list from compressed intermediate data.

        :param compressed_pdf_mid_data: intermediate data as accepted by
            ``JsonCompressor.decompress_json``.
        :param img_buket_path: image path argument forwarded to ``union_make``.
        :param drop_mode: drop mode forwarded to ``union_make``.
        :return: the result of ``union_make`` in ``MakeMode.STANDARD_FORMAT``.
        """
        pdf_info_list = JsonCompressor.decompress_json(compressed_pdf_mid_data)["pdf_info"]
        return union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)

    @staticmethod
    def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
        """
        Build markdown content from compressed intermediate data.

        :param compressed_pdf_mid_data: intermediate data as accepted by
            ``JsonCompressor.decompress_json``.
        :param img_buket_path: image path argument forwarded to ``union_make``.
        :param drop_mode: drop mode forwarded to ``union_make``.
        :return: the result of ``union_make`` in ``MakeMode.MM_MD``.
        """
        pdf_info_list = JsonCompressor.decompress_json(compressed_pdf_mid_data)["pdf_info"]
        return union_make(pdf_info_list, MakeMode.MM_MD, drop_mode, img_buket_path)