AbsPipe.py 3.7 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
from abc import ABC, abstractmethod

赵小蒙's avatar
赵小蒙 committed
3
from magic_pdf.dict2md.ocr_mkcontent import union_make
赵小蒙's avatar
赵小蒙 committed
4
5
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
6
from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode
kernel.h@qq.com's avatar
kernel.h@qq.com committed
7
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
赵小蒙's avatar
赵小蒙 committed
8
9
10
11
12
13
14
15
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor


class AbsPipe(ABC):
    """
    Abstract base class shared by the txt and ocr processing pipelines.

    An instance holds the raw PDF bytes, the model output list and an image
    writer, plus the (uncompressed) intermediate parse result in
    ``pdf_mid_data``.  Subclasses implement the stateful ``pipe_*`` steps;
    the static helpers below are stateless and work on explicit arguments.
    """

    # Pipeline type identifiers returned by ``classify``.
    PIP_OCR = "ocr"
    PIP_TXT = "txt"

    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
        """
        Store the pipeline inputs; no processing happens here.

        :param pdf_bytes: raw content of the PDF file
        :param model_list: model output for the PDF (structure not visible
            in this file)
        :param image_writer: writer object kept for use by subclasses
        :param is_debug: debug flag kept for use by subclasses
        """
        self.pdf_bytes = pdf_bytes
        self.model_list = model_list
        self.image_writer = image_writer
        self.pdf_mid_data = None  # uncompressed intermediate data; None until set
        self.is_debug = is_debug

    def get_compress_pdf_mid_data(self):
        """
        Return ``self.pdf_mid_data`` passed through ``JsonCompressor.compress_json``.
        """
        return JsonCompressor.compress_json(self.pdf_mid_data)

    @abstractmethod
    def pipe_classify(self):
        """
        Stateful classification step; must be implemented by subclasses.
        """
        raise NotImplementedError

    @abstractmethod
    def pipe_parse(self):
        """
        Stateful parsing step; must be implemented by subclasses.
        """
        raise NotImplementedError

    @abstractmethod
    def pipe_mk_uni_format(self, img_parent_path, drop_mode):
        """
        Stateful assembly of the unified format; must be implemented by subclasses.
        """
        raise NotImplementedError

    @abstractmethod
    def pipe_mk_markdown(self, img_parent_path, drop_mode):
        """
        Stateful assembly of markdown; must be implemented by subclasses.
        """
        raise NotImplementedError

    @staticmethod
    def classify(pdf_bytes: bytes) -> str:
        """
        Decide from the PDF metadata whether this is a text PDF or an OCR PDF.

        :param pdf_bytes: raw content of the PDF file
        :return: ``AbsPipe.PIP_TXT`` for a text PDF, otherwise ``AbsPipe.PIP_OCR``
        :raises Exception: if the metadata scan flags the PDF as to-be-dropped,
            or if the PDF is encrypted / needs a password
        """
        pdf_meta = pdf_meta_scan(pdf_bytes)

        # The scan returned a "need drop" flag: refuse to process this PDF.
        if pdf_meta.get("_need_drop", False):
            raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")

        # Encrypted or password-protected PDFs are not processed.
        is_encrypted = pdf_meta["is_encrypted"]
        is_needs_password = pdf_meta["is_needs_password"]
        if is_encrypted or is_needs_password:
            raise Exception(f"pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}")

        # ``classify`` here resolves to the module-level function imported from
        # pdf_classify_by_type (class attributes are not in a method's name
        # scope), not to this static method.  Its second return value is unused.
        is_text_pdf, _ = classify(
            pdf_meta["total_page"],
            pdf_meta["page_width_pts"],
            pdf_meta["page_height_pts"],
            pdf_meta["image_info_per_page"],
            pdf_meta["text_len_per_page"],
            pdf_meta["imgs_per_page"],
            pdf_meta["text_layout_per_page"],
        )
        return AbsPipe.PIP_TXT if is_text_pdf else AbsPipe.PIP_OCR

    @staticmethod
    def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
        """
        Build the unified-format content_list from compressed intermediate data.

        :param compressed_pdf_mid_data: intermediate data as accepted by
            ``JsonCompressor.decompress_json``; must contain a ``"pdf_info"`` key
        :param img_buket_path: image path argument forwarded to ``union_make``
        :param drop_mode: drop mode forwarded to ``union_make``
        :return: the result of ``union_make`` in ``MakeMode.STANDARD_FORMAT``
        """
        pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
        pdf_info_list = pdf_mid_data["pdf_info"]
        return union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)

    @staticmethod
    def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
        """
        Build markdown content from compressed intermediate data.

        :param compressed_pdf_mid_data: intermediate data as accepted by
            ``JsonCompressor.decompress_json``; must contain a ``"pdf_info"`` key
        :param img_buket_path: image path argument forwarded to ``union_make``
        :param drop_mode: drop mode forwarded to ``union_make``
        :return: the result of ``union_make`` in ``MakeMode.MM_MD``
        """
        # NOTE(review): the return annotation says ``list``; confirm what
        # union_make actually returns for MakeMode.MM_MD.
        pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
        pdf_info_list = pdf_mid_data["pdf_info"]
        return union_make(pdf_info_list, MakeMode.MM_MD, drop_mode, img_buket_path)