OCRPipe.py 2.34 KB
Newer Older
1
2
from loguru import logger

3
from magic_pdf.config.make_content_config import DropMode, MakeMode
4
from magic_pdf.data.data_reader_writer import DataWriter
5
from magic_pdf.data.dataset import Dataset
赵小蒙's avatar
赵小蒙 committed
6
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
赵小蒙's avatar
赵小蒙 committed
7
8
9
10
11
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_ocr_pdf


class OCRPipe(AbsPipe):
    """Pipeline variant that analyzes and parses a dataset with OCR enabled.

    The analysis step calls ``doc_analyze`` with ``ocr=True`` and the parse
    step calls ``parse_ocr_pdf``. Everything else (construction and the
    output-building methods) is delegated to ``AbsPipe``.
    """

    def __init__(
        self,
        dataset: Dataset,
        model_list: list,
        image_writer: DataWriter,
        is_debug: bool = False,
        start_page_id=0,
        end_page_id=None,
        lang=None,
        layout_model=None,
        formula_enable=None,
        table_enable=None,
    ) -> None:
        """Forward every argument, unchanged and in order, to ``AbsPipe``.

        Args:
            dataset: Dataset to process.
            model_list: Passed through to the base class; this class does not
                read it directly.
            image_writer: Writer later handed to ``parse_ocr_pdf``.
            is_debug: Debug flag later handed to ``parse_ocr_pdf``.
            start_page_id: First page id passed to the analysis/parse calls.
            end_page_id: Last page id passed to the analysis/parse calls.
            lang: Language option passed to the analysis/parse calls.
            layout_model: Layout-model option passed to the analysis/parse
                calls.
            formula_enable: Formula option passed to the analysis/parse calls.
            table_enable: Table option passed to the analysis/parse calls.

        NOTE(review): the methods below read these values back as ``self.*``
        attributes, so ``AbsPipe.__init__`` is presumed to store them under
        the same names -- confirm against the base class.
        """
        super().__init__(
            dataset,
            model_list,
            image_writer,
            is_debug,
            start_page_id,
            end_page_id,
            lang,
            layout_model,
            formula_enable,
            table_enable,
        )

    def pipe_classify(self) -> None:
        """Do nothing; this pipe has no classification step."""
        pass

    def pipe_analyze(self) -> None:
        """Run ``doc_analyze`` with OCR forced on and keep the result.

        The result is stored in ``self.infer_res`` for ``pipe_parse``.
        """
        self.infer_res = doc_analyze(
            self.dataset,
            ocr=True,
            start_page_id=self.start_page_id,
            end_page_id=self.end_page_id,
            lang=self.lang,
            layout_model=self.layout_model,
            formula_enable=self.formula_enable,
            table_enable=self.table_enable,
        )

    def pipe_parse(self) -> None:
        """Run ``parse_ocr_pdf`` on the analysis result and keep the output.

        Reads ``self.infer_res`` (assigned in ``pipe_analyze``) and stores
        the parse output in ``self.pdf_mid_data``.
        """
        self.pdf_mid_data = parse_ocr_pdf(
            self.dataset,
            self.infer_res,
            self.image_writer,
            is_debug=self.is_debug,
            start_page_id=self.start_page_id,
            end_page_id=self.end_page_id,
            lang=self.lang,
            layout_model=self.layout_model,
            formula_enable=self.formula_enable,
            table_enable=self.table_enable,
        )

    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
        """Delegate to ``AbsPipe.pipe_mk_uni_format`` and log completion.

        Args:
            img_parent_path: Forwarded to the base implementation.
            drop_mode: Forwarded to the base implementation.

        Returns:
            Whatever the base implementation returns.
        """
        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
        logger.info('ocr_pipe mk content list finished')
        return result

    def pipe_mk_markdown(
        self,
        img_parent_path: str,
        drop_mode=DropMode.WHOLE_PDF,
        md_make_mode=MakeMode.MM_MD,
    ):
        """Delegate to ``AbsPipe.pipe_mk_markdown`` and log completion.

        Args:
            img_parent_path: Forwarded to the base implementation.
            drop_mode: Forwarded to the base implementation.
            md_make_mode: Forwarded to the base implementation and included
                in the log message.

        Returns:
            Whatever the base implementation returns.
        """
        result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
        logger.info(f'ocr_pipe mk {md_make_mode} finished')
        return result