TXTPipe.py 2.14 KB
Newer Older
1
2
from loguru import logger

3
from magic_pdf.config.make_content_config import DropMode, MakeMode
4
from magic_pdf.data.data_reader_writer import DataWriter
5
from magic_pdf.data.dataset import Dataset
赵小蒙's avatar
赵小蒙 committed
6
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
赵小蒙's avatar
赵小蒙 committed
7
8
9
10
11
12
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_txt_pdf


class TXTPipe(AbsPipe):
    """Pipeline variant that analyzes with OCR disabled and parses via ``parse_txt_pdf``.

    All constructor arguments are forwarded unchanged to ``AbsPipe.__init__``.
    The methods below read ``self.dataset``, ``self.model_list``,
    ``self.image_writer``, ``self.is_debug``, ``self.start_page_id``,
    ``self.end_page_id``, ``self.lang``, ``self.layout_model``,
    ``self.formula_enable`` and ``self.table_enable``.
    NOTE(review): these attributes are presumably populated by
    ``AbsPipe.__init__`` -- confirm against the base class.
    """

    def __init__(
        self,
        dataset: Dataset,
        model_list: list,
        image_writer: DataWriter,
        is_debug: bool = False,
        start_page_id=0,
        end_page_id=None,
        lang=None,
        layout_model=None,
        formula_enable=None,
        table_enable=None,
    ) -> None:
        """Forward every argument, positionally and in order, to the base class."""
        super().__init__(
            dataset,
            model_list,
            image_writer,
            is_debug,
            start_page_id,
            end_page_id,
            lang,
            layout_model,
            formula_enable,
            table_enable,
        )

    def pipe_classify(self) -> None:
        """Do nothing; this pipe performs no classification step."""
        pass

    def pipe_analyze(self) -> None:
        """Run ``doc_analyze`` with ``ocr=False`` and store the result in ``self.model_list``."""
        self.model_list = doc_analyze(
            self.dataset,
            ocr=False,
            start_page_id=self.start_page_id,
            end_page_id=self.end_page_id,
            lang=self.lang,
            layout_model=self.layout_model,
            formula_enable=self.formula_enable,
            table_enable=self.table_enable,
        )

    def pipe_parse(self) -> None:
        """Run ``parse_txt_pdf`` and store the result in ``self.pdf_mid_data``."""
        self.pdf_mid_data = parse_txt_pdf(
            self.dataset,
            self.model_list,
            self.image_writer,
            is_debug=self.is_debug,
            start_page_id=self.start_page_id,
            end_page_id=self.end_page_id,
            lang=self.lang,
            layout_model=self.layout_model,
            formula_enable=self.formula_enable,
            table_enable=self.table_enable,
        )

    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
        """Delegate to the base implementation, log completion, and return its result."""
        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
        logger.info('txt_pipe mk content list finished')
        return result

    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
        """Delegate to the base implementation, log the make mode used, and return its result."""
        result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
        logger.info(f'txt_pipe mk {md_make_mode} finished')
        return result