import copy
import json
import os
from typing import Callable, Optional

from magic_pdf.config.constants import PARSE_TYPE_OCR, PARSE_TYPE_TXT
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.filter import classify
from magic_pdf.libs.draw_bbox import draw_model_bbox
from magic_pdf.libs.version import __version__
from magic_pdf.model import InferenceResultBase
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
from magic_pdf.pipe.operators import PipeResult


class InferenceResult(InferenceResultBase):
    """Model inference result bound to the dataset it was generated from.

    Offers helpers to visualise and dump the raw inference result, and to
    post-process it into a ``PipeResult`` in text, OCR or auto-detected mode.
    """

    def __init__(self, inference_results: list, dataset: Dataset):
        """Initialized method.

        Args:
            inference_results (list): the inference result generated by model
            dataset (Dataset): the dataset related with model inference result
        """
        self._infer_res = inference_results
        self._dataset = dataset

    def draw_model(self, file_path: str) -> None:
        """Draw model inference result.

        The parent directory of ``file_path`` is created when it does not
        exist yet. A bare file name (no directory component) is treated as
        relative to the current working directory.

        Args:
            file_path (str): the output file path
        """
        # os.path.dirname() yields '' for a bare file name and
        # os.makedirs('') raises FileNotFoundError, so fall back to '.'.
        dir_name = os.path.dirname(file_path) or os.curdir
        base_name = os.path.basename(file_path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name, exist_ok=True)
        # Hand over a deep copy so the stored result cannot be modified by
        # the drawing routine.
        draw_model_bbox(
            copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name
        )

    def dump_model(self, writer: DataWriter, file_path: str):
        """Dump model inference result to file.

        The result is serialised as JSON with 4-space indentation and
        non-ASCII characters kept as-is.

        Args:
            writer (DataWriter): writer handle
            file_path (str): the location of target file
        """
        writer.write_string(
            file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4)
        )

    def get_infer_res(self):
        """Get the inference result.

        Returns:
            list: the inference result generated by model (the stored object
                itself, not a copy)
        """
        return self._infer_res

    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to a deep copy of the inference result.

        Args:
            proc (Callable): invoke proc as follows:
                proc(inference_result, *args, **kwargs)

        Returns:
            Any: return the result generated by proc
        """
        return proc(copy.deepcopy(self._infer_res), *args, **kwargs)

    def pipe_auto_mode(
        self,
        imageWriter: DataWriter,
        start_page_id: int = 0,
        end_page_id: Optional[int] = None,
        debug_mode: bool = False,
        lang: Optional[str] = None,
    ) -> PipeResult:
        """Post-proc the model inference result.
            step1: classify the dataset type
            step2: based on the result of step1, use `pipe_txt_mode` or `pipe_ocr_mode`

        Args:
            imageWriter (DataWriter): the image writer handle
            start_page_id (int, optional): Defaults to 0. Lets the user select the pages they want to process
            end_page_id (int, optional): Defaults to the last page index of dataset. Lets the user select the pages they want to process
            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
            lang (str, optional): Defaults to None.

        Returns:
            PipeResult: the result
        """
        pdf_proc_method = classify(self._dataset.data_bits())

        # Anything that is not classified as TXT is handled by the OCR path.
        if pdf_proc_method == SupportedPdfParseMethod.TXT:
            return self.pipe_txt_mode(
                imageWriter, start_page_id, end_page_id, debug_mode, lang
            )
        return self.pipe_ocr_mode(
            imageWriter, start_page_id, end_page_id, debug_mode, lang
        )

    def pipe_txt_mode(
        self,
        imageWriter: DataWriter,
        start_page_id: int = 0,
        end_page_id: Optional[int] = None,
        debug_mode: bool = False,
        lang: Optional[str] = None,
    ) -> PipeResult:
        """Post-proc the model inference result, extracting the text with a
        third-party library, such as `pymupdf`.

        Args:
            imageWriter (DataWriter): the image writer handle
            start_page_id (int, optional): Defaults to 0. Lets the user select the pages they want to process
            end_page_id (int, optional): Defaults to the last page index of dataset. Lets the user select the pages they want to process
            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
            lang (str, optional): Defaults to None.

        Returns:
            PipeResult: the result
        """
        return self._pipe_with_parse_method(
            SupportedPdfParseMethod.TXT,
            PARSE_TYPE_TXT,
            imageWriter,
            start_page_id,
            end_page_id,
            debug_mode,
            lang,
        )

    def pipe_ocr_mode(
        self,
        imageWriter: DataWriter,
        start_page_id: int = 0,
        end_page_id: Optional[int] = None,
        debug_mode: bool = False,
        lang: Optional[str] = None,
    ) -> PipeResult:
        """Post-proc the model inference result, extracting the text with the
        `OCR` technique.

        Args:
            imageWriter (DataWriter): the image writer handle
            start_page_id (int, optional): Defaults to 0. Lets the user select the pages they want to process
            end_page_id (int, optional): Defaults to the last page index of dataset. Lets the user select the pages they want to process
            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
            lang (str, optional): Defaults to None.

        Returns:
            PipeResult: the result
        """
        return self._pipe_with_parse_method(
            SupportedPdfParseMethod.OCR,
            PARSE_TYPE_OCR,
            imageWriter,
            start_page_id,
            end_page_id,
            debug_mode,
            lang,
        )

    def _pipe_with_parse_method(
        self,
        parse_method: SupportedPdfParseMethod,
        parse_type,
        image_writer: DataWriter,
        start_page_id: int,
        end_page_id: Optional[int],
        debug_mode: bool,
        lang: Optional[str],
    ) -> PipeResult:
        """Run `pdf_parse_union` with one parse method and wrap its output.

        Shared implementation behind `pipe_txt_mode` and `pipe_ocr_mode`,
        which differ only in the parse method handed to `pdf_parse_union`
        and the `_parse_type` tag stored in the result.

        Args:
            parse_method (SupportedPdfParseMethod): method forwarded to `pdf_parse_union`
            parse_type: value stored under the `_parse_type` key of the result
            image_writer (DataWriter): the image writer handle
            start_page_id (int): first page to process
            end_page_id (int, optional): last page to process, None is forwarded unchanged
            debug_mode (bool): will dump more log if enabled
            lang (str, optional): language hint, recorded in the result when not None

        Returns:
            PipeResult: the result
        """

        def proc(*args, **kwargs) -> PipeResult:
            res = pdf_parse_union(*args, **kwargs)
            # Tag the parse output with how and by which version it was made.
            res['_parse_type'] = parse_type
            res['_version_name'] = __version__
            if kwargs.get('lang') is not None:
                res['lang'] = kwargs['lang']
            return PipeResult(res, self._dataset)

        # `apply` prepends a deep copy of the inference result, so proc is
        # invoked as proc(infer_res, dataset, image_writer, parse_method, ...).
        return self.apply(
            proc,
            self._dataset,
            image_writer,
            parse_method,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
            debug_mode=debug_mode,
            lang=lang,
        )