pdf_parse_by_ocr.py 8.77 KB
Newer Older
1
import time
赵小蒙's avatar
赵小蒙 committed
2
from loguru import logger
许瑞's avatar
许瑞 committed
3
4
5
6
7
from magic_pdf.libs.commons import (
    fitz,
    get_delta_time,
    get_docx_model_output,
)
赵小蒙's avatar
赵小蒙 committed
8
from magic_pdf.libs.convert_utils import dict_to_list
9
from magic_pdf.libs.coordinate_transform import get_scale_ratio
10
from magic_pdf.libs.drop_tag import DropTag
11
from magic_pdf.libs.hash_utils import compute_md5
赵小蒙's avatar
赵小蒙 committed
12
from magic_pdf.libs.ocr_content_type import ContentType
xuchao's avatar
xuchao committed
13
from magic_pdf.para.para_split import para_split
14
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component
15
16
17
18
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
from magic_pdf.pre_proc.detect_header import parse_headers
from magic_pdf.pre_proc.detect_page_number import parse_pageNos
赵小蒙's avatar
赵小蒙 committed
19
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
20
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
许瑞's avatar
许瑞 committed
21
from magic_pdf.pre_proc.ocr_dict_merge import (
赵小蒙's avatar
赵小蒙 committed
22
    merge_spans_to_line_by_layout, merge_lines_to_block,
许瑞's avatar
许瑞 committed
23
)
24
from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \
赵小蒙's avatar
赵小蒙 committed
25
26
    adjust_bbox_for_standalone_block, modify_y_axis, modify_inline_equation, get_qa_need_list, \
    remove_spans_by_bboxes_dict
许瑞's avatar
许瑞 committed
27
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
赵小蒙's avatar
赵小蒙 committed
28

赵小蒙's avatar
赵小蒙 committed
29

赵小蒙's avatar
赵小蒙 committed
30
def _build_spans(layout_dets, horizontal_scale_ratio, vertical_scale_ratio):
    """Convert the model's layout detections of one page into span dicts.

    Only detections whose ``category_id`` is one of the span categories are
    kept. According to the category table documented in the original code:

    * dropped elsewhere: 3 header, 4 page number, 5 footnote, 6 footer
    * turned into spans here: 1 image, 7 table, 13 inline equation,
      14 interline equation, 15 OCR text
    * layout information only: 11 full column, 12 sub column

    Args:
        layout_dets: iterable of detection dicts; each must provide
            ``"category_id"`` and, for span categories, an 8-value ``"poly"``.
            Equation detections must provide ``"latex"`` and text detections
            ``"text"``.
        horizontal_scale_ratio: divisor applied to x coordinates.
        vertical_scale_ratio: divisor applied to y coordinates.

    Returns:
        A list of span dicts with ``"bbox"``, ``"type"`` and, for equations
        and text, ``"content"``.
    """
    # category_id -> (span type, detection key holding the content or None)
    span_categories = {
        1: (ContentType.Image, None),
        7: (ContentType.Table, None),
        13: (ContentType.InlineEquation, "latex"),
        14: (ContentType.InterlineEquation, "latex"),
        15: (ContentType.Text, "text"),
    }

    spans = []
    for layout_det in layout_dets:
        category = span_categories.get(layout_det["category_id"])
        if category is None:
            continue
        span_type, content_key = category

        # Values 0/1 and 4/5 of "poly" are used as opposite bbox corners
        # (presumably top-left and bottom-right -- verify against the model
        # output format).
        x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
        bbox = [
            int(x0 / horizontal_scale_ratio),
            int(y0 / vertical_scale_ratio),
            int(x1 / horizontal_scale_ratio),
            int(y1 / vertical_scale_ratio),
        ]
        # Skip spans whose width or height collapsed to zero after scaling.
        if bbox[2] - bbox[0] == 0 or bbox[3] - bbox[1] == 0:
            continue

        span = {"bbox": bbox}
        if content_key is not None:
            span["content"] = layout_det[content_key]
        span["type"] = span_type
        spans.append(span)
    return spans


def _split_dropped_spans(dropped_spans):
    """Group dropped spans by content type.

    Returns:
        A 4-tuple ``(text, image, table, equation)`` of span lists. Inline and
        interline equations share the equation list; spans of any other type
        are ignored.
    """
    dropped_text_block = []
    dropped_image_block = []
    dropped_table_block = []
    dropped_equation_block = []
    for span in dropped_spans:
        span_type = span['type']
        if span_type == ContentType.Text:
            dropped_text_block.append(span)
        elif span_type == ContentType.Image:
            dropped_image_block.append(span)
        elif span_type == ContentType.Table:
            dropped_table_block.append(span)
        elif span_type in [ContentType.InlineEquation, ContentType.InterlineEquation]:
            dropped_equation_block.append(span)
    return dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block


def parse_pdf_by_ocr(
        pdf_bytes,
        pdf_model_output,
        imageWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
):
    """Parse a PDF page by page using the OCR/layout model output.

    For every page in ``[start_page_id, end_page_id]`` the model detections
    are turned into spans, cleaned up, cut into image/table crops, merged
    into lines and blocks, and stored as one page component. Afterwards the
    pages are split into paragraphs.

    Args:
        pdf_bytes: raw PDF content; opened with ``fitz.open("pdf", ...)`` and
            hashed with ``compute_md5``.
        pdf_model_output: model output for the whole document, read per page
            through ``get_docx_model_output``.
        imageWriter: writer object handed to ``ocr_cut_image_and_table``.
        start_page_id: index of the first page to parse.
        end_page_id: index of the last page to parse (inclusive). ``None``
            means "up to the last page"; ``0`` parses only the first page.
            Values past the last page are clamped.
        debug_mode: when true, logs per-page timing and is forwarded to the
            footnote detection and paragraph splitting steps.

    Returns:
        ``{"pdf_info": [...]}`` where the list is ``dict_to_list`` applied to
        the per-page info dict.
    """
    pdf_bytes_md5 = compute_md5(pdf_bytes)

    pdf_docs = fitz.open("pdf", pdf_bytes)
    # Filled with one entry per parsed page, keyed "page_<page_id>".
    pdf_info_dict = {}

    try:
        start_time = time.time()

        last_page_id = len(pdf_docs) - 1
        # `is None` instead of a truthiness test so that an explicit
        # end_page_id=0 is honoured rather than treated as "not given".
        if end_page_id is None:
            end_page_id = last_page_id
        elif end_page_id > last_page_id:
            logger.warning(
                f"end_page_id {end_page_id} is out of range, use last page {last_page_id}"
            )
            end_page_id = last_page_id

        for page_id in range(start_page_id, end_page_id + 1):
            page = pdf_docs[page_id]
            page_w = page.rect.width
            page_h = page.rect.height

            if debug_mode:
                time_now = time.time()
                logger.info(
                    f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
                )
                start_time = time_now

            # Model output for the current page.
            ocr_page_info = get_docx_model_output(
                pdf_model_output, page_id
            )

            # Bboxes of page numbers, headers, footers and footnotes.
            page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info)
            header_bboxes = parse_headers(page_id, page, ocr_page_info)
            footer_bboxes = parse_footers(page_id, page, ocr_page_info)
            footnote_bboxes = parse_footnotes_by_model(page_id, page, ocr_page_info, debug_mode=debug_mode)

            # Bboxes whose spans must be removed, keyed by drop reason.
            need_remove_spans_bboxes_dict = {
                DropTag.PAGE_NUMBER: page_no_bboxes,
                DropTag.HEADER: header_bboxes,
                DropTag.FOOTER: footer_bboxes,
                DropTag.FOOTNOTE: footnote_bboxes,
            }

            # Scale ratio between model coordinates and PyMuPDF coordinates.
            horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
                ocr_page_info, page
            )

            spans = _build_spans(
                ocr_page_info["layout_dets"],
                horizontal_scale_ratio,
                vertical_scale_ratio,
            )

            # Of overlapping spans, drop the smaller ones.
            spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)

            # Drop spans covered by the page-number/header/footer/footnote
            # bboxes and collect them as dropped.
            spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)

            # Crop images and tables.
            spans = ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter)

            # Inline-equation adjustment: align height with the text on the
            # same line (left neighbour first, then right).
            displayed_list = []
            text_inline_lines = []
            modify_y_axis(spans, displayed_list, text_inline_lines)

            # Interline equations misdetected by the model become inline.
            spans = modify_inline_equation(spans, displayed_list, text_inline_lines)

            # Separate bboxes that stick together.
            spans = remove_overlap_between_bbox(spans)

            # For interline equations, images and tables with text on their
            # left, adjust y0 so it is not above the text's y0.
            spans = adjust_bbox_for_standalone_block(spans)

            # Layout info from the model output (sorted in natural reading
            # order, with overlapping/interleaved bad cases repaired).
            layout_bboxes, layout_tree = layout_detect(ocr_page_info['subfield_dets'], page, ocr_page_info)

            # Merge spans into lines (within a layout: top-down, left-right).
            lines, dropped_spans_by_layout = merge_spans_to_line_by_layout(spans, layout_bboxes)

            # Merge lines into blocks.
            blocks = merge_lines_to_block(lines)

            # Lists that QA needs exposed separately.
            images, tables, interline_equations, inline_equations = get_qa_need_list(blocks)

            # Combine the dropped spans of all stages and classify them.
            dropped_spans = []
            dropped_spans.extend(dropped_spans_by_span_overlap)
            dropped_spans.extend(dropped_spans_by_removed_bboxes)
            dropped_spans.extend(dropped_spans_by_layout)
            (
                dropped_text_block,
                dropped_image_block,
                dropped_table_block,
                dropped_equation_block,
            ) = _split_dropped_spans(dropped_spans)

            # Assemble the info dict of this page.
            page_info = ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                                                     images, tables, interline_equations, inline_equations,
                                                     dropped_text_block, dropped_image_block, dropped_table_block,
                                                     dropped_equation_block,
                                                     need_remove_spans_bboxes_dict)
            pdf_info_dict[f"page_{page_id}"] = page_info
    finally:
        # Release the document even when a page fails to parse.
        pdf_docs.close()

    # Paragraph splitting.
    para_split(pdf_info_dict, debug_mode=debug_mode)

    # Convert the per-page dict into a list.
    pdf_info_list = dict_to_list(pdf_info_dict)
    new_pdf_info_dict = {
        "pdf_info": pdf_info_list,
    }

    return new_pdf_info_dict