ocr_dict_merge.py 13.9 KB
Newer Older
1
2
3
from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
                                    _is_in_or_part_overlap_with_area_ratio,
                                    calculate_overlap_area_in_bbox1_area_ratio)
4
from magic_pdf.libs.drop_tag import DropTag
5
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
赵小蒙's avatar
赵小蒙 committed
6
7


8
9
10
11
# 将每一个line中的span从左到右排序
def line_sort_spans_by_left_to_right(lines):
    """Order the spans of every line left-to-right and wrap each line in a dict.

    Args:
        lines: list of lines; each line is a non-empty list of span dicts
            carrying a ``'bbox'`` laid out as ``(x0, y0, x1, y1)``.

    Returns:
        One dict per input line with ``'bbox'`` (the box enclosing all spans
        of that line) and ``'spans'`` (the very same list object that was
        passed in, now sorted in place by ``x0``).
    """

    def wrap(row):
        # Sort in place so the caller's list reflects the reading order too.
        row.sort(key=lambda item: item['bbox'][0])
        boxes = [item['bbox'] for item in row]
        enclosing = [
            min(box[0] for box in boxes),  # x0
            min(box[1] for box in boxes),  # y0
            max(box[2] for box in boxes),  # x1
            max(box[3] for box in boxes),  # y1
        ]
        return {'bbox': enclosing, 'spans': row}

    return [wrap(row) for row in lines]

赵小蒙's avatar
赵小蒙 committed
26

赵小蒙's avatar
赵小蒙 committed
27
def _is_standalone_span(span):
    """Return True when *span* is an interline equation, image or table."""
    return span['type'] in (
        ContentType.InterlineEquation,
        ContentType.Image,
        ContentType.Table,
    )


def merge_spans_to_line(spans):
    """Group spans into lines according to their vertical overlap.

    The spans are first sorted in place by ``y0`` (``bbox[1]``). Walking the
    sorted spans, a span joins the current line when
    ``__is_overlaps_y_exceeds_threshold`` (called with threshold ``0.6``)
    accepts it against the last span of that line; otherwise it opens a new
    line. Interline-equation, image and table spans always sit alone in
    their own line.

    Args:
        spans: list of span dicts with ``'bbox'`` and ``'type'`` keys. The
            list is reordered in place.

    Returns:
        A list of lines, each a list of span dicts. Empty input yields ``[]``.
    """
    if not spans:
        return []

    # Top-to-bottom order so vertically adjacent spans are neighbours.
    spans.sort(key=lambda span: span['bbox'][1])

    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        previous = current_line[-1]
        # A standalone span is only ever placed as the sole member of a fresh
        # line (it is never appended), so inspecting the last span of the
        # current line is equivalent to scanning the whole line.
        joins_current_line = (
            not _is_standalone_span(span)
            and not _is_standalone_span(previous)
            and __is_overlaps_y_exceeds_threshold(
                span['bbox'], previous['bbox'], 0.6)
        )
        if joins_current_line:
            current_line.append(span)
        else:
            lines.append(current_line)
            current_line = [span]

    # current_line always holds at least one span at this point.
    lines.append(current_line)
    return lines
赵小蒙's avatar
赵小蒙 committed
64

赵小蒙's avatar
赵小蒙 committed
65

66
67
68
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    """Assign spans to layout regions and merge each region's spans into lines.

    Layouts are visited in order; a span is claimed by the first layout for
    which ``calculate_overlap_area_in_bbox1_area_ratio(span_bbox, layout_bbox)``
    exceeds ``0.6`` and is then removed from ``spans``. Spans left over after
    all layouts were visited are tagged ``DropTag.NOT_IN_LAYOUT``.

    Args:
        spans: list of span dicts with a ``'bbox'`` key. Mutated: claimed
            spans are removed from it.
        layout_bboxes: iterable of dicts with a ``'layout_bbox'`` key.

    Returns:
        ``(lines, dropped_spans)`` where ``lines`` are line dicts as produced
        by ``line_sort_spans_by_left_to_right`` and ``dropped_spans`` is a new
        list holding the unclaimed spans.
    """
    spans_per_layout = []
    for layout in layout_bboxes:
        region = layout['layout_bbox']
        claimed = [
            candidate for candidate in spans
            if calculate_overlap_area_in_bbox1_area_ratio(
                candidate['bbox'], region) > 0.6
        ]
        if not claimed:
            continue
        spans_per_layout.append(claimed)
        # A claimed span must not be offered to any later layout.
        for taken in claimed:
            spans.remove(taken)

    raw_lines = []
    for group in spans_per_layout:
        raw_lines.extend(merge_spans_to_line(group))

    # Put the spans inside each line into left-to-right order.
    lines = line_sort_spans_by_left_to_right(raw_lines)

    for leftover in spans:
        leftover['tag'] = DropTag.NOT_IN_LAYOUT
    dropped_spans = list(spans)

    return lines, dropped_spans
liukaiwen's avatar
lkw  
liukaiwen committed
98
99


赵小蒙's avatar
赵小蒙 committed
100
101
102
103
def merge_lines_to_block(lines):
    """Wrap every line in its own block.

    No real block merging happens yet; this only establishes the structure.
    Each resulting block contains exactly one line and reuses that line's
    ``'bbox'`` as its own.

    Args:
        lines: iterable of line dicts with a ``'bbox'`` key.

    Returns:
        A list of ``{'bbox': ..., 'lines': [line]}`` dicts, one per line.
    """
    return [{'bbox': row['bbox'], 'lines': [row]} for row in lines]


赵小蒙's avatar
赵小蒙 committed
111
112
113
114
115
def sort_blocks_by_layout(all_bboxes, layout_bboxes):
    """Order blocks by layout region, then top-to-bottom inside each region.

    Layouts are visited in order. A block (a sequence whose first four items
    are its bbox and whose item ``7`` is its type) is claimed by the first
    layout for which ``calculate_overlap_area_in_bbox1_area_ratio`` of the
    block bbox against the layout bbox exceeds ``0.8``. Footnote blocks are
    never claimed.

    Args:
        all_bboxes: list of blocks. Mutated: claimed blocks are removed.
        layout_bboxes: iterable of dicts with a ``'layout_bbox'`` key.

    Returns:
        The claimed blocks, grouped by layout in visiting order and sorted by
        ``y0`` (item ``1``) within each group. Unclaimed blocks are omitted.
    """
    groups = []
    for layout in layout_bboxes:
        region = layout['layout_bbox']
        claimed = [
            candidate for candidate in all_bboxes
            if candidate[7] != BlockType.Footnote
            and calculate_overlap_area_in_bbox1_area_ratio(
                candidate[:4], region) > 0.8
        ]
        if not claimed:
            continue
        groups.append(claimed)
        # Remove claimed blocks so later layouts cannot pick them up again.
        for taken in claimed:
            all_bboxes.remove(taken)

    ordered = []
    for group in groups:
        # Inside one layout the boxes read top-down, i.e. ascending y0.
        group.sort(key=lambda box: box[1])
        ordered.extend(group)

    return ordered


146
def fill_spans_in_blocks(blocks, spans, radio):
    """Distribute spans into blocks according to their position.

    Blocks are visited in order. A span goes into the first block for which
    ``calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox)``
    exceeds ``radio``; it is then removed from ``spans``.

    Args:
        blocks: iterable of block sequences; items ``0..3`` are the bbox,
            item ``7`` is the block type and, for image/table body, caption
            and footnote blocks, the last item is stored as ``'group_id'``.
        spans: list of span dicts with a ``'bbox'`` key. Mutated: spans
            placed into a block are removed.
        radio: overlap threshold a span must exceed to belong to a block.

    Returns:
        ``(block_with_spans, spans)``: a list of dicts with ``'type'``,
        ``'bbox'``, optionally ``'group_id'``, and ``'spans'``, plus the same
        ``spans`` list object now holding only the unplaced spans.
    """
    filled = []
    for block in blocks:
        kind = block[7]
        region = block[0:4]
        entry = {'type': kind, 'bbox': region}

        grouped_kinds = [
            BlockType.ImageBody, BlockType.ImageCaption,
            BlockType.ImageFootnote, BlockType.TableBody,
            BlockType.TableCaption, BlockType.TableFootnote,
        ]
        if kind in grouped_kinds:
            entry["group_id"] = block[-1]

        matched = [
            candidate for candidate in spans
            if calculate_overlap_area_in_bbox1_area_ratio(
                candidate['bbox'], region) > radio
        ]

        # NOTE: three post-processing steps are intentionally disabled here:
        #   * inline-equation height adjustment (modify_y_axis),
        #   * re-typing misdetected interline equations as inline ones
        #     (modify_inline_equation),
        #   * bbox de-overlapping (remove_overlap_between_bbox_for_span),
        #     which alters span bboxes and breaks the later fill step.

        entry['spans'] = matched
        filled.append(entry)

        # A span that found its block must not be offered to later blocks.
        for taken in matched:
            spans.remove(taken)

    return filled, spans
赵小蒙's avatar
赵小蒙 committed
185
186
187


def fix_block_spans(block_with_spans, img_blocks, table_blocks):
    """Convert span-carrying blocks into their final line/sub-block form.

    Image and table blocks are nested: they contain caption and footnote
    sub-blocks, so their text spans have to be moved into the matching
    caption/footnote sub-block. Every handler also removes the temporary
    ``'spans'`` field from the block.

    Args:
        block_with_spans: iterable of block dicts with ``'type'`` and
            ``'spans'`` keys.
        img_blocks: image block descriptions forwarded to ``fix_image_block``.
        table_blocks: table block descriptions forwarded to
            ``fix_table_block``.

    Returns:
        The converted blocks in input order. Blocks whose type is not image,
        table, text, title or interline equation are left out.
    """
    fixed = []
    for entry in block_with_spans:
        kind = entry['type']
        if kind == BlockType.Image:
            fixed.append(fix_image_block(entry, img_blocks))
        elif kind == BlockType.Table:
            fixed.append(fix_table_block(entry, table_blocks))
        elif kind in [BlockType.Text, BlockType.Title]:
            fixed.append(fix_text_block(entry))
        elif kind == BlockType.InterlineEquation:
            fixed.append(fix_interline_block(entry))
        # Any other block type is silently dropped.
    return fixed
207
208


209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
def fix_block_spans_v2(block_with_spans):
    """Convert span-carrying blocks into line form, without nesting.

    Text-like blocks (text, title, image/table caption and footnote) go
    through ``fix_text_block``; interline-equation, image-body and table-body
    blocks go through ``fix_interline_block``. Both handlers replace the
    temporary ``'spans'`` field with ``'lines'``.

    Args:
        block_with_spans: iterable of block dicts with ``'type'`` and
            ``'spans'`` keys.

    Returns:
        The converted blocks in input order; blocks of any other type are
        left out.
    """
    fixed = []
    for entry in block_with_spans:
        kind = entry['type']
        text_like = [
            BlockType.Text, BlockType.Title,
            BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableCaption, BlockType.TableFootnote,
        ]
        if kind in text_like:
            fixed.append(fix_text_block(entry))
        elif kind in [BlockType.InterlineEquation, BlockType.ImageBody,
                      BlockType.TableBody]:
            fixed.append(fix_interline_block(entry))
        # Any other block type is silently dropped.
    return fixed


230
231
232
233
234
235
236
237
def fix_discarded_block(discarded_block_with_spans):
    """Run every discarded block through ``fix_text_block``.

    Args:
        discarded_block_with_spans: iterable of block dicts with a
            ``'spans'`` key.

    Returns:
        A list with the results of ``fix_text_block``, in input order.
    """
    return [fix_text_block(entry) for entry in discarded_block_with_spans]


238
239
240
241
def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
    """Build a block from the spans that lie inside ``block_bbox``.

    A span is selected when
    ``calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox)``
    exceeds ``0.6``. The selected spans are merged into lines and each line
    is ordered left-to-right.

    Args:
        spans: candidate span dicts with a ``'bbox'`` key; not modified.
        block_bbox: bounding box of the block being built.
        block_type: value stored under the block's ``'type'`` key.

    Returns:
        ``(block, block_spans)``: the block dict (``'bbox'``, ``'type'``,
        ``'lines'``) and the list of spans that were selected for it.
    """
    selected = [
        candidate for candidate in spans
        if calculate_overlap_area_in_bbox1_area_ratio(
            candidate['bbox'], block_bbox) > 0.6
    ]
    # Group into lines first, then order the spans within every line.
    ordered_lines = line_sort_spans_by_left_to_right(
        merge_spans_to_line(selected))
    built = {'bbox': block_bbox, 'type': block_type, 'lines': ordered_lines}
    return built, selected


def make_body_block(span: dict, block_bbox: list, block_type: str):
    """Create a body block holding a single line with a single span.

    Args:
        span: the span that makes up the whole body.
        block_bbox: bounding box used for both the block and its only line.
        block_type: value stored under the block's ``'type'`` key.

    Returns:
        ``{'bbox': block_bbox, 'type': block_type, 'lines': [line]}`` where
        ``line`` is ``{'bbox': block_bbox, 'spans': [span]}``.
    """
    return {
        'bbox': block_bbox,
        'type': block_type,
        'lines': [{'bbox': block_bbox, 'spans': [span]}],
    }


def fix_image_block(block, img_blocks):
    """Split an image block into body, caption and footnote sub-blocks.

    The first entry of ``img_blocks`` accepted by
    ``_is_in_or_part_overlap_with_area_ratio(block_bbox, img_bbox, 0.95)`` is
    used as the description of this block. Sub-blocks are appended to
    ``block['blocks']`` in the order body, caption, footnote.

    Args:
        block: block dict with ``'bbox'`` and ``'spans'``. Mutated: gains
            ``'blocks'`` and loses ``'spans'``.
        img_blocks: iterable of dicts with ``'bbox'``, ``'img_body_bbox'``,
            ``'img_caption_bbox'`` and ``'img_footnote_bbox'`` keys; the
            caption/footnote boxes may be ``None``.

    Returns:
        The same ``block`` object. ``'blocks'`` stays empty when no entry of
        ``img_blocks`` matches.
    """
    block['blocks'] = []

    # Locate the image description that corresponds to this block.
    matched = next(
        (candidate for candidate in img_blocks
         if _is_in_or_part_overlap_with_area_ratio(
             block['bbox'], candidate['bbox'], 0.95)),
        None,
    )

    if matched is not None:
        # Body: the image span whose bbox equals the recorded body bbox.
        body_span = next(
            (candidate for candidate in block['spans']
             if candidate['type'] == ContentType.Image
             and matched['img_body_bbox'] == candidate['bbox']),
            None,
        )
        if body_span is not None:
            block['blocks'].append(make_body_block(
                body_span, matched['img_body_bbox'], BlockType.ImageBody))
            # The body span is consumed and must not reach caption/footnote.
            block['spans'].remove(body_span)

        caption_bbox = matched['img_caption_bbox']
        if caption_bbox is not None:
            caption_block, _ = merge_spans_to_block(
                block['spans'], caption_bbox, BlockType.ImageCaption)
            block['blocks'].append(caption_block)
            # NOTE(review): unlike fix_table_block, the caption spans are not
            # removed from block['spans'] here, so a span overlapping both
            # boxes could also land in the footnote - confirm this is
            # intended.

        footnote_bbox = matched['img_footnote_bbox']
        if footnote_bbox is not None:
            footnote_block, _ = merge_spans_to_block(
                block['spans'], footnote_bbox, BlockType.ImageFootnote)
            block['blocks'].append(footnote_block)

    del block['spans']
    return block


def fix_table_block(block, table_blocks):
    """Split a table block into body, caption and footnote sub-blocks.

    The first entry of ``table_blocks`` accepted by
    ``_is_in_or_part_overlap_with_area_ratio(block_bbox, table_bbox, 0.95)``
    is used as the description of this block. Sub-blocks are appended to
    ``block['blocks']`` in the order body, caption, footnote. Spans placed
    into the caption are removed before the footnote is assembled.

    Args:
        block: block dict with ``'bbox'`` and ``'spans'``. Mutated: gains
            ``'blocks'`` and loses ``'spans'``.
        table_blocks: iterable of dicts with ``'bbox'``,
            ``'table_body_bbox'``, ``'table_caption_bbox'`` and
            ``'table_footnote_bbox'`` keys; the caption/footnote boxes may be
            ``None``.

    Returns:
        The same ``block`` object. ``'blocks'`` stays empty when no entry of
        ``table_blocks`` matches.
    """
    block['blocks'] = []

    # Locate the table description that corresponds to this block.
    matched = next(
        (candidate for candidate in table_blocks
         if _is_in_or_part_overlap_with_area_ratio(
             block['bbox'], candidate['bbox'], 0.95)),
        None,
    )

    if matched is not None:
        # Body: the table span whose bbox equals the recorded body bbox.
        body_span = next(
            (candidate for candidate in block['spans']
             if candidate['type'] == ContentType.Table
             and matched['table_body_bbox'] == candidate['bbox']),
            None,
        )
        if body_span is not None:
            block['blocks'].append(make_body_block(
                body_span, matched['table_body_bbox'], BlockType.TableBody))
            # The body span is consumed and must not reach caption/footnote.
            block['spans'].remove(body_span)

        caption_bbox = matched['table_caption_bbox']
        if caption_bbox is not None:
            caption_block, caption_spans = merge_spans_to_block(
                block['spans'], caption_bbox, BlockType.TableCaption)
            block['blocks'].append(caption_block)
            # Spans now owned by the caption are withdrawn so the footnote
            # cannot pick them up again.
            for used in caption_spans:
                block['spans'].remove(used)

        footnote_bbox = matched['table_footnote_bbox']
        if footnote_bbox is not None:
            footnote_block, _ = merge_spans_to_block(
                block['spans'], footnote_bbox, BlockType.TableFootnote)
            block['blocks'].append(footnote_block)

    del block['spans']
    return block


def fix_text_block(block):
    """Turn a text block's spans into left-to-right ordered lines.

    Any interline-equation span found inside a text block is re-typed as an
    inline equation before the spans are merged into lines.

    Args:
        block: block dict with a ``'spans'`` list. Mutated: gains ``'lines'``
            and loses ``'spans'``; span ``'type'`` values may be rewritten.

    Returns:
        The same ``block`` object.
    """
    members = block['spans']
    for member in members:
        # Equations inside running text are inline by definition.
        if member['type'] == ContentType.InterlineEquation:
            member['type'] = ContentType.InlineEquation

    block['lines'] = line_sort_spans_by_left_to_right(
        merge_spans_to_line(block['spans']))
    del block['spans']
    return block


def fix_interline_block(block):
    """Turn a block's spans into left-to-right ordered lines.

    Span types are left untouched.

    Args:
        block: block dict with a ``'spans'`` list. Mutated: gains ``'lines'``
            and loses ``'spans'``.

    Returns:
        The same ``block`` object.
    """
    block['lines'] = line_sort_spans_by_left_to_right(
        merge_spans_to_line(block['spans']))
    del block['spans']
    return block