ocr_dict_merge.py 11.4 KB
Newer Older
1
2
3
4
from loguru import logger

from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
    calculate_overlap_area_in_bbox1_area_ratio
5
from magic_pdf.libs.drop_tag import DropTag
赵小蒙's avatar
赵小蒙 committed
6
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
赵小蒙's avatar
赵小蒙 committed
7
8
from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
赵小蒙's avatar
赵小蒙 committed
9
10


11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
def line_sort_spans_by_left_to_right(lines):
    """Sort the spans of every line from left to right and wrap each line.

    Each element of ``lines`` is a list of span dicts whose ``'bbox'`` entry
    is indexed as ``[x0, y0, x1, y1]``.  Every such list is sorted in place
    by ``x0`` and the very same list object is reused as the ``"spans"``
    value of the produced line dict.

    Returns a list of ``{"bbox": [x0, y0, x1, y1], "spans": line}`` dicts in
    which the bbox is the smallest box enclosing all span boxes of the line.
    A line without spans makes ``min()`` raise ``ValueError``.
    """
    def left_edge(span):
        return span['bbox'][0]

    wrapped = []
    for row in lines:
        # In-place sort: callers holding a reference to ``row`` see the new order.
        row.sort(key=left_edge)
        boxes = [span['bbox'] for span in row]
        enclosing = [
            min(box[0] for box in boxes),  # x0
            min(box[1] for box in boxes),  # y0
            max(box[2] for box in boxes),  # x1
            max(box[3] for box in boxes),  # y1
        ]
        wrapped.append({"bbox": enclosing, "spans": row})
    return wrapped

赵小蒙's avatar
赵小蒙 committed
29

赵小蒙's avatar
赵小蒙 committed
30
def merge_spans_to_line(spans):
    """Group spans into lines.

    ``spans`` is a list of span dicts carrying ``'bbox'`` (``[x0, y0, x1, y1]``)
    and ``'type'``.  The list is sorted IN PLACE by ``y0`` and then walked
    top to bottom:

    * a span of type interline equation, image or table always sits on a line
      of its own (it starts a new line, and the span following it starts
      another one);
    * any other span joins the current line when
      ``__is_overlaps_y_exceeds_threshold`` reports that it overlaps the last
      span of that line on the y axis; otherwise it starts a new line.

    Returns a list of lines, each line being a list of spans.  An empty input
    yields an empty list.
    """
    if not spans:
        return []

    def occupies_own_line(span):
        # Looked up lazily so the type constants are only needed when there
        # is more than one span to compare.
        return span['type'] in (
            ContentType.InterlineEquation,
            ContentType.Image,
            ContentType.Table,
        )

    # Sort by y0 so that spans are visited from the top of the page downwards.
    spans.sort(key=lambda span: span['bbox'][1])

    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        starts_new_line = (
            occupies_own_line(span)
            or any(occupies_own_line(s) for s in current_line)
            or not __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'])
        )
        if starts_new_line:
            lines.append(current_line)
            current_line = [span]
        else:
            current_line.append(span)

    # ``current_line`` always holds at least one span here, so flush it.
    lines.append(current_line)
    return lines
赵小蒙's avatar
赵小蒙 committed
63

赵小蒙's avatar
赵小蒙 committed
64

65
66
67
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    """Assign spans to layout boxes, merge them into lines, and report leftovers.

    For every entry of ``layout_bboxes`` (dicts with a ``'layout_bbox'`` key),
    the spans whose ``calculate_overlap_area_in_bbox1_area_ratio`` against
    that layout box is greater than 0.6 are taken out of ``spans`` and merged
    into lines with ``merge_spans_to_line``.  A span is therefore claimed by
    the first layout box that matches it.

    ``spans`` is modified in place: after the call it only contains the spans
    that matched no layout box.  Each of those gets
    ``span['tag'] = DropTag.NOT_IN_LAYOUT``.

    Returns ``(lines, dropped_spans)`` where ``lines`` is the output of
    ``line_sort_spans_by_left_to_right`` over all merged lines (in layout
    order) and ``dropped_spans`` lists the unmatched spans.
    """
    lines = []
    for item in layout_bboxes:
        layout_bbox = item['layout_bbox']

        # Partition in a single pass instead of calling list.remove() once
        # per matched span.
        inside_layout = []
        outside_layout = []
        for span in spans:
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.6:
                inside_layout.append(span)
            else:
                outside_layout.append(span)

        if inside_layout:
            # Slice assignment keeps the caller's list object and only
            # drops the spans that were just claimed by this layout box.
            spans[:] = outside_layout
            lines.extend(merge_spans_to_line(inside_layout))

    # Order the spans inside every line from left to right.
    lines = line_sort_spans_by_left_to_right(lines)

    # Whatever is still in ``spans`` did not fall into any layout box.
    dropped_spans = []
    for span in spans:
        span['tag'] = DropTag.NOT_IN_LAYOUT
        dropped_spans.append(span)

    return lines, dropped_spans
liukaiwen's avatar
lkw  
liukaiwen committed
96
97


赵小蒙's avatar
赵小蒙 committed
98
99
100
101
102
103
104
105
106
107
108
109
110
def merge_lines_to_block(lines):
    """Wrap every line in a block of its own.

    No real block merging is performed yet; this only builds the structure.
    Each returned block holds exactly one line and reuses that line's bbox.
    """
    return [{"bbox": line["bbox"], "lines": [line]} for line in lines]


赵小蒙's avatar
赵小蒙 committed
111
112
113
114
115
def sort_blocks_by_layout(all_bboxes, layout_bboxes):
    """Order blocks by layout box, then top to bottom inside each layout box.

    ``all_bboxes`` is a list of indexable blocks where indices 0-3 are read
    as the bbox (``x0, y0, x1, y1``) and index 7 as the block type.
    ``layout_bboxes`` is a list of dicts with a ``'layout_bbox'`` key.

    For each layout box, in order, the non-footnote blocks whose
    ``calculate_overlap_area_in_bbox1_area_ratio`` against the layout box is
    greater than 0.8 are collected, removed from ``all_bboxes`` (in place),
    sorted by ``y0`` and appended to the result.  Footnote blocks and blocks
    matching no layout box stay in ``all_bboxes`` and are not returned.

    Returns the flat, ordered list of kept blocks.
    """
    sort_blocks = []
    for item in layout_bboxes:
        layout_bbox = item['layout_bbox']

        # Split the remaining blocks into the ones claimed by this layout box
        # and the rest, in one pass (no per-element list.remove()).
        layout_blocks = []
        leftover = []
        for block in all_bboxes:
            # Footnote blocks are never assigned to a layout box.
            if block[7] == BlockType.Footnote:
                leftover.append(block)
                continue
            block_bbox = [block[0], block[1], block[2], block[3]]
            if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8:
                layout_blocks.append(block)
            else:
                leftover.append(block)

        if layout_blocks:
            # Remove the claimed blocks from the caller's list in place.
            all_bboxes[:] = leftover
            # Inside one layout box, order the blocks top to bottom by y0.
            layout_blocks.sort(key=lambda b: b[1])
            sort_blocks.extend(layout_blocks)

    # ``sort_blocks`` now holds every block kept for the page, already ordered.
    return sort_blocks


def fill_spans_in_blocks(blocks, spans):
    """Distribute the spans in ``spans`` into ``blocks`` according to position.

    ``blocks`` is a list of indexable blocks where ``block[0:4]`` is read as
    the bbox and ``block[7]`` as the block type.  For each block, every span
    whose ``calculate_overlap_area_in_bbox1_area_ratio`` against the block
    bbox is greater than 0.8 is collected, post-processed by
    ``modify_y_axis``, ``modify_inline_equation`` and
    ``remove_overlap_between_bbox`` (in that order), and then removed from
    ``spans`` so that later blocks cannot claim it again.

    ``spans`` is modified in place.

    Returns a list of ``{'block_type', 'bbox', 'spans'}`` dicts, one per block.
    """
    block_with_spans = []
    for block in blocks:
        block_type = block[7]
        block_bbox = block[0:4]

        block_spans = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.8
        ]

        # Inline-equation adjustment: per the original author's note, heights
        # are aligned with the text on the same line (left neighbour
        # preferred, then right).
        displayed_list = []
        text_inline_lines = []
        modify_y_axis(block_spans, displayed_list, text_inline_lines)

        # Per the original author's note: interline equations that the model
        # recognised wrongly are converted to the inline-equation type.
        block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)

        # Per the original author's note: separate bboxes that stick together.
        block_spans = remove_overlap_between_bbox(block_spans)

        block_with_spans.append({
            'block_type': block_type,
            'bbox': block_bbox,
            'spans': block_spans,
        })

        # Drop the spans that now belong to this block from the shared pool.
        # NOTE(review): this relies on the helpers above returning spans that
        # still compare equal to entries of ``spans``; list.remove() raises
        # ValueError otherwise -- confirm against the helper implementations.
        for span in block_spans:
            spans.remove(span)

    return block_with_spans


def fix_block_spans(block_with_spans, img_blocks, table_blocks):
    """Turn the flat ``'spans'`` field of every block into its final structure.

    1. Image and table blocks contain captions and footnotes, so they are
       nested blocks: the caption/footnote text spans have to be placed into
       the matching caption/footnote sub-block of the image or table block.
    2. The ``'spans'`` field is removed from every processed block.

    Dispatch is on ``block['block_type']``: image blocks go through
    ``fix_image_block``, table blocks through ``fix_table_block``, and text,
    title and interline-equation blocks through ``fix_text_block``.  Blocks of
    any other type are left out of the result.

    Returns the list of fixed blocks, in input order.
    """
    fix_blocks = []
    for block in block_with_spans:
        block_type = block['block_type']

        if block_type == BlockType.Image:
            fixed = fix_image_block(block, img_blocks)
        elif block_type == BlockType.Table:
            fixed = fix_table_block(block, table_blocks)
        elif block_type in (BlockType.Text, BlockType.Title, BlockType.InterlineEquation):
            fixed = fix_text_block(block)
        else:
            # Unhandled block types are skipped rather than passed through.
            continue

        fix_blocks.append(fixed)
    return fix_blocks
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312


def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
    """Build a block of ``block_type`` from the spans lying inside ``block_bbox``.

    A span is selected when ``calculate_overlap_area_in_bbox1_area_ratio`` of
    its bbox against ``block_bbox`` is greater than 0.8.  The selected spans
    are merged into lines with ``merge_spans_to_line`` and each line is then
    ordered left to right with ``line_sort_spans_by_left_to_right``.

    ``spans`` itself is not modified; removing the selected spans from it is
    left to the caller.

    Returns ``(block, block_spans)``: the new block dict (``'bbox'``,
    ``'block_type'``, ``'lines'``) and the list of spans that were selected.
    """
    block_spans = [
        candidate for candidate in spans
        if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], block_bbox) > 0.8
    ]
    # Group into lines first, then order the spans inside each line.
    sort_block_lines = line_sort_spans_by_left_to_right(merge_spans_to_line(block_spans))
    return {
        'bbox': block_bbox,
        'block_type': block_type,
        'lines': sort_block_lines,
    }, block_spans


def make_body_block(span: dict, block_bbox: list, block_type: str):
    """Create a body block that contains exactly one line holding ``span``.

    Both the block and its single line use ``block_bbox`` (the same object)
    as their ``'bbox'``.

    Returns a dict with the keys ``'bbox'``, ``'block_type'`` and ``'lines'``.
    """
    return {
        'bbox': block_bbox,
        'block_type': block_type,
        'lines': [
            {
                'bbox': block_bbox,
                'spans': [span],
            },
        ],
    }


def fix_image_block(block, img_blocks):
    """Replace the flat ``'spans'`` of an image block with nested sub-blocks.

    The first entry of ``img_blocks`` whose ``'bbox'`` equals ``block['bbox']``
    is used as the description of the image:

    * the first image-type span whose bbox equals its ``'img_body_bbox'``
      becomes an image-body sub-block and is removed from ``block['spans']``;
    * if its ``'img_caption_bbox'`` is non-empty, an image-caption sub-block is
      built from the remaining spans with ``merge_spans_to_block``.

    ``block`` is modified in place: ``'blocks'`` is set (possibly to an empty
    list when nothing matches) and ``'spans'`` is deleted.  The same dict is
    returned.
    """
    nested = []
    block['blocks'] = nested

    # Locate the img_block describing this block, if any.
    matched = next(
        (candidate for candidate in img_blocks if candidate['bbox'] == block['bbox']),
        None,
    )
    if matched is not None:
        remaining = block['spans']

        # Image body: the image span that coincides with the body bbox.
        body_span = next(
            (
                s for s in remaining
                if s['type'] == ContentType.Image and s['bbox'] == matched['img_body_bbox']
            ),
            None,
        )
        if body_span is not None:
            nested.append(make_body_block(body_span, matched['img_body_bbox'], BlockType.ImageBody))
            # The body span is consumed and must not end up in the caption.
            remaining.remove(body_span)

        # A non-empty caption bbox means the image has a caption.
        if len(matched['img_caption_bbox']) > 0:
            caption_block, _ = merge_spans_to_block(
                remaining, matched['img_caption_bbox'], BlockType.ImageCaption
            )
            nested.append(caption_block)

    del block['spans']
    return block


def fix_table_block(block, table_blocks):
    """Replace the flat ``'spans'`` of a table block with nested sub-blocks.

    The first entry of ``table_blocks`` whose ``'bbox'`` equals
    ``block['bbox']`` is used as the description of the table:

    * the first table-type span whose bbox equals its ``'table_body_bbox'``
      becomes a table-body sub-block and is removed from ``block['spans']``;
    * if ``'table_caption_bbox'`` is non-empty, a table-caption sub-block is
      built with ``merge_spans_to_block`` and the spans it used are removed
      from ``block['spans']``;
    * if ``'table_footnote_bbox'`` is non-empty, a table-footnote sub-block is
      built from the spans that are still left.

    ``block`` is modified in place: ``'blocks'`` is set (possibly to an empty
    list when nothing matches) and ``'spans'`` is deleted.  The same dict is
    returned.
    """
    nested = []
    block['blocks'] = nested

    # Locate the table_block describing this block, if any.
    matched = next(
        (candidate for candidate in table_blocks if candidate['bbox'] == block['bbox']),
        None,
    )
    if matched is not None:
        remaining = block['spans']

        # Table body: the table span that coincides with the body bbox.
        body_span = next(
            (
                s for s in remaining
                if s['type'] == ContentType.Table and s['bbox'] == matched['table_body_bbox']
            ),
            None,
        )
        if body_span is not None:
            nested.append(make_body_block(body_span, matched['table_body_bbox'], BlockType.TableBody))
            # The body span is consumed and must not end up in caption/footnote.
            remaining.remove(body_span)

        # A non-empty caption bbox means the table has a caption.
        if len(matched['table_caption_bbox']) > 0:
            caption_block, caption_spans = merge_spans_to_block(
                remaining, matched['table_caption_bbox'], BlockType.TableCaption
            )
            nested.append(caption_block)
            # Spans placed in the caption must not be reused for the footnote.
            for used in caption_spans:
                remaining.remove(used)

        # A non-empty footnote bbox means the table has a footnote.
        if len(matched['table_footnote_bbox']) > 0:
            footnote_block, _ = merge_spans_to_block(
                remaining, matched['table_footnote_bbox'], BlockType.TableFootnote
            )
            nested.append(footnote_block)

    del block['spans']
    return block


def fix_text_block(block):
    """Convert the ``'spans'`` of a text-like block into ordered ``'lines'``.

    The spans are grouped into lines with ``merge_spans_to_line`` and every
    line is ordered left to right with ``line_sort_spans_by_left_to_right``.
    ``block`` is modified in place: ``'lines'`` is added and ``'spans'`` is
    deleted.  The same dict is returned.
    """
    block['lines'] = line_sort_spans_by_left_to_right(
        merge_spans_to_line(block['spans'])
    )
    # The raw spans now live inside the lines; drop the flat field.
    del block['spans']
    return block