ocr_detect_all_bboxes.py 13.8 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
from magic_pdf.config.ocr_content_type import BlockType
from magic_pdf.libs.boxbase import (
    calculate_iou, calculate_overlap_area_in_bbox1_area_ratio,
    calculate_vertical_projection_overlap_ratio,
    get_minbox_if_overlap_by_ratio)
from magic_pdf.pre_proc.remove_bbox_overlap import \
    remove_overlap_between_bbox_for_block


def _make_bbox_row(block, block_type):
    """Build the 13-element bbox row used by the layout-split helpers.

    Layout: ``[x0, y0, x1, y1, None, None, None, block_type,
    None, None, None, None, score]``. The ``None`` slots are placeholders
    that this module never fills in.
    """
    x0, y0, x1, y1 = block['bbox']
    return [
        x0,
        y0,
        x1,
        y1,
        None,
        None,
        None,
        block_type,
        None,
        None,
        None,
        None,
        block['score'],
    ]


def ocr_prepare_bboxes_for_layout_split(
    img_blocks,
    table_blocks,
    discarded_blocks,
    text_blocks,
    title_blocks,
    interline_equation_blocks,
    page_w,
    page_h,
):
    """Collect a page's blocks into bbox rows and resolve their overlaps.

    Every input block must provide ``block['bbox']`` (``x0, y0, x1, y1``) and
    ``block['score']``. Each block is turned into a 13-element row whose
    index 7 holds the ``BlockType`` and whose last element is the score.

    Args:
        img_blocks: Blocks tagged ``BlockType.Image``.
        table_blocks: Blocks tagged ``BlockType.Table``.
        discarded_blocks: Blocks tagged ``BlockType.Discarded``; those that
            look like footnotes are additionally added to the main list as
            ``BlockType.Footnote``.
        text_blocks: Blocks tagged ``BlockType.Text``.
        title_blocks: Blocks tagged ``BlockType.Title``.
        interline_equation_blocks: Blocks tagged
            ``BlockType.InterlineEquation``.
        page_w: Page width, used by the footnote heuristic.
        page_h: Page height, used by the footnote heuristic.

    Returns:
        A tuple ``(all_bboxes, all_discarded_blocks, drop_reasons)``:
        the cleaned main rows, the cleaned discarded rows, and the second
        value returned by ``remove_overlap_between_bbox_for_block``.
    """
    typed_groups = (
        (img_blocks, BlockType.Image),
        (table_blocks, BlockType.Table),
        (text_blocks, BlockType.Text),
        (title_blocks, BlockType.Title),
        (interline_equation_blocks, BlockType.InterlineEquation),
    )
    all_bboxes = []
    for blocks, block_type in typed_groups:
        for block in blocks:
            all_bboxes.append(_make_bbox_row(block, block_type))

    # --- Resolve nested / overlapping blocks ---
    # When a text box and a title box overlap, trust the text box.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # When any box overlaps a discarded box, trust the discarded box.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Conflicts between interline equations and title/text boxes come in two
    # flavours:
    # 1. The equation box and a text box have an IoU close to 1: trust the
    #    interline equation box.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # 2. The equation box lies inside a much larger text box: trust the text
    #    box and drop the equation. Nothing to do here; the "large box
    #    contains small box" pass further down takes care of it.

    # Of the discarded blocks, only those wider than 1/3 of the page, taller
    # than 10 and starting in the lower half of the page are treated as
    # footnotes.
    all_discarded_blocks = []
    for discarded in discarded_blocks:
        all_discarded_blocks.append(
            _make_bbox_row(discarded, BlockType.Discarded)
        )
        x0, y0, x1, y1 = discarded['bbox']
        # Footnotes also join all_bboxes so they take part in layout
        # computation.
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
            all_bboxes.append(_make_bbox_row(discarded, BlockType.Footnote))

    # If a large box still contains a small one after the steps above, drop
    # the small one.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
    # Separate the remaining bboxes so the later layout split does not fail.
    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)

    return all_bboxes, all_discarded_blocks, drop_reasons


184
185
186
187
def add_bboxes(blocks, block_type, bboxes):
    """Append one bbox row per block to ``bboxes`` (mutated in place).

    Each row is ``[x0, y0, x1, y1, None, None, None, block_type,
    None, None, None, None, score]``. For the image/table body, caption and
    footnote types, ``block['group_id']`` is appended as a 14th element.

    Args:
        blocks: Iterable of mappings with ``'bbox'`` and ``'score'`` entries
            (plus ``'group_id'`` for the grouped types listed below).
        block_type: ``BlockType`` value stored at index 7 of every row.
        bboxes: List that receives the new rows.

    Returns:
        None. The result is delivered through ``bboxes``.
    """
    # Loop-invariant: only these types carry a trailing group id.
    has_group_id = block_type in [
        BlockType.ImageBody,
        BlockType.ImageCaption,
        BlockType.ImageFootnote,
        BlockType.TableBody,
        BlockType.TableCaption,
        BlockType.TableFootnote,
    ]
    for block in blocks:
        x0, y0, x1, y1 = block['bbox']
        row = [
            x0,
            y0,
            x1,
            y1,
            None,
            None,
            None,
            block_type,
            None,
            None,
            None,
            None,
            block['score'],
        ]
        if has_group_id:
            row.append(block['group_id'])
        bboxes.append(row)
231
232
233


def ocr_prepare_bboxes_for_layout_split_v2(
    img_body_blocks,
    img_caption_blocks,
    img_footnote_blocks,
    table_body_blocks,
    table_caption_blocks,
    table_footnote_blocks,
    discarded_blocks,
    text_blocks,
    title_blocks,
    interline_equation_blocks,
    page_w,
    page_h,
):
    """Collect a page's blocks into bbox rows and resolve their overlaps.

    Rows are built by ``add_bboxes``; the image/table body, caption and
    footnote rows therefore carry a trailing group id.

    Args:
        img_body_blocks: Blocks tagged ``BlockType.ImageBody``.
        img_caption_blocks: Blocks tagged ``BlockType.ImageCaption``.
        img_footnote_blocks: Blocks tagged ``BlockType.ImageFootnote``.
        table_body_blocks: Blocks tagged ``BlockType.TableBody``.
        table_caption_blocks: Blocks tagged ``BlockType.TableCaption``.
        table_footnote_blocks: Blocks tagged ``BlockType.TableFootnote``.
        discarded_blocks: Blocks tagged ``BlockType.Discarded``; also the
            source of the footnote boxes used to demote blocks below them.
        text_blocks: Blocks tagged ``BlockType.Text``.
        title_blocks: Blocks tagged ``BlockType.Title``.
        interline_equation_blocks: Blocks tagged
            ``BlockType.InterlineEquation``.
        page_w: Page width, used by the footnote heuristic.
        page_h: Page height, used by the footnote heuristic.

    Returns:
        A tuple ``(all_bboxes, all_discarded_blocks)``. Blocks found below a
        footnote are moved from the first list to the second and keep their
        original block type. The drop reasons reported by
        ``remove_overlap_between_bbox_for_block`` are not returned.
    """
    all_bboxes = []

    add_bboxes(img_body_blocks, BlockType.ImageBody, all_bboxes)
    add_bboxes(img_caption_blocks, BlockType.ImageCaption, all_bboxes)
    add_bboxes(img_footnote_blocks, BlockType.ImageFootnote, all_bboxes)
    add_bboxes(table_body_blocks, BlockType.TableBody, all_bboxes)
    add_bboxes(table_caption_blocks, BlockType.TableCaption, all_bboxes)
    add_bboxes(table_footnote_blocks, BlockType.TableFootnote, all_bboxes)
    add_bboxes(text_blocks, BlockType.Text, all_bboxes)
    add_bboxes(title_blocks, BlockType.Title, all_bboxes)
    add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes)

    # --- Resolve nested / overlapping blocks ---
    # When a text box and a title box overlap, trust the text box.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # When any box overlaps a discarded box, trust the discarded box.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Conflicts between interline equations and title/text boxes come in two
    # flavours:
    # 1. The equation box and a text box have an IoU close to 1: trust the
    #    interline equation box.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # 2. The equation box lies inside a much larger text box: trust the text
    #    box and drop the equation. Nothing to do here; the "large box
    #    contains small box" pass further down takes care of it.

    # Discarded blocks.
    all_discarded_blocks = []
    add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)

    # Footnote detection: wider than 1/3 of the page, taller than 10, and
    # starting in the lower half of the page.
    footnote_blocks = []
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
            footnote_blocks.append([x0, y0, x1, y1])

    # Move every box located below a footnote into the discarded list.
    for block in find_blocks_under_footnote(all_bboxes, footnote_blocks):
        all_bboxes.remove(block)
        all_discarded_blocks.append(block)

    # If a large box still contains a small one after the steps above, drop
    # the small one.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
    # Separate the remaining bboxes so the later layout split does not fail.
    # The accompanying drop reasons are intentionally ignored here.
    all_bboxes, _ = remove_overlap_between_bbox_for_block(all_bboxes)

    return all_bboxes, all_discarded_blocks
赵小蒙's avatar
赵小蒙 committed
296
297


298
299
300
301
302
303
304
def find_blocks_under_footnote(all_bboxes, footnote_blocks):
    """Return the blocks of ``all_bboxes`` that lie under a footnote box.

    A block is selected when, for at least one footnote box, its ``y0`` is
    greater than or equal to the footnote's ``y1`` and
    ``calculate_vertical_projection_overlap_ratio(block_bbox, footnote_bbox)``
    is at least 0.8.

    Args:
        all_bboxes: Bbox rows; only the first four entries
            (``x0, y0, x1, y1``) are read.
        footnote_blocks: Footnote boxes, each exactly ``[x0, y0, x1, y1]``.

    Returns:
        A new list holding the selected rows (the same objects as in
        ``all_bboxes``), in input order, without equal duplicates.
        ``all_bboxes`` itself is not modified.
    """
    need_remove_blocks = []
    for block in all_bboxes:
        block_x0, block_y0, block_x1, block_y1 = block[:4]
        for footnote_bbox in footnote_blocks:
            _, _, _, footnote_y1 = footnote_bbox
            # The cheap position test runs first so the overlap helper is
            # only called for blocks that start at or past the footnote's y1.
            if (
                block_y0 >= footnote_y1
                and calculate_vertical_projection_overlap_ratio(
                    (block_x0, block_y0, block_x1, block_y1), footnote_bbox
                )
                >= 0.8
            ):
                if block not in need_remove_blocks:
                    need_remove_blocks.append(block)
                # One matching footnote is enough; skip the remaining ones.
                break
    return need_remove_blocks


318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
    """Drop text rows that almost coincide with an interline-equation row.

    For every pair of a ``BlockType.InterlineEquation`` row and a
    ``BlockType.Text`` row whose boxes have ``calculate_iou(...) > 0.8``,
    the text row is removed, i.e. the equation box is trusted.

    Args:
        all_bboxes: Bbox rows with the box in entries 0-3 and the block type
            at index 7. The list is modified in place.

    Returns:
        The same ``all_bboxes`` list object, after removal.
    """
    # Gather the two kinds of rows up front.
    text_blocks = [row for row in all_bboxes if row[7] == BlockType.Text]
    interline_equation_blocks = [
        row for row in all_bboxes if row[7] == BlockType.InterlineEquation
    ]

    need_remove = []
    for interline_equation_block in interline_equation_blocks:
        equation_bbox = interline_equation_block[:4]
        for text_block in text_blocks:
            if (
                calculate_iou(equation_bbox, text_block[:4]) > 0.8
                and text_block not in need_remove
            ):
                need_remove.append(text_block)

    # list.remove() deletes the first row that compares equal.
    for block in need_remove:
        all_bboxes.remove(block)

    return all_bboxes


赵小蒙's avatar
赵小蒙 committed
346
347
348
349
350
351
352
353
354
355
def fix_text_overlap_title_blocks(all_bboxes):
    """Drop title rows that almost coincide with a text row.

    For every pair of a ``BlockType.Text`` row and a ``BlockType.Title`` row
    whose boxes have ``calculate_iou(...) > 0.8``, the title row is removed,
    i.e. the text box is trusted.

    Args:
        all_bboxes: Bbox rows with the box in entries 0-3 and the block type
            at index 7. The list is modified in place.

    Returns:
        The same ``all_bboxes`` list object, after removal.
    """
    # Gather the two kinds of rows up front.
    text_blocks = [row for row in all_bboxes if row[7] == BlockType.Text]
    title_blocks = [row for row in all_bboxes if row[7] == BlockType.Title]

    need_remove = []
    for text_block in text_blocks:
        text_block_bbox = text_block[:4]
        for title_block in title_blocks:
            if (
                calculate_iou(text_block_bbox, title_block[:4]) > 0.8
                and title_block not in need_remove
            ):
                need_remove.append(title_block)

    # list.remove() deletes the first row that compares equal.
    for block in need_remove:
        all_bboxes.remove(block)

    return all_bboxes


def remove_need_drop_blocks(all_bboxes, discarded_blocks):
    """Remove rows that are mostly covered by a discarded block.

    A row is removed when, for at least one discarded block,
    ``calculate_overlap_area_in_bbox1_area_ratio(row_bbox, discarded_bbox)``
    exceeds 0.6, i.e. the discarded box is trusted over the row.

    Args:
        all_bboxes: Bbox rows with the box in entries 0-3. The list is
            modified in place.
        discarded_blocks: Mappings with a ``'bbox'`` entry (the raw blocks,
            not bbox rows).

    Returns:
        The same ``all_bboxes`` list object, after removal.
    """
    need_remove = []
    for block in all_bboxes:
        block_bbox = block[:4]
        for discarded_block in discarded_blocks:
            if (
                calculate_overlap_area_in_bbox1_area_ratio(
                    block_bbox, discarded_block['bbox']
                )
                > 0.6
            ):
                if block not in need_remove:
                    need_remove.append(block)
                # One covering discarded block is enough.
                break

    # list.remove() deletes the first row that compares equal.
    for block in need_remove:
        all_bboxes.remove(block)

    return all_bboxes

赵小蒙's avatar
赵小蒙 committed
394
395

def remove_overlaps_min_blocks(all_bboxes):
    """Merge heavily overlapping rows: grow the larger one, drop the smaller.

    The smaller block of an overlapping pair is not simply deleted. The
    surviving block's box is first enlarged to the union of both boxes, and
    only then is the smaller block removed.

    For every ordered pair of unequal rows,
    ``get_minbox_if_overlap_by_ratio(bbox1, bbox2, 0.8)`` is consulted. When
    it returns a box, the first row in ``all_bboxes`` whose coordinates equal
    that box is taken as the block to remove.

    Args:
        all_bboxes: Bbox rows (lists) with the box in entries 0-3. Both the
            list and the surviving rows are modified in place.

    Returns:
        The same ``all_bboxes`` list object, after merging and removal.
    """
    need_remove = []
    for block1 in all_bboxes:
        for block2 in all_bboxes:
            # Compared by value, so a row is never paired with itself or
            # with an identical copy of itself.
            if block1 == block2:
                continue

            overlap_box = get_minbox_if_overlap_by_ratio(
                block1[:4], block2[:4], 0.8
            )
            if overlap_box is None:
                continue

            # Map the returned box back to a row by coordinates.
            # NOTE(review): this only matches if the helper returns the box
            # as a list equal to row[:4] -- confirm against boxbase.
            block_to_remove = next(
                (block for block in all_bboxes if block[:4] == overlap_box),
                None,
            )
            if block_to_remove is None or block_to_remove in need_remove:
                continue

            # The other member of the pair survives and absorbs the smaller
            # box. Coordinates are updated immediately, so later pairs in
            # this same scan see the enlarged box.
            large_block = block1 if block1 != block_to_remove else block2
            x1, y1, x2, y2 = large_block[:4]
            sx1, sy1, sx2, sy2 = block_to_remove[:4]
            large_block[:4] = [
                min(x1, sx1),
                min(y1, sy1),
                max(x2, sx2),
                max(y2, sy2),
            ]
            need_remove.append(block_to_remove)

    # list.remove() deletes the first row that compares equal.
    for block in need_remove:
        all_bboxes.remove(block)

    return all_bboxes