ocr_detect_all_bboxes.py 2.92 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
from magic_pdf.libs.drop_tag import DropTag
赵小蒙's avatar
赵小蒙 committed
3
4
5
from magic_pdf.libs.ocr_content_type import BlockType


赵小蒙's avatar
赵小蒙 committed
6
7
8
9
10
11
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
                                        title_blocks, interline_equation_blocks, page_w, page_h):
    """Collect the page's blocks into one flat list of typed bbox rows.

    Every input block is read through its ``'bbox'`` entry, which must unpack
    into exactly four values ``(x0, y0, x1, y1)``. Each block becomes a
    12-element list: the four coordinates, three ``None`` placeholders, the
    ``BlockType`` at index 7, then four more ``None`` placeholders.

    Rows are appended in this order: images, tables, texts, titles, interline
    equations, and finally the discarded blocks that qualify as footnotes.

    Args:
        img_blocks: iterable of image blocks.
        table_blocks: iterable of table blocks.
        discarded_blocks: iterable of discarded blocks; only the ones that
            pass the footnote filter below are kept.
        text_blocks: iterable of text blocks.
        title_blocks: iterable of title blocks.
        interline_equation_blocks: iterable of interline-equation blocks.
        page_w: page width, used by the footnote width threshold.
        page_h: page height, used by the footnote position threshold.

    Returns:
        The list of rows as returned by ``remove_overlaps_min_blocks``; the
        dropped blocks it reports are not used here.
    """

    def _make_row(bbox, block_type):
        # Unpacking (rather than slicing) keeps the "exactly four values" check.
        left, top, right, bottom = bbox
        return [left, top, right, bottom, None, None, None, block_type, None, None, None, None]

    typed_groups = (
        (img_blocks, BlockType.Image),
        (table_blocks, BlockType.Table),
        (text_blocks, BlockType.Text),
        (title_blocks, BlockType.Title),
        (interline_equation_blocks, BlockType.InterlineEquation),
    )

    all_bboxes = []
    for group, block_type in typed_groups:
        for block in group:
            all_bboxes.append(_make_row(block['bbox'], block_type))

    # From discarded_blocks keep only blocks that are wider than 1/3 of the
    # page width, taller than 10, and start in the lower 50% of the page
    # (i.e. restrict them to footnotes).
    for discarded in discarded_blocks:
        left, top, right, bottom = discarded['bbox']
        if not ((right - left) > (page_w / 3)):
            continue
        if not ((bottom - top) > 10):
            continue
        if not (top > (page_h / 2)):
            continue
        all_bboxes.append(_make_row((left, top, right, bottom), BlockType.Footnote))

    # Resolve nested / overlapping blocks.
    # TODO: 1. when a large text block contains a smaller one, delete the small one
    #       2. when an image or text box overlaps a discarded box, trust the discarded box
    #       3. when a text box overlaps a title box, trust the text box
    all_bboxes, _dropped_blocks = remove_overlaps_min_blocks(all_bboxes)

    return all_bboxes

赵小蒙's avatar
赵小蒙 committed
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60

def remove_overlaps_min_blocks(all_bboxes):
    """Remove the smaller block of every heavily-overlapping pair of blocks.

    Each block is a sequence whose first four items are ``x0, y0, x1, y1``.
    Every ordered pair of unequal blocks is passed to
    ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.8. When that helper
    returns a box, the first block still present in ``all_bboxes`` with the
    same coordinates is removed.
    NOTE(review): the helper is presumed to return the smaller of the two
    boxes when the overlap exceeds the ratio, and ``None`` otherwise -- confirm
    against ``magic_pdf.libs.boxbase``.

    Args:
        all_bboxes: list of blocks. It is modified in place.

    Returns:
        A tuple ``(all_bboxes, dropped_blocks)``. ``all_bboxes`` is the same
        list object that was passed in, minus the removed blocks.
        ``dropped_blocks`` holds one dict per removed block with the keys
        ``'bbox'`` (its four coordinates), ``'block'`` (the removed row itself)
        and ``'tag'`` (``DropTag.BLOCK_OVERLAP``).
    """
    dropped_blocks = []
    # Iterate over a snapshot so removals from all_bboxes do not disturb the loops.
    snapshot = list(all_bboxes)
    for block1 in snapshot:
        for block2 in snapshot:
            if block1 == block2:
                continue
            block1_box = (block1[0], block1[1], block1[2], block1[3])
            block2_box = (block2[0], block2[1], block2[2], block2[3])
            overlap_box = get_minbox_if_overlap_by_ratio(block1_box, block2_box, 0.8)
            if overlap_box is None:
                continue
            # Compare as tuples: a list never equals a tuple in Python, so the
            # match must not depend on which sequence type the helper returns.
            target = tuple(overlap_box)
            bbox_to_remove = next(
                (block for block in all_bboxes
                 if (block[0], block[1], block[2], block[3]) == target),
                None)
            if bbox_to_remove is None:
                # Already removed while handling an earlier pair.
                continue
            all_bboxes.remove(bbox_to_remove)
            # Blocks are list rows, so the tag cannot be stored on the row via a
            # string key; record the drop in a dict instead.
            dropped_blocks.append({
                'bbox': [bbox_to_remove[0], bbox_to_remove[1], bbox_to_remove[2], bbox_to_remove[3]],
                'block': bbox_to_remove,
                'tag': DropTag.BLOCK_OVERLAP,
            })
    return all_bboxes, dropped_blocks