ocr_detect_all_bboxes.py 6.42 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
from loguru import logger

赵小蒙's avatar
赵小蒙 committed
3
4
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
    calculate_iou
赵小蒙's avatar
赵小蒙 committed
5
from magic_pdf.libs.drop_tag import DropTag
赵小蒙's avatar
赵小蒙 committed
6
from magic_pdf.libs.ocr_content_type import BlockType
7
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
赵小蒙's avatar
赵小蒙 committed
8
9


赵小蒙's avatar
赵小蒙 committed
10
11
12
def _make_layout_row(block, block_type):
    """Build the 13-slot list that represents one block during layout splitting.

    Slots 0-3 hold the bbox (x0, y0, x1, y1), slot 7 holds the block type and
    slot 12 holds the block's score. Every other slot is initialised to None
    here; what later stages store in them is not visible from this function.

    Args:
        block: mapping with a 4-item 'bbox' entry and a "score" entry.
        block_type: the BlockType value to record in slot 7.

    Returns:
        A new 13-element list.
    """
    x0, y0, x1, y1 = block['bbox']
    return [x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"]]


def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
                                        title_blocks, interline_equation_blocks, page_w, page_h):
    """Merge the per-type detection results into de-conflicted bbox rows for layout splitting.

    Args:
        img_blocks, table_blocks, text_blocks, title_blocks,
        interline_equation_blocks: iterables of mappings, each with a 'bbox'
            (x0, y0, x1, y1) and a "score".
        discarded_blocks: same shape; blocks the detector marked as discarded.
        page_w: page width, used for the footnote width threshold.
        page_h: page height, used for the footnote position threshold.

    Returns:
        A tuple (all_bboxes, all_discarded_blocks, drop_reasons):
        all_bboxes are the surviving rows (including discarded blocks promoted
        to footnotes), all_discarded_blocks are the rows built from
        discarded_blocks after nested-box removal, and drop_reasons is the
        second value returned by remove_overlap_between_bbox_for_block.
    """
    # Order matters: rows are appended image, table, text, title, equation.
    typed_groups = (
        (img_blocks, BlockType.Image),
        (table_blocks, BlockType.Table),
        (text_blocks, BlockType.Text),
        (title_blocks, BlockType.Title),
        (interline_equation_blocks, BlockType.InterlineEquation),
    )
    all_bboxes = []
    for blocks, block_type in typed_groups:
        all_bboxes.extend(_make_layout_row(block, block_type) for block in blocks)

    # --- Resolve nested / conflicting blocks ---
    # A text box overlapping a title box: trust the text box.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # Any box overlapping a discarded box: trust the discarded box.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Interline-equation vs. text conflicts are handled in two ways:
    # 1. When the IoU between the equation box and a text box is close to 1,
    #    trust the interline-equation box.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # 2. When the equation box is contained in a much larger text box, trust
    #    the text box; the equation box is left to the nested-box removal below.

    # Every discarded block is recorded in all_discarded_blocks. Only those
    # wider than 1/3 of the page, taller than 10 and starting in the lower half
    # of the page (i.e. footnote candidates) are also added to all_bboxes so
    # that they take part in the layout computation.
    all_discarded_blocks = []
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        all_discarded_blocks.append(_make_layout_row(discarded, BlockType.Discarded))
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
            all_bboxes.append(_make_layout_row(discarded, BlockType.Footnote))

    # If a large box still contains a small one after the steps above, drop the small one.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)

    # Separate the remaining boxes so that the later layout split does not fail.
    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)

    return all_bboxes, all_discarded_blocks, drop_reasons
赵小蒙's avatar
赵小蒙 committed
61
62


63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
    """Drop text rows that almost coincide with an interline-equation row.

    A text row is removed from ``all_bboxes`` when its bbox (slots 0-3) has an
    IoU greater than 0.8 with the bbox of any interline-equation row.

    Args:
        all_bboxes: list of block rows; slot 7 holds the block type.

    Returns:
        The same list object, modified in place.
    """
    # Collect both groups up front so the removal below never races the scan.
    texts = [row for row in all_bboxes if row[7] == BlockType.Text]
    equations = [row for row in all_bboxes if row[7] == BlockType.InterlineEquation]

    doomed = []
    for equation in equations:
        for text in texts:
            # The equation wins on a near-total overlap; queue each text row once.
            if calculate_iou(equation[:4], text[:4]) > 0.8 and text not in doomed:
                doomed.append(text)

    for text in doomed:
        all_bboxes.remove(text)

    return all_bboxes


赵小蒙's avatar
赵小蒙 committed
91
92
93
94
95
96
97
98
99
100
def fix_text_overlap_title_blocks(all_bboxes):
    """Drop title rows that almost coincide with a text row.

    A title row is removed from ``all_bboxes`` when its bbox (slots 0-3) has an
    IoU greater than 0.8 with the bbox of any text row.

    Args:
        all_bboxes: list of block rows; slot 7 holds the block type.

    Returns:
        The same list object, modified in place.
    """
    # Collect both groups up front so the removal below never races the scan.
    texts = [row for row in all_bboxes if row[7] == BlockType.Text]
    titles = [row for row in all_bboxes if row[7] == BlockType.Title]

    doomed = []
    for text in texts:
        for title in titles:
            # The text box wins on a near-total overlap; queue each title once.
            if calculate_iou(text[:4], title[:4]) > 0.8 and title not in doomed:
                doomed.append(title)

    for title in doomed:
        all_bboxes.remove(title)

    return all_bboxes


def remove_need_drop_blocks(all_bboxes, discarded_blocks):
    """Drop rows that are mostly covered by a discarded block.

    A row is removed when calculate_overlap_area_in_bbox1_area_ratio, called
    with the row's bbox (slots 0-3) first and a discarded block's 'bbox'
    second, returns more than 0.6.

    Args:
        all_bboxes: list of block rows.
        discarded_blocks: iterable of mappings with a 'bbox' entry.

    Returns:
        The same ``all_bboxes`` list object, modified in place.
    """
    doomed = []
    for candidate in all_bboxes:
        for dropped in discarded_blocks:
            covered = calculate_overlap_area_in_bbox1_area_ratio(candidate[:4], dropped['bbox'])
            if covered > 0.6 and candidate not in doomed:
                doomed.append(candidate)
                # One sufficiently overlapping discarded block is enough.
                break

    for candidate in doomed:
        all_bboxes.remove(candidate)

    return all_bboxes

赵小蒙's avatar
赵小蒙 committed
134
135
136

def remove_overlaps_min_blocks(all_bboxes):
    """Among overlapping rows, delete the smaller ones.

    Every ordered pair of unequal rows is passed to
    get_minbox_if_overlap_by_ratio with a ratio of 0.8. When it returns a bbox,
    the first row in ``all_bboxes`` whose bbox (slots 0-3) equals that bbox is
    queued for removal.

    Args:
        all_bboxes: list of block rows.

    Returns:
        The same list object, modified in place.
    """
    doomed = []
    for first in all_bboxes:
        for second in all_bboxes:
            # Rows that compare equal are never tested against each other.
            if first == second:
                continue

            smaller_bbox = get_minbox_if_overlap_by_ratio(first[:4], second[:4], 0.8)
            if smaller_bbox is None:
                continue

            # Map the returned bbox back to the first row that carries it.
            victim = None
            for row in all_bboxes:
                if row[:4] == smaller_bbox:
                    victim = row
                    break

            if victim is not None and victim not in doomed:
                doomed.append(victim)

    for victim in doomed:
        all_bboxes.remove(victim)

    return all_bboxes