# ocr_detect_all_bboxes.py
from loguru import logger

from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
    calculate_iou
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block


def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
                                        title_blocks, interline_equation_blocks, page_w, page_h):
    """Flatten the detected blocks of a page into bbox records and resolve overlaps.

    Each input block is a dict providing a 4-value ``'bbox'`` and a ``'score'``.
    Each produced record is a 13-element list::

        [x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, score]

    Args:
        img_blocks: image blocks.
        table_blocks: table blocks.
        discarded_blocks: blocks flagged as discarded.
        text_blocks: text blocks.
        title_blocks: title blocks.
        interline_equation_blocks: interline equation blocks.
        page_w: page width, used by the footnote heuristic.
        page_h: page height, used by the footnote heuristic.

    Returns:
        A tuple ``(all_bboxes, all_discarded_blocks, drop_reasons)`` where
        ``drop_reasons`` is the second value returned by
        ``remove_overlap_between_bbox_for_block``.
    """
    all_bboxes = []
    all_discarded_blocks = []

    # Records are appended group by group, in this fixed order.
    typed_groups = (
        (img_blocks, BlockType.Image),
        (table_blocks, BlockType.Table),
        (text_blocks, BlockType.Text),
        (title_blocks, BlockType.Title),
        (interline_equation_blocks, BlockType.InterlineEquation),
    )
    for blocks, block_type in typed_groups:
        for block in blocks:
            x0, y0, x1, y1 = block['bbox']
            all_bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"]])

    # --- Resolve nested / overlapping blocks ---
    # Text box overlapping a title box: the text box is trusted.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # Any box overlapping a discarded box: the discarded box is trusted.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Interline equation vs. text box conflicts are handled in two cases.
    # Case 1: the IoU is close to 1 -> the interline equation box is trusted.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # Case 2: the equation box lies inside a much larger text box -> the text
    # box is trusted; the equation box is removed by the "large box contains
    # small box" pass further down.

    # Every discarded block is recorded; those wider than 1/3 of the page,
    # taller than 10 and starting in the lower half of the page are also
    # treated as footnotes.
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None, discarded["score"]])
        # Footnotes are added to all_bboxes so they take part in the layout computation.
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
            all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None, discarded["score"]])

    # After the steps above a large box may still contain a small one: drop the small box.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
    # Separate the remaining bboxes so the later layout split does not fail.
    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)

    return all_bboxes, all_discarded_blocks, drop_reasons


def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_blocks, text_blocks,
                                        title_blocks, interline_equation_blocks, page_w, page_h):
    """Flatten the detected blocks of a page into bbox records and resolve overlaps.

    Variant of ``ocr_prepare_bboxes_for_layout_split`` that never adds
    discarded blocks to ``all_bboxes`` as footnotes.

    Each input block is a dict providing a 4-value ``'bbox'`` and a ``'score'``.
    Each produced record is a 13-element list::

        [x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, score]

    Args:
        img_blocks: image blocks.
        table_blocks: table blocks.
        discarded_blocks: blocks flagged as discarded.
        text_blocks: text blocks.
        title_blocks: title blocks.
        interline_equation_blocks: interline equation blocks.
        page_w: page width; not used by this variant, kept in the signature.
        page_h: page height; not used by this variant, kept in the signature.

    Returns:
        A tuple ``(all_bboxes, all_discarded_blocks, drop_reasons)`` where
        ``drop_reasons`` is the second value returned by
        ``remove_overlap_between_bbox_for_block``.
    """
    all_bboxes = []
    all_discarded_blocks = []

    # Records are appended group by group, in this fixed order.
    typed_groups = (
        (img_blocks, BlockType.Image),
        (table_blocks, BlockType.Table),
        (text_blocks, BlockType.Text),
        (title_blocks, BlockType.Title),
        (interline_equation_blocks, BlockType.InterlineEquation),
    )
    for blocks, block_type in typed_groups:
        for block in blocks:
            x0, y0, x1, y1 = block['bbox']
            all_bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"]])

    # --- Resolve nested / overlapping blocks ---
    # Text box overlapping a title box: the text box is trusted.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # Any box overlapping a discarded box: the discarded box is trusted.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Interline equation vs. text box conflicts are handled in two cases.
    # Case 1: the IoU is close to 1 -> the interline equation box is trusted.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # Case 2: the equation box lies inside a much larger text box -> the text
    # box is trusted; the equation box is removed by the "large box contains
    # small box" pass further down.

    # Discarded blocks are only recorded here. Unlike the first version, wide
    # boxes in the lower half of the page are NOT promoted to footnotes.
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None, discarded["score"]])

    # After the steps above a large box may still contain a small one: drop the small box.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
    # Separate the remaining bboxes so the later layout split does not fail.
    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)

    return all_bboxes, all_discarded_blocks, drop_reasons


def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
    """Drop text blocks that almost coincide with an interline equation block.

    A text record is removed when ``calculate_iou(equation_bbox, text_bbox)``
    exceeds 0.8 for at least one interline equation record; the equation
    record is kept.

    Args:
        all_bboxes: list of bbox records; the first four items are the bbox
            and item 7 is the block type. The list is filtered in place.

    Returns:
        The same ``all_bboxes`` list object, after filtering.
    """
    equation_bboxes = [
        block[:4] for block in all_bboxes if block[7] == BlockType.InterlineEquation
    ]

    def _shadowed_by_equation(text_block):
        text_bbox = text_block[:4]
        return any(
            calculate_iou(equation_bbox, text_bbox) > 0.8
            for equation_bbox in equation_bboxes
        )

    # Filtering by position (not by value lookup) also removes text records
    # that are value-identical duplicates of each other.
    all_bboxes[:] = [
        block for block in all_bboxes
        if not (block[7] == BlockType.Text and _shadowed_by_equation(block))
    ]
    return all_bboxes


def fix_text_overlap_title_blocks(all_bboxes):
    """Drop title blocks that almost coincide with a text block.

    A title record is removed when ``calculate_iou(text_bbox, title_bbox)``
    exceeds 0.8 for at least one text record; the text record is kept.

    Args:
        all_bboxes: list of bbox records; the first four items are the bbox
            and item 7 is the block type. The list is filtered in place.

    Returns:
        The same ``all_bboxes`` list object, after filtering.
    """
    text_bboxes = [block[:4] for block in all_bboxes if block[7] == BlockType.Text]

    def _shadowed_by_text(title_block):
        title_bbox = title_block[:4]
        return any(
            calculate_iou(text_bbox, title_bbox) > 0.8
            for text_bbox in text_bboxes
        )

    # Filtering by position (not by value lookup) also removes title records
    # that are value-identical duplicates of each other.
    all_bboxes[:] = [
        block for block in all_bboxes
        if not (block[7] == BlockType.Title and _shadowed_by_text(block))
    ]
    return all_bboxes


def remove_need_drop_blocks(all_bboxes, discarded_blocks):
    """Drop every block that is mostly covered by a discarded block.

    A record is removed when
    ``calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_bbox)``
    exceeds 0.6 for at least one discarded block (presumably the overlap area
    as a share of the block's own area -- confirm against the helper).

    Args:
        all_bboxes: list of bbox records whose first four items are the bbox.
            The list is filtered in place.
        discarded_blocks: dicts providing a ``'bbox'`` entry.

    Returns:
        The same ``all_bboxes`` list object, after filtering.
    """
    def _mostly_discarded(block):
        block_bbox = block[:4]
        # any() stops at the first discarded block that matches.
        return any(
            calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6
            for discarded_block in discarded_blocks
        )

    # Filtering by position (not by value lookup) also removes records that
    # are value-identical duplicates of an already dropped record.
    all_bboxes[:] = [block for block in all_bboxes if not _mostly_discarded(block)]
    return all_bboxes


def remove_overlaps_min_blocks(all_bboxes):
    """Merge strongly overlapping blocks into the larger one and drop the smaller.

    For every ordered pair of records that differ by value,
    ``get_minbox_if_overlap_by_ratio(bbox1, bbox2, 0.8)`` is asked for a box.
    When it returns one, the first record in ``all_bboxes`` whose bbox equals
    that box is scheduled for removal. The other record of the pair has its
    bbox grown in place to the union of both bboxes, so the area of the
    removed record is not lost.

    Args:
        all_bboxes: list of bbox records (mutable lists) whose first four
            items are the bbox. Records are modified and removed in place.

    Returns:
        The same ``all_bboxes`` list object, after merging and removal.
    """
    need_remove = []
    for block1 in all_bboxes:
        for block2 in all_bboxes:
            # Value-identical records (including a record paired with itself)
            # are never compared against each other.
            if block1 == block2:
                continue

            overlap_box = get_minbox_if_overlap_by_ratio(block1[:4], block2[:4], 0.8)
            if overlap_box is None:
                continue

            # The record to drop is looked up by bbox value across the whole list.
            block_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
            if block_to_remove is None or block_to_remove in need_remove:
                continue

            large_block = block1 if block1 != block_to_remove else block2
            x1, y1, x2, y2 = large_block[:4]
            sx1, sy1, sx2, sy2 = block_to_remove[:4]
            # Grow the surviving record to the union of both bboxes. This
            # changes the record while the loops are still running, so later
            # pairs see the enlarged bbox.
            large_block[:4] = [min(x1, sx1), min(y1, sy1), max(x2, sx2), max(y2, sy2)]
            need_remove.append(block_to_remove)

    for block in need_remove:
        all_bboxes.remove(block)

    return all_bboxes