ocr_detect_all_bboxes.py 11.1 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
from loguru import logger

赵小蒙's avatar
赵小蒙 committed
3
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
4
    calculate_iou, calculate_vertical_projection_overlap_ratio
赵小蒙's avatar
赵小蒙 committed
5
from magic_pdf.libs.drop_tag import DropTag
赵小蒙's avatar
赵小蒙 committed
6
from magic_pdf.libs.ocr_content_type import BlockType
7
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
赵小蒙's avatar
赵小蒙 committed
8
9


赵小蒙's avatar
赵小蒙 committed
10
11
12
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
                                        title_blocks, interline_equation_blocks, page_w, page_h):
    """Collect all detected blocks of a page into flat rows and resolve overlaps.

    Every input block is a dict with at least a ``'bbox'`` entry
    (``x0, y0, x1, y1``) and a ``"score"`` entry. Each block becomes a
    13-element list: indices 0-3 hold the bbox, index 7 the ``BlockType``,
    index 12 the score; the remaining slots are ``None`` placeholders
    (NOTE(review): presumably filled in by later layout stages -- confirm).

    Args:
        img_blocks: image blocks.
        table_blocks: table blocks.
        discarded_blocks: blocks the detector marked as discarded.
        text_blocks: text blocks.
        title_blocks: title blocks.
        interline_equation_blocks: interline (display) equation blocks.
        page_w: page width, used for the footnote width threshold.
        page_h: page height, used for the footnote position threshold.

    Returns:
        A tuple ``(all_bboxes, all_discarded_blocks, drop_reasons)`` where
        ``drop_reasons`` is the second value returned by
        ``remove_overlap_between_bbox_for_block``.
    """
    all_bboxes = []
    all_discarded_blocks = []

    # Rows are appended in this fixed order: image, table, text, title, equation.
    typed_groups = (
        (img_blocks, BlockType.Image),
        (table_blocks, BlockType.Table),
        (text_blocks, BlockType.Text),
        (title_blocks, BlockType.Title),
        (interline_equation_blocks, BlockType.InterlineEquation),
    )
    for blocks, block_type in typed_groups:
        for block in blocks:
            x0, y0, x1, y1 = block['bbox']
            all_bboxes.append(
                [x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"]]
            )

    # --- Resolve nested / overlapping blocks ---
    # A text block and a title block overlap: trust the text block.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # Any block overlaps a discarded block: trust the discarded block.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Conflicts between an interline equation and a text block come in two flavours:
    # 1. IoU close to 1: trust the interline-equation block.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # 2. The equation lies inside a much larger text block: trust the text block.
    #    Nothing to do here; the "large box contains small box" pass below drops it.

    # A discarded block that is wider than 1/3 of the page, taller than 10 and
    # starts in the lower half of the page is additionally treated as a footnote.
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        all_discarded_blocks.append(
            [x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None, discarded["score"]]
        )
        # Footnotes also go into all_bboxes so they take part in layout computation.
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
            all_bboxes.append(
                [x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None, discarded["score"]]
            )

    # After the steps above a large box may still contain a small one: remove the small one.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
    # Separate the remaining bboxes so the later layout split does not fail.
    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)

    return all_bboxes, all_discarded_blocks, drop_reasons


63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def add_bboxes(blocks, block_type, bboxes):
    """Append one flat row per block to ``bboxes`` (mutated in place).

    Each row is ``[x0, y0, x1, y1, None, None, None, block_type, None, None,
    None, None, score]``. For image/table body, caption and footnote types the
    block's ``"group_id"`` is appended as an extra trailing element.

    Args:
        blocks: iterable of dicts providing ``'bbox'`` and ``"score"`` (and
            ``"group_id"`` for the image/table sub-types).
        block_type: the ``BlockType`` value stored at index 7 of every row.
        bboxes: list that receives the new rows.
    """
    for item in blocks:
        left, top, right, bottom = item['bbox']
        row = [left, top, right, bottom, None, None, None, block_type, None, None, None, None, item["score"]]
        # Only image/table sub-blocks carry a group id.
        if block_type in (
            BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote,
        ):
            row.append(item["group_id"])
        bboxes.append(row)


def ocr_prepare_bboxes_for_layout_split_v2(
        img_body_blocks, img_caption_blocks, img_footnote_blocks,
        table_body_blocks, table_caption_blocks, table_footnote_blocks,
        discarded_blocks, text_blocks, title_blocks, interline_equation_blocks, page_w, page_h
):
    """Collect a page's blocks (with image/table sub-types) and resolve overlaps.

    Blocks are converted to flat rows by ``add_bboxes``; the overlap rules are
    then applied, blocks lying under a detected footnote are moved to the
    discarded list, and the remaining boxes are separated.

    Args:
        img_body_blocks, img_caption_blocks, img_footnote_blocks: image sub-blocks.
        table_body_blocks, table_caption_blocks, table_footnote_blocks: table sub-blocks.
        discarded_blocks: blocks the detector marked as discarded.
        text_blocks: text blocks.
        title_blocks: title blocks.
        interline_equation_blocks: interline (display) equation blocks.
        page_w: page width, used for the footnote width threshold.
        page_h: page height, used for the footnote position threshold.

    Returns:
        A tuple ``(all_bboxes, all_discarded_blocks)``. The drop reasons
        produced by ``remove_overlap_between_bbox_for_block`` are not returned.
    """
    all_bboxes = []

    add_bboxes(img_body_blocks, BlockType.ImageBody, all_bboxes)
    add_bboxes(img_caption_blocks, BlockType.ImageCaption, all_bboxes)
    add_bboxes(img_footnote_blocks, BlockType.ImageFootnote, all_bboxes)
    add_bboxes(table_body_blocks, BlockType.TableBody, all_bboxes)
    add_bboxes(table_caption_blocks, BlockType.TableCaption, all_bboxes)
    add_bboxes(table_footnote_blocks, BlockType.TableFootnote, all_bboxes)
    add_bboxes(text_blocks, BlockType.Text, all_bboxes)
    add_bboxes(title_blocks, BlockType.Title, all_bboxes)
    add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes)

    # --- Resolve nested / overlapping blocks ---
    # A text block and a title block overlap: trust the text block.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # Any block overlaps a discarded block: trust the discarded block.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Conflicts between an interline equation and a text block come in two flavours:
    # 1. IoU close to 1: trust the interline-equation block.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # 2. The equation lies inside a much larger text block: trust the text block.
    #    Nothing to do here; the "large box contains small box" pass below drops it.

    # Discarded blocks, converted to the same row format.
    all_discarded_blocks = []
    add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)

    # Footnote detection: wider than 1/3 of the page, taller than 10, and
    # starting in the lower half of the page.
    footnote_blocks = []
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
            footnote_blocks.append([x0, y0, x1, y1])

    # Anything located under a footnote is moved from the kept list to the discarded list.
    for block in find_blocks_under_footnote(all_bboxes, footnote_blocks):
        all_bboxes.remove(block)
        all_discarded_blocks.append(block)

    # After the steps above a large box may still contain a small one: remove the small one.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
    # Separate the remaining bboxes so the later layout split does not fail.
    # The second return value (drop reasons) is intentionally unused here.
    all_bboxes, _ = remove_overlap_between_bbox_for_block(all_bboxes)

    return all_bboxes, all_discarded_blocks
赵小蒙's avatar
赵小蒙 committed
129
130


131
132
133
134
135
136
137
138
139
140
141
142
143
144
def find_blocks_under_footnote(all_bboxes, footnote_blocks):
    """Return the blocks of ``all_bboxes`` that lie underneath some footnote.

    A block qualifies when its ``y0`` is at or past a footnote's ``y1`` and
    ``calculate_vertical_projection_overlap_ratio(block_bbox, footnote_bbox)``
    is at least 0.8. Each qualifying block is reported once.

    Args:
        all_bboxes: rows whose first four items are ``x0, y0, x1, y1``.
        footnote_blocks: footnote bboxes, each exactly ``[x0, y0, x1, y1]``.

    Returns:
        A new list holding the matching rows; ``all_bboxes`` is not modified.
    """
    collected = []
    for candidate in all_bboxes:
        cx0, cy0, cx1, cy1 = candidate[:4]
        for note_bbox in footnote_blocks:
            _, _, _, note_bottom = note_bbox
            # The block must start at or after the footnote's y1.
            if not (cy0 >= note_bottom):
                continue
            coverage = calculate_vertical_projection_overlap_ratio((cx0, cy0, cx1, cy1), note_bbox)
            if not (coverage >= 0.8):
                continue
            if candidate in collected:
                continue
            collected.append(candidate)
            # One matching footnote is enough for this block.
            break
    return collected


145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
    """Drop text blocks that almost coincide with an interline-equation block.

    For every (interline equation, text) pair whose ``calculate_iou`` exceeds
    0.8, the text block is removed and the equation block is kept.

    Args:
        all_bboxes: rows with the bbox at indices 0-3 and the ``BlockType`` at
            index 7. The list is modified in place.

    Returns:
        The same ``all_bboxes`` list after removal.
    """
    # Split out the two block kinds that can conflict.
    paragraphs = [row for row in all_bboxes if row[7] == BlockType.Text]
    equations = [row for row in all_bboxes if row[7] == BlockType.InterlineEquation]

    doomed = []
    for equation in equations:
        for paragraph in paragraphs:
            if calculate_iou(equation[:4], paragraph[:4]) > 0.8 and paragraph not in doomed:
                doomed.append(paragraph)

    for paragraph in doomed:
        all_bboxes.remove(paragraph)

    return all_bboxes


赵小蒙's avatar
赵小蒙 committed
173
174
175
176
177
178
179
180
181
182
def fix_text_overlap_title_blocks(all_bboxes):
    """Drop title blocks that almost coincide with a text block.

    For every (text, title) pair whose ``calculate_iou`` exceeds 0.8, the
    title block is removed and the text block is kept.

    Args:
        all_bboxes: rows with the bbox at indices 0-3 and the ``BlockType`` at
            index 7. The list is modified in place.

    Returns:
        The same ``all_bboxes`` list after removal.
    """
    # Extract all text and title blocks first.
    text_blocks = [block for block in all_bboxes if block[7] == BlockType.Text]
    title_blocks = [block for block in all_bboxes if block[7] == BlockType.Title]

    need_remove = []
    for text_block in text_blocks:
        for title_block in title_blocks:
            text_block_bbox = text_block[:4]
            title_block_bbox = title_block[:4]
            if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
                if title_block not in need_remove:
                    need_remove.append(title_block)

    for block in need_remove:
        all_bboxes.remove(block)

    return all_bboxes


def remove_need_drop_blocks(all_bboxes, discarded_blocks):
    """Remove every block that is mostly covered by a discarded block.

    A block is removed when
    ``calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_bbox)``
    exceeds 0.6 for at least one discarded block.

    Args:
        all_bboxes: rows whose first four items are the bbox. The list is
            modified in place.
        discarded_blocks: dicts providing a ``'bbox'`` entry.

    Returns:
        The same ``all_bboxes`` list after removal.
    """
    need_remove = []
    for block in all_bboxes:
        for discarded_block in discarded_blocks:
            block_bbox = block[:4]
            if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6:
                if block not in need_remove:
                    need_remove.append(block)
                    # One covering discarded block is enough; move on to the next block.
                    break

    for block in need_remove:
        all_bboxes.remove(block)

    return all_bboxes

赵小蒙's avatar
赵小蒙 committed
216
217

def remove_overlaps_min_blocks(all_bboxes):
    """Merge the smaller block of each heavily overlapping pair into the other.

    The smaller block is not simply deleted: the other block's bbox is first
    grown to the union of both boxes, then the smaller block is removed.
    ``get_minbox_if_overlap_by_ratio(bbox1, bbox2, 0.8)`` decides whether a
    pair overlaps and which bbox is the smaller one.

    Args:
        all_bboxes: rows whose first four items are the bbox. The list and the
            surviving rows' bboxes are modified in place.

    Returns:
        The same ``all_bboxes`` list after merging and removal.
    """
    need_remove = []
    for block1 in all_bboxes:
        for block2 in all_bboxes:
            # Rows are compared by value, so equal rows are never paired with each other.
            if block1 != block2:
                block1_bbox = block1[:4]
                block2_bbox = block2[:4]
                overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
                if overlap_box is None:
                    continue
                # Look up the row that owns the smaller bbox (first row with that bbox).
                block_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
                if block_to_remove is None or block_to_remove in need_remove:
                    continue
                large_block = block1 if block1 != block_to_remove else block2
                x1, y1, x2, y2 = large_block[:4]
                sx1, sy1, sx2, sy2 = block_to_remove[:4]
                # Grow the surviving block to the union of both boxes.
                large_block[:4] = [min(x1, sx1), min(y1, sy1), max(x2, sx2), max(y2, sy2)]
                need_remove.append(block_to_remove)

    for block in need_remove:
        all_bboxes.remove(block)

    return all_bboxes