# ocr_detect_all_bboxes.py
from magic_pdf.config.ocr_content_type import BlockType
from magic_pdf.libs.boxbase import (
    calculate_iou,
    calculate_overlap_area_in_bbox1_area_ratio,
    calculate_vertical_projection_overlap_ratio,
    get_minbox_if_overlap_by_ratio,
)
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block


def add_bboxes(blocks, block_type, bboxes):
    """Append one flat bbox record per block to ``bboxes`` (in place).

    Every record is a list laid out as::

        [x0, y0, x1, y1, None, None, None, block_type,
         None, None, None, None, score]

    For image/table body, caption and footnote types the block's
    ``group_id`` is appended as an extra 14th element.

    Args:
        blocks: iterable of dicts providing a 4-item ``'bbox'`` and a
            ``'score'`` (plus ``'group_id'`` for image/table types).
        block_type: value stored at index 7 of every record.
        bboxes: list that receives the new records.

    Raises:
        KeyError: if a block lacks a key required for its type.
    """
    for block in blocks:
        x0, y0, x1, y1 = block['bbox']
        record = [
            x0,
            y0,
            x1,
            y1,
            None,
            None,
            None,
            block_type,
            None,
            None,
            None,
            None,
            block['score'],
        ]
        # Only image/table related blocks belong to a group.
        if block_type in (
            BlockType.ImageBody,
            BlockType.ImageCaption,
            BlockType.ImageFootnote,
            BlockType.TableBody,
            BlockType.TableCaption,
            BlockType.TableFootnote,
        ):
            record.append(block['group_id'])
        bboxes.append(record)
58
59
60


def ocr_prepare_bboxes_for_layout_split_v2(
    img_body_blocks,
    img_caption_blocks,
    img_footnote_blocks,
    table_body_blocks,
    table_caption_blocks,
    table_footnote_blocks,
    discarded_blocks,
    text_blocks,
    title_blocks,
    interline_equation_blocks,
    page_w,
    page_h,
):
    """Flatten all blocks of a page into bbox records and resolve overlaps.

    The per-type block lists are converted to flat records via
    ``add_bboxes`` and then cleaned up in this order:

    1. title blocks that heavily overlap a text block are dropped;
    2. blocks mostly covered by a discarded block are dropped;
    3. text blocks that heavily overlap an interline equation are dropped;
    4. blocks located below a detected footnote are moved to the
       discarded list;
    5. nested blocks are merged into their enclosing block (done for both
       the kept and the discarded list).

    Args:
        img_body_blocks, img_caption_blocks, img_footnote_blocks,
        table_body_blocks, table_caption_blocks, table_footnote_blocks,
        text_blocks, title_blocks, interline_equation_blocks: block dicts
            of the respective type, as accepted by ``add_bboxes``.
        discarded_blocks: block dicts already marked as discarded.
        page_w: page width, used for footnote detection.
        page_h: page height, used for footnote detection.

    Returns:
        tuple: ``(all_bboxes, all_discarded_blocks)``; ``all_bboxes`` is
        sorted by ``x0 + y0``.
    """
    all_bboxes = []

    add_bboxes(img_body_blocks, BlockType.ImageBody, all_bboxes)
    add_bboxes(img_caption_blocks, BlockType.ImageCaption, all_bboxes)
    add_bboxes(img_footnote_blocks, BlockType.ImageFootnote, all_bboxes)
    add_bboxes(table_body_blocks, BlockType.TableBody, all_bboxes)
    add_bboxes(table_caption_blocks, BlockType.TableCaption, all_bboxes)
    add_bboxes(table_footnote_blocks, BlockType.TableFootnote, all_bboxes)
    add_bboxes(text_blocks, BlockType.Text, all_bboxes)
    add_bboxes(title_blocks, BlockType.Title, all_bboxes)
    add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes)

    # --- Resolve nested / overlapping blocks ---
    # Text block overlapping a title block: trust the text block.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # Any block overlapping a discarded block: trust the discarded block.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Interline equations conflicting with title/text blocks come in two
    # flavours:
    # 1. IoU with a text block close to 1: trust the interline equation.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # 2. Equation contained in a much larger text block: trust the text
    #    block; the equation is removed by the nested-block step below.

    # --- Discarded blocks ---
    all_discarded_blocks = []
    add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)

    # Footnote detection among discarded blocks: wider than 1/3 of the page
    # width, taller than 10, and starting in the bottom 30% of the page.
    footnote_blocks = []
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h * 0.7):
            footnote_blocks.append([x0, y0, x1, y1])

    # Everything located below a footnote is moved to the discarded list.
    for block in find_blocks_under_footnote(all_bboxes, footnote_blocks):
        all_bboxes.remove(block)
        all_discarded_blocks.append(block)

    # Blocks may still be nested after the steps above: drop the inner ones.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)

    # NOTE: separating the remaining overlapping bboxes with
    # remove_overlap_between_bbox_for_block is currently disabled; the
    # records are only ordered by x0 + y0 for the later layout split.
    all_bboxes.sort(key=lambda x: x[0] + x[1])
    return all_bboxes, all_discarded_blocks


# Helpers used by ocr_prepare_bboxes_for_layout_split_v2.
def find_blocks_under_footnote(all_bboxes, footnote_blocks):
    """Return the records of ``all_bboxes`` that lie below a footnote.

    A record qualifies when its ``y0`` is greater than or equal to the
    footnote's ``y1`` and ``calculate_vertical_projection_overlap_ratio``
    of the record bbox against the footnote bbox is at least 0.8 (per the
    original comment: the footnote's vertical projection covers 80% of the
    block's).

    Args:
        all_bboxes: list of records whose first four items are
            ``x0, y0, x1, y1``.
        footnote_blocks: list of 4-item footnote bboxes.

    Returns:
        list: matching records, each value listed at most once. The input
        list is not modified.
    """
    need_remove_blocks = []
    for block in all_bboxes:
        block_x0, block_y0, block_x1, block_y1 = block[:4]
        block_bbox = (block_x0, block_y0, block_x1, block_y1)
        for footnote_bbox in footnote_blocks:
            _, _, _, footnote_y1 = footnote_bbox
            # The cheap position test runs first so the overlap ratio is
            # only computed for blocks that start at or below the footnote.
            if (
                block_y0 >= footnote_y1
                and calculate_vertical_projection_overlap_ratio(
                    block_bbox, footnote_bbox
                )
                >= 0.8
            ):
                if block not in need_remove_blocks:
                    need_remove_blocks.append(block)
                    break
    return need_remove_blocks


145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
    """Drop text records that almost coincide with an interline equation.

    When the IoU of a text record and an interline-equation record exceeds
    0.8, the equation is trusted and the text record is removed.

    Args:
        all_bboxes: list of records with the bbox in items 0-3 and the block
            type at index 7. Modified in place.

    Returns:
        list: the same ``all_bboxes`` object, after removal.
    """
    # Collect the text and interline-equation records first.
    text_blocks = [block for block in all_bboxes if block[7] == BlockType.Text]
    interline_equation_blocks = [
        block for block in all_bboxes if block[7] == BlockType.InterlineEquation
    ]

    need_remove = []
    for interline_equation_block in interline_equation_blocks:
        interline_equation_block_bbox = interline_equation_block[:4]
        for text_block in text_blocks:
            if (
                calculate_iou(interline_equation_block_bbox, text_block[:4]) > 0.8
                and text_block not in need_remove
            ):
                need_remove.append(text_block)

    for block in need_remove:
        all_bboxes.remove(block)

    return all_bboxes


# Text/title overlap resolution.
def fix_text_overlap_title_blocks(all_bboxes):
    """Drop title records that almost coincide with a text record.

    When the IoU of a text record and a title record exceeds 0.8, the text
    record is trusted and the title record is removed.

    Args:
        all_bboxes: list of records with the bbox in items 0-3 and the block
            type at index 7. Modified in place.

    Returns:
        list: the same ``all_bboxes`` object, after removal.
    """
    # Collect the text and title records first.
    text_blocks = [block for block in all_bboxes if block[7] == BlockType.Text]
    title_blocks = [block for block in all_bboxes if block[7] == BlockType.Title]

    need_remove = []
    for text_block in text_blocks:
        text_block_bbox = text_block[:4]
        for title_block in title_blocks:
            if (
                calculate_iou(text_block_bbox, title_block[:4]) > 0.8
                and title_block not in need_remove
            ):
                need_remove.append(title_block)

    for block in need_remove:
        all_bboxes.remove(block)

    return all_bboxes


def remove_need_drop_blocks(all_bboxes, discarded_blocks):
    """Remove records that are mostly covered by a discarded block.

    A record is removed when
    ``calculate_overlap_area_in_bbox1_area_ratio(record_bbox, discarded_bbox)``
    exceeds 0.6 for at least one discarded block.

    Args:
        all_bboxes: list of records with the bbox in items 0-3. Modified in
            place.
        discarded_blocks: iterable of dicts providing a ``'bbox'`` entry.

    Returns:
        list: the same ``all_bboxes`` object, after removal.
    """
    need_remove = []
    for block in all_bboxes:
        if block in need_remove:
            # An equal record is already scheduled for removal.
            continue
        block_bbox = block[:4]
        if any(
            calculate_overlap_area_in_bbox1_area_ratio(
                block_bbox, discarded_block['bbox']
            )
            > 0.6
            for discarded_block in discarded_blocks
        ):
            need_remove.append(block)

    for block in need_remove:
        all_bboxes.remove(block)

    return all_bboxes

# Nested-block merging.

def remove_overlaps_min_blocks(all_bboxes):
    """Merge heavily overlapping records, dropping the smaller one.

    Every ordered pair of unequal records is passed to
    ``get_minbox_if_overlap_by_ratio(bbox1, bbox2, 0.8)``. When that returns
    a bbox, the record carrying that bbox is scheduled for removal and the
    other record of the pair is grown in place to the union of both bboxes,
    so the smaller block is merged into the larger one rather than simply
    deleted.

    Args:
        all_bboxes: list of mutable records with the bbox in items 0-3.
            Modified in place (records are resized and removed).

    Returns:
        list: the same ``all_bboxes`` object, after merging.
    """
    need_remove = []
    for block1 in all_bboxes:
        for block2 in all_bboxes:
            if block1 == block2:
                continue

            overlap_box = get_minbox_if_overlap_by_ratio(
                block1[:4], block2[:4], 0.8
            )
            if overlap_box is None:
                continue

            # NOTE(review): the lookup scans the whole list, so the first
            # record with this bbox is picked even if it is neither block1
            # nor block2 - confirm this is intended.
            block_to_remove = next(
                (block for block in all_bboxes if block[:4] == overlap_box),
                None,
            )
            if block_to_remove is None or block_to_remove in need_remove:
                continue

            # Grow the surviving record to the union of both bboxes.
            large_block = block1 if block1 != block_to_remove else block2
            x1, y1, x2, y2 = large_block[:4]
            sx1, sy1, sx2, sy2 = block_to_remove[:4]
            large_block[:4] = [
                min(x1, sx1),
                min(y1, sy1),
                max(x2, sx2),
                max(y2, sy2),
            ]
            need_remove.append(block_to_remove)

    for block in need_remove:
        all_bboxes.remove(block)

    return all_bboxes