resolve_bbox_conflict.py.bak 7.14 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
5
6
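"""
Take the bounding boxes reported by the API for a PDF page and decide which
ones to keep based on how they overlap.
1. First drop the bboxes that lie on top of figures (figures include both
   tables and images).
2. Then drop the image bboxes that lie on top of text blocks.
"""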

7
8
9
from magic_pdf.config.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
                                    _is_left_overlap)
赵小蒙's avatar
赵小蒙 committed
10
11


赵小蒙's avatar
赵小蒙 committed
12
13
def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
                                  text_raw_blocks: list) -> tuple:
    """Resolve overlaps between image, table, equation and text bounding boxes.

    ``text_raw_blocks`` holds dicts with a ``'bbox'`` entry; per the original
    note the structure is taken straight from PyMuPDF (sample:
    test/assets/papre/pymu_textblocks.json). Equations are sequences whose
    first four items are used as their bbox.

    The current strategy is deliberately blunt:

    1. Text blocks for which ``_is_in(text_bbox, region)`` holds for an image
       or a table are tagged (``ON_IMAGE_TEXT`` / ``ON_TABLE_TEXT``) and
       removed from ``text_raw_blocks``.
    2. Equations that ``_is_in_or_part_overlap`` reports against an image or a
       table are removed.
    3. Images that overlap any remaining text block are dropped.
    4. Images that overlap each other are all dropped for now (their bboxes
       are not adjusted).

    All list arguments are modified in place and the same list objects are
    returned, so the caller's ``images`` list never ends up holding ``None``
    placeholders.

    Returns:
        An 8-tuple ``(images, tables, interline_equations, inline_equations,
        text_raw_blocks, text_block_removed, images_backup,
        text_block_removed_2)``. ``text_block_removed`` lists every dropped
        text block exactly once, ``images_backup`` lists every dropped image,
        and ``text_block_removed_2`` is currently always empty.
    """
    text_block_removed = []
    images_backup = []
    # Identity (not equality) is tracked so a block contained in several
    # regions is reported only once in ``text_block_removed``.
    removed_ids = set()

    def _mark_removed(block):
        if id(block) not in removed_ids:
            removed_ids.add(id(block))
            text_block_removed.append(block)

    # Text blocks lying on an image.
    for image_box in images:
        for text_block in text_raw_blocks:
            if _is_in(text_block['bbox'], image_box):
                text_block['tag'] = ON_IMAGE_TEXT
                _mark_removed(text_block)

    # Text blocks lying on a table. A block that is on both an image and a
    # table ends up with the table tag, as before.
    for table_box in tables:
        for text_block in text_raw_blocks:
            if _is_in(text_block['bbox'], table_box):
                text_block['tag'] = ON_TABLE_TEXT
                _mark_removed(text_block)

    text_raw_blocks[:] = [block for block in text_raw_blocks if id(block) not in removed_ids]

    # Equations touching an image or a table are discarded. Snapshot the
    # regions first: the image list is pruned further down.
    occupied = list(images) + list(tables)

    def _is_clear(equation):
        eq_bbox = equation[:4]
        return not any(_is_in_or_part_overlap(region, eq_bbox) for region in occupied)

    interline_equations[:] = [eq for eq in interline_equations if _is_clear(eq)]
    inline_equations[:] = [eq for eq in inline_equations if _is_clear(eq)]

    # An image overlapping any remaining text block loses: drop the image.
    kept_images = []
    for image_box in images:
        if any(_is_in_or_part_overlap(image_box, block['bbox']) for block in text_raw_blocks):
            images_backup.append(image_box)
        else:
            kept_images.append(image_box)
    images[:] = kept_images

    # Images overlapping each other: none of them takes part in the layout
    # computation for now.
    overlapping = set()
    for i, first in enumerate(images):
        for j in range(i + 1, len(images)):
            if _is_in_or_part_overlap(first, images[j]):
                overlapping.update((i, j))

    images_backup.extend(images[idx] for idx in sorted(overlapping))
    images[:] = [img for idx, img in enumerate(images) if idx not in overlapping]

    # Placeholder for text blocks that overlap interline equations. The step
    # that would fill it (set such blocks aside during layout computation and
    # merge them back afterwards) is currently disabled, so this stays empty.
    text_block_removed_2 = []

    return (images, tables, interline_equations, inline_equations, text_raw_blocks,
            text_block_removed, images_backup, text_block_removed_2)


赵小蒙's avatar
赵小蒙 committed
125
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """Report whether any two text blocks in the page body overlap horizontally.

    Per the original note, when this happens the PDF is not processed any
    further, because it most likely means an equation went undetected.

    Args:
        text_blocks: dicts carrying a ``'bbox'`` entry, indexed as
            ``bbox[1]`` (top) and ``bbox[3]`` (bottom).
        header: sequence of bbox-like items; the largest ``item[1]`` becomes
            the upper clip line (0 when empty).
        footer: sequence of bbox-like items; the smallest ``item[3]`` becomes
            the lower clip line (the largest text ``bbox[3]`` when empty).

    Returns:
        ``True`` as soon as ``_is_left_overlap`` holds for some pair of
        retained blocks (checked in both directions), otherwise ``False``.
        An empty ``text_blocks`` yields ``False``.
    """
    if not text_blocks:
        return False

    page_max_y = max(block['bbox'][3] for block in text_blocks)

    # NOTE(review): the header clip uses item[1] and the footer clip uses
    # item[3]; kept exactly as in the original -- confirm against the shape
    # of the header/footer entries.
    clip_y0 = max((item[1] for item in header), default=0)
    clip_y1 = min((item[3] for item in footer), default=page_max_y)

    # Only blocks lying entirely between the two clip lines are compared.
    body_bboxes = [
        block['bbox']
        for block in text_blocks
        if block['bbox'][1] >= clip_y0 and block['bbox'][3] <= clip_y1
    ]

    for idx, first in enumerate(body_bboxes):
        for second in body_bboxes[idx + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True

    return False


def check_useful_block_horizontal_overlap(useful_blocks: list) -> tuple:
    """Find the first pair of useful blocks that overlap horizontally.

    Per the original note, such an overlap most likely means an equation went
    undetected, and the PDF is then not processed any further.

    Args:
        useful_blocks: dicts carrying a ``'bbox'`` entry laid out as
            ``(x0, y0, x1, y1)``.

    Returns:
        A 3-tuple on every path, so callers can always unpack it:
        ``(True, smaller_bbox, larger_bbox)`` for the first pair for which
        ``_is_left_overlap`` holds in either direction (on equal areas the
        earlier block comes first), or ``(False, None, None)`` when there is
        no such pair or the input is empty.
    """
    if not useful_blocks:
        # Same shape as every other return path; a bare ``False`` here would
        # break callers that unpack the result.
        return False, None, None

    page_min_y = 0
    page_max_y = max(block['bbox'][3] for block in useful_blocks)

    # Blocks starting above the page top (negative y0) are ignored. The upper
    # bound is kept from the original even though it is the maximum itself.
    candidates = [
        block['bbox']
        for block in useful_blocks
        if block['bbox'][1] >= page_min_y and block['bbox'][3] <= page_max_y
    ]

    def _area(bbox):
        return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])

    for idx, first in enumerate(candidates):
        for second in candidates[idx + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                if _area(first) > _area(second):
                    return True, second, first
                return True, first, second

    return False, None, None