"lm_eval/_cli/ls.py" did not exist on "f9d5d3e7474e413512a6ead3541ba02407668ec4"
resolve_bbox_conflict.py 7.09 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
5
6
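"""
Take the bboxes that the API reports for a PDF and decide which ones to keep based on how they overlap.
1. First drop the bboxes that lie on top of images; "images" here covers both tables and figures.
2. Then drop the image bboxes that lie on top of text blocks.
"""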

赵小蒙's avatar
赵小蒙 committed
7
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
8
from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
赵小蒙's avatar
赵小蒙 committed
9
10


赵小蒙's avatar
赵小蒙 committed
11
12
def _iter_blocks_inside(regions: list, text_blocks: list):
    """Yield each text block for which `_is_in(block["bbox"], region)` holds, region by region."""
    for region in regions:
        for block in text_blocks:
            if _is_in(block["bbox"], region):
                yield block


def _drop_equations_overlapping(regions: list, equations: list) -> None:
    """Remove, in place, every equation whose first four items overlap one of `regions`."""
    equations[:] = [
        eq for eq in equations
        if not any(_is_in_or_part_overlap(region, eq[:4]) for region in regions)
    ]


def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
                                  text_raw_blocks: list):
    """
    Resolve overlaps between image, table, equation and text bboxes with a deliberately blunt policy.

    `text_raw_blocks` has the structure taken directly from pymupdf; see
    test/assets/papre/pymu_textblocks.json for a sample.

    Steps, in order:
    1. Tag and remove text blocks lying completely inside an image (ON_IMAGE_TEXT) or a
       table (ON_TABLE_TEXT). A block matching both ends up tagged ON_TABLE_TEXT.
    2. Remove equations that overlap an image.
    3. Remove equations that overlap a table.
    4. Drop every image that overlaps a remaining text block.
    5. Drop every image that overlaps another image (both are dropped; their bboxes are
       not adjusted so that they stop overlapping).

    All input lists are modified in place, and the very same list objects are returned.

    Returns:
        An 8-tuple ``(images, tables, interline_equations, inline_equations, text_raw_blocks,
        text_block_removed, images_backup, text_block_removed_2)`` where
        ``text_block_removed`` holds each removed text block exactly once,
        ``images_backup`` holds the dropped images, and ``text_block_removed_2`` is
        currently always empty (see the note in the body).
    """
    text_block_removed = []
    images_backup = []
    removed_ids = set()  # identities of blocks already recorded, so each is listed only once

    def _record(block):
        if id(block) not in removed_ids:
            removed_ids.add(id(block))
            text_block_removed.append(block)

    # Text blocks sitting on an image.
    for block in _iter_blocks_inside(images, text_raw_blocks):
        block['tag'] = ON_IMAGE_TEXT
        _record(block)
    # Text blocks sitting on a table; runs second, so its tag wins for blocks matching both.
    for block in _iter_blocks_inside(tables, text_raw_blocks):
        block['tag'] = ON_TABLE_TEXT
        _record(block)

    if removed_ids:
        text_raw_blocks[:] = [block for block in text_raw_blocks if id(block) not in removed_ids]

    # Equations on images first, then equations on tables.
    for regions in (images, tables):
        _drop_equations_overlapping(regions, interline_equations)
        _drop_equations_overlapping(regions, inline_equations)

    # An image that overlaps any remaining text block is dropped in favour of the text.
    kept_images = []
    for image_box in images:
        if any(_is_in_or_part_overlap(image_box, block["bbox"]) for block in text_raw_blocks):
            images_backup.append(image_box)
        else:
            kept_images.append(image_box)
    images[:] = kept_images

    # Images overlapping each other: both are excluded from layout computation for now.
    overlapping_idx = set()
    for i, first in enumerate(images):
        for j in range(i + 1, len(images)):
            if _is_in_or_part_overlap(first, images[j]):
                overlapping_idx.update((i, j))

    # sorted() keeps the backup order deterministic; the slice assignment keeps the
    # caller's list in sync with the returned one instead of leaving None holes in it.
    images_backup.extend(images[idx] for idx in sorted(overlapping_idx))
    images[:] = [img for idx, img in enumerate(images) if idx not in overlapping_idx]

    # NOTE: merging text blocks that overlap interline equations (so they do not disturb
    # layout computation, to be merged back afterwards) is currently disabled, so this
    # list is always returned empty.
    text_block_removed_2 = []

    return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2


赵小蒙's avatar
赵小蒙 committed
124
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """
    Report whether any two text blocks between the header and footer overlap horizontally.

    When this happens the PDF is not processed any further, because it most likely
    means an equation went undetected.

    Only blocks whose bbox satisfies ``bbox[1] >= clip_y0`` and ``bbox[3] <= clip_y1``
    are compared. ``clip_y0`` is the largest ``item[1]`` among `header` (0 when `header`
    is empty) and ``clip_y1`` is the smallest ``item[3]`` among `footer` (the largest
    ``bbox[3]`` of the text blocks when `footer` is empty).

    Returns True as soon as `_is_left_overlap` holds for a pair in either argument
    order, otherwise False. An empty `text_blocks` yields False.
    """
    if len(text_blocks) == 0:
        return False

    lowest_text_edge = max(block['bbox'][3] for block in text_blocks)

    # NOTE(review): the header bound uses index 1 and the footer bound uses index 3;
    # the layout of header/footer items is not visible here - confirm these are the
    # intended edges.
    clip_y0 = max([item[1] for item in header]) if len(header) > 0 else 0
    clip_y1 = min([item[3] for item in footer]) if len(footer) > 0 else lowest_text_edge

    candidates = []
    for block in text_blocks:
        box = block["bbox"]
        if clip_y0 <= box[1] and box[3] <= clip_y1:
            candidates.append(box)

    for idx, first in enumerate(candidates):
        for second in candidates[idx + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True

    return False


def check_useful_block_horizontal_overlap(useful_blocks: list) -> tuple:
    """
    Look for a pair of useful blocks that overlap horizontally.

    Such an overlap most likely means an equation went undetected, so callers can
    use the result to decide how to proceed with the page.

    Blocks whose bbox starts above the page (``bbox[1] < 0``) are ignored.

    Returns:
        A 2-tuple ``(overlap_found, bbox)`` on every path, including empty input:
        ``(True, bbox)`` for the first pair where `_is_left_overlap` holds in either
        argument order, with ``bbox`` being the smaller-area box of the pair (the
        earlier one on a tie); ``(False, None)`` when no pair overlaps.
    """
    if not useful_blocks:
        # Same shape as every other return, so callers can always unpack the result.
        return False, None

    page_min_y = 0
    candidates = [block["bbox"] for block in useful_blocks if block["bbox"][1] >= page_min_y]

    for idx, first in enumerate(candidates):
        for second in candidates[idx + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                first_area = (first[2] - first[0]) * (first[3] - first[1])
                second_area = (second[2] - second[0]) * (second[3] - second[1])
                return True, (second if first_area > second_area else first)

    return False, None