ocr_dict_merge.py 6.87 KB
Newer Older
1
2
3
4
from loguru import logger

from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
    calculate_overlap_area_in_bbox1_area_ratio
赵小蒙's avatar
赵小蒙 committed
5
6


7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Sort the spans within each line from left to right
def line_sort_spans_by_left_to_right(lines):
    """Order each line's spans by x0 and wrap the line with its bounding box.

    Every inner list of ``lines`` is sorted in place on ``span['bbox'][0]``.
    The result holds one dict per line: ``"bbox"`` is the ``[x0, y0, x1, y1]``
    envelope of all spans in that line and ``"spans"`` is the (now sorted)
    inner list itself, not a copy.
    """
    wrapped = []
    for row in lines:
        # In-place sort on the left edge; callers keep seeing the same list.
        row.sort(key=lambda item: item['bbox'][0])
        boxes = [item['bbox'] for item in row]
        left = min(box[0] for box in boxes)
        top = min(box[1] for box in boxes)
        right = max(box[2] for box in boxes)
        bottom = max(box[3] for box in boxes)
        wrapped.append({"bbox": [left, top, right, bottom], "spans": row})
    return wrapped

赵小蒙's avatar
赵小蒙 committed
25
def merge_spans_to_line(spans):
    """Group spans into lines, scanning from top to bottom.

    ``spans`` is sorted in place by ``bbox[1]`` (y0). A span of type
    ``"displayed_equation"``, ``"image"`` or ``"table"`` always occupies a
    line of its own. Any other span joins the current line when
    ``__is_overlaps_y_exceeds_threshold`` reports that it overlaps the line's
    most recently added span on the y axis; otherwise it starts a new line.

    Returns a list of lines, each a list of span dicts. An empty ``spans``
    yields an empty list (previously this raised ``IndexError``).
    """
    if not spans:
        return []

    # Span types that must never share a line with anything else.
    standalone_types = ("displayed_equation", "image", "table")

    # Sort by the top edge (y0).
    spans.sort(key=lambda span: span['bbox'][1])

    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        # Either this span is standalone, or the current line already holds a
        # standalone span: close the current line and start a new one.
        if span['type'] in standalone_types or any(
                s['type'] in standalone_types for s in current_line):
            lines.append(current_line)
            current_line = [span]
            continue

        # Same line only if it overlaps the last span of the line vertically.
        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
            current_line.append(span)
        else:
            lines.append(current_line)
            current_line = [span]

    # Flush the line that was still open when the input ran out.
    lines.append(current_line)
    return lines
赵小蒙's avatar
赵小蒙 committed
54

55
56
57
58
59
60
61
62
63
64
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    """Assign spans to layout regions, then build sorted lines per region.

    For each entry of ``layout_bboxes`` (read through its ``'layout_bbox'``
    key), every remaining span whose
    ``calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox)``
    exceeds 0.8 is claimed by that layout and removed from ``spans``. The
    caller's list is therefore mutated and ends up holding only the spans that
    matched no layout.

    The spans of each layout are grouped with ``merge_spans_to_line`` and all
    resulting lines, in layout order, are passed through
    ``line_sort_spans_by_left_to_right``, whose result is returned.
    """
    lines = []
    for item in layout_bboxes:
        layout_bbox = item['layout_bbox']
        # Collect first, remove afterwards, so the list is not mutated while
        # it is being iterated.
        layout_spans = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.8
        ]
        if not layout_spans:
            continue
        # A span claimed by this layout is no longer available to later ones.
        for claimed in layout_spans:
            spans.remove(claimed)
        lines.extend(merge_spans_to_line(layout_spans))

    # Order the spans inside every line and attach the line bounding box.
    return line_sort_spans_by_left_to_right(lines)
liukaiwen's avatar
lkw  
liukaiwen committed
81
82
83



liukaiwen's avatar
liukaiwen committed
84
85
def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
    """Group spans into rows and give every text row one shared y extent.

    ``spans`` is sorted in place by ``bbox[1]`` and then scanned once:

    * every span of type ``"displayed_equation"``, ``"image"`` or ``"table"``
      is appended to ``displayed_list`` and kept on a row of its own;
    * other spans are grouped into rows using
      ``__is_overlaps_y_exceeds_threshold`` against the row's last span;
    * each finished row that has more than one span, or whose first span is
      ``"text"`` / ``"inline_equation"``, is appended to ``text_inline_lines``
      as ``(row_spans, (y0, y1))``. A row closed because the next span does
      not overlap it is appended unconditionally.

    The ``(y0, y1)`` of a row starts as the y range of its first span and is
    overwritten by each later ``"text"`` span that joins the row.

    Finally, for every entry in ``text_inline_lines`` (including any the
    caller passed in), the row is sorted in place by ``bbox[0]`` and each
    span's ``bbox[1]`` / ``bbox[3]`` is overwritten with the row's
    ``(y0, y1)``.

    Nothing is returned; results are delivered through the two list
    arguments and the mutated span bboxes. Empty ``spans`` is a no-op
    (previously this raised ``IndexError``).
    """
    if not spans:
        return

    standalone_types = ("displayed_equation", "image", "table")
    text_types = ("text", "inline_equation")

    spans.sort(key=lambda span: span['bbox'][1])

    first = spans[0]
    current_line = [first]
    if first["type"] in standalone_types:
        displayed_list.append(first)

    # Vertical range that will later be stamped on every span of the row.
    line_first_y0 = first["bbox"][1]
    line_first_y = first["bbox"][3]

    for span in spans[1:]:
        # A standalone span, or a row already holding one, forces a new row.
        if span['type'] in standalone_types or any(
                s['type'] in standalone_types for s in current_line):
            if span["type"] in standalone_types:
                displayed_list.append(span)
            # Only rows that carry text are recorded for later alignment.
            if len(current_line) > 1 or current_line[0]["type"] in text_types:
                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]
            continue

        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
            # Text spans, unlike inline equations, redefine the row's y range.
            if span["type"] == "text":
                line_first_y0 = span["bbox"][1]
                line_first_y = span["bbox"][3]
            current_line.append(span)
        else:
            # No vertical overlap with the row's last span: close the row.
            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]

    # Flush the row that was still open when the input ran out.
    if len(current_line) > 1 or current_line[0]["type"] in text_types:
        text_inline_lines.append((current_line, (line_first_y0, line_first_y)))

    for line_spans, (y0, y1) in text_inline_lines:
        # Left-to-right order inside the row.
        line_spans.sort(key=lambda span: span['bbox'][0])
        # Stamp the row's shared vertical range on each of its spans.
        for span in line_spans:
            span["bbox"][1] = y0
            span["bbox"][3] = y1

def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
    """Reclassify displayed equations that actually sit on a text row.

    ``displayed_list`` and ``text_inline_lines`` are walked together with one
    shared row cursor that only moves forward, so both are presumably expected
    in top-to-bottom order (TODO confirm against the caller). Each entry of
    ``text_inline_lines`` is read as ``(row_spans, (y0, y1))``.

    For each displayed span, rows are skipped until one of these holds:

    * the span straddles the row's ``y0`` or ``y1`` and
      ``__is_overlaps_y_exceeds_threshold`` accepts the overlap: the span's
      ``bbox[1]`` is set to the row's ``y0``. If its type is
      ``"displayed_equation"`` it becomes ``"inline_equation"``, unless the
      helper also accepts an overlap with the following row;
    * the span ends above the row, or straddles the row's ``y0`` without the
      helper accepting the overlap: the span is left as is.

    In both cases the cursor stays on that row for the next displayed span.

    Returns ``spans`` unchanged as an object; only dicts reachable through
    ``displayed_list`` are mutated.
    """
    row_idx = 0
    for span in displayed_list:
        span_y0, span_y = span["bbox"][1], span["bbox"][3]

        while row_idx < len(text_inline_lines):
            y0, y1 = text_inline_lines[row_idx][1]
            # The span crosses the row's top edge, its bottom edge, or both.
            straddles_row = (
                (span_y0 < y0 < span_y)
                or (span_y0 < y1 < span_y)
                or (span_y0 < y0 and span_y > y1)
            )
            if straddles_row and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
                # Snap the top edge to the row before testing the next row.
                span["bbox"][1] = y0
                if span["type"] == "displayed_equation":
                    if row_idx + 1 >= len(text_inline_lines):
                        span["type"] = "inline_equation"
                    else:
                        y0_next, y1_next = text_inline_lines[row_idx + 1][1]
                        # An equation that also overlaps the next row is
                        # kept as a displayed equation.
                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)):
                            span["type"] = "inline_equation"
                break
            elif span_y < y0 or (
                    span_y0 < y0 < span_y
                    and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1))):
                # The span ends above this row, or only grazes its top edge.
                break
            else:
                row_idx += 1

    return spans
liukaiwen's avatar
lkw  
liukaiwen committed
182
183
184
185