ocr_dict_merge.py 3.35 KB
Newer Older
1
2
3
4
from loguru import logger

from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
    calculate_overlap_area_in_bbox1_area_ratio
赵小蒙's avatar
赵小蒙 committed
5
from magic_pdf.libs.ocr_content_type import ContentType
赵小蒙's avatar
赵小蒙 committed
6
7


8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Sort the spans inside every line from left to right
def line_sort_spans_by_left_to_right(lines):
    """Order each line's spans by x0 and wrap the line in a dict with its bbox.

    Args:
        lines: list of lines, where each line is a list of span dicts that
            carry a ``'bbox'`` entry indexed as ``[x0, y0, x1, y1]``.

    Returns:
        A list with one dict per input line: ``"bbox"`` is the smallest box
        enclosing all spans of the line and ``"spans"`` is the input list
        itself, which has been sorted in place.
    """

    def _as_line_object(row):
        # In-place sort on the left edge; the caller's list is reordered too.
        row.sort(key=lambda item: item['bbox'][0])
        boxes = [item['bbox'] for item in row]
        enclosing = [
            min(box[0] for box in boxes),  # x0
            min(box[1] for box in boxes),  # y0
            max(box[2] for box in boxes),  # x1
            max(box[3] for box in boxes),  # y1
        ]
        return {"bbox": enclosing, "spans": row}

    return [_as_line_object(row) for row in lines]

赵小蒙's avatar
赵小蒙 committed
26
def merge_spans_to_line(spans):
    """Group spans into lines, walking them from top to bottom.

    The input list is sorted in place by ``bbox[1]`` (y0). A span joins the
    current line when ``__is_overlaps_y_exceeds_threshold`` accepts its bbox
    against the bbox of the line's last span. Interline equations, images and
    tables never share a line: such a span always opens a new line, and so
    does the span that follows one.

    Args:
        spans: list of span dicts with ``'bbox'`` and ``'type'`` entries.

    Returns:
        A list of lines, each line being a list of spans; ``[]`` when
        ``spans`` is empty.
    """
    if len(spans) == 0:
        return []

    def _stands_alone(item):
        # Interline equations, images and tables occupy a line by themselves.
        return item['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]

    # Sort by the y0 coordinate
    spans.sort(key=lambda item: item['bbox'][1])

    rows = []
    row = [spans[0]]
    for candidate in spans[1:]:
        # A stand-alone span, or a row that already holds one, forces a break.
        forced_break = _stands_alone(candidate) or any(_stands_alone(member) for member in row)
        # The overlap test only runs when no break is forced.
        if not forced_break and __is_overlaps_y_exceeds_threshold(candidate['bbox'], row[-1]['bbox']):
            row.append(candidate)
        else:
            rows.append(row)
            row = [candidate]

    # Flush the final row; it always contains at least one span here.
    rows.append(row)
    return rows
赵小蒙's avatar
赵小蒙 committed
58

59
60
61
62
63
64
65
66
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    """Assign spans to layout boxes, then build sorted lines per layout.

    Layouts are processed in the order given. A span is assigned to a layout
    when ``calculate_overlap_area_in_bbox1_area_ratio(span bbox, layout bbox)``
    is greater than 0.65. Assigned spans are removed from ``spans``, so the
    caller's list is mutated and ends up holding only the unassigned spans.

    Args:
        spans: list of span dicts with a ``'bbox'`` entry; modified in place.
        layout_bboxes: iterable of dicts with a ``'layout_bbox'`` entry.

    Returns:
        The result of ``line_sort_spans_by_left_to_right`` applied to the
        lines produced by ``merge_spans_to_line`` for every non-empty layout.
    """
    grouped = []
    for layout in layout_bboxes:
        region = layout['layout_bbox']
        # Collect every remaining span that falls into this layout
        members = [
            candidate for candidate in spans
            if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], region) > 0.65
        ]
        if not members:
            continue
        grouped.append(members)
        # Drop the assigned spans so a later layout cannot claim them again
        for member in members:
            spans.remove(member)

    lines = []
    for members in grouped:
        lines.extend(merge_spans_to_line(members))

    # Sort the spans inside every line
    return line_sort_spans_by_left_to_right(lines)
liukaiwen's avatar
lkw  
liukaiwen committed
85
86


赵小蒙's avatar
赵小蒙 committed
87
88
89
90
91
92
93
94
95
96
97
98
99
def merge_lines_to_block(lines):
    """Wrap every line in its own block.

    No block merging is done for now; this only sets up the structure. Each
    block holds exactly one line and reuses that line's bbox.

    Args:
        lines: list of line dicts with a ``"bbox"`` entry.

    Returns:
        A list of ``{"bbox": ..., "lines": [line]}`` dicts, one per line.
    """
    return [{"bbox": entry["bbox"], "lines": [entry]} for entry in lines]


liukaiwen's avatar
lkw  
liukaiwen committed
100
101
102
103