ocr_dict_merge.py 6.37 KB
Newer Older
1
2
3
4
from loguru import logger

from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
    calculate_overlap_area_in_bbox1_area_ratio
5
from magic_pdf.libs.drop_tag import DropTag
赵小蒙's avatar
赵小蒙 committed
6
from magic_pdf.libs.ocr_content_type import ContentType
赵小蒙's avatar
赵小蒙 committed
7
from magic_pdf.pre_proc.ocr_fix_block_logic import fix_image_block, fix_table_block
赵小蒙's avatar
赵小蒙 committed
8
9


10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# Order the spans of every line from left to right.
def line_sort_spans_by_left_to_right(lines):
    """Sort each line's spans by left edge and attach the line's bounding box.

    Args:
        lines: list of lines, each line being a list of span dicts that carry
            a ``'bbox'`` entry indexed as ``[x0, y0, x1, y1]``. Every inner
            list is sorted in place.

    Returns:
        A list with one ``{"bbox": [x0, y0, x1, y1], "spans": line}`` dict per
        input line, in input order. ``bbox`` is the union of the span boxes.
    """
    wrapped = []
    for row in lines:
        # Left-to-right order is defined by each span's x0.
        row.sort(key=lambda item: item['bbox'][0])
        boxes = [item['bbox'] for item in row]
        left = min(box[0] for box in boxes)
        top = min(box[1] for box in boxes)
        right = max(box[2] for box in boxes)
        bottom = max(box[3] for box in boxes)
        wrapped.append({
            "bbox": [left, top, right, bottom],
            "spans": row,
        })
    return wrapped

赵小蒙's avatar
赵小蒙 committed
28

赵小蒙's avatar
赵小蒙 committed
29
def merge_spans_to_line(spans):
    """Group spans into lines.

    The spans are first sorted in place by ``bbox[1]`` (y0). They are then
    walked in that order: a span joins the current line when
    ``__is_overlaps_y_exceeds_threshold`` reports that it overlaps the last
    span of that line on the y axis; otherwise it starts a new line.
    Interline-equation, image and table spans never share a line with any
    other span.

    Args:
        spans: list of span dicts with ``'bbox'`` and ``'type'`` entries.
            The list is reordered in place.

    Returns:
        A list of lines, each line being a list of spans. Empty input yields
        an empty list.
    """
    if not spans:
        return []

    def _is_standalone(item):
        # These content types always occupy a line of their own.
        return item['type'] in (
            ContentType.InterlineEquation,
            ContentType.Image,
            ContentType.Table,
        )

    # Process spans from top to bottom (ascending y0).
    spans.sort(key=lambda span: span['bbox'][1])

    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        # A standalone span, or a current line that already holds one,
        # forces a line break.
        if _is_standalone(span) or any(_is_standalone(s) for s in current_line):
            lines.append(current_line)
            current_line = [span]
            continue

        # Same line only if the span overlaps the line's last span vertically.
        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
            current_line.append(span)
        else:
            lines.append(current_line)
            current_line = [span]

    # current_line always holds at least one span here; flush it.
    lines.append(current_line)
    return lines
赵小蒙's avatar
赵小蒙 committed
62

赵小蒙's avatar
赵小蒙 committed
63

64
65
66
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    """Assign spans to layout regions, build lines per region, and tag leftovers.

    For each layout (in the order given) every still-unassigned span whose
    ``calculate_overlap_area_in_bbox1_area_ratio(span_bbox, layout_bbox)``
    exceeds 0.6 is claimed by that layout. The spans of each layout are merged
    into lines with ``merge_spans_to_line`` and the lines are then wrapped by
    ``line_sort_spans_by_left_to_right``.

    Args:
        spans: list of span dicts with a ``'bbox'`` entry. Claimed spans are
            removed from this list in place; the spans left over receive
            ``span['tag'] = DropTag.NOT_IN_LAYOUT``.
        layout_bboxes: iterable of dicts with a ``'layout_bbox'`` entry.

    Returns:
        ``(lines, dropped_spans)``: the line objects of all layouts in layout
        order, and the list of spans that matched no layout.
    """
    spans_per_layout = []
    for item in layout_bboxes:
        layout_bbox = item['layout_bbox']
        # Collect the spans that lie (mostly) inside this layout.
        layout_spans = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.6
        ]
        if layout_spans:
            spans_per_layout.append(layout_spans)
            # Drop the claimed spans from the pool in one pass, keeping the
            # caller-visible in-place mutation of `spans`.
            claimed = {id(span) for span in layout_spans}
            spans[:] = [span for span in spans if id(span) not in claimed]

    lines = []
    for layout_spans in spans_per_layout:
        lines.extend(merge_spans_to_line(layout_spans))

    # Sort the spans inside each line and wrap each line with its bbox.
    lines = line_sort_spans_by_left_to_right(lines)

    # Whatever is left matched no layout and is reported as dropped.
    dropped_spans = []
    for span in spans:
        span['tag'] = DropTag.NOT_IN_LAYOUT
        dropped_spans.append(span)

    return lines, dropped_spans
liukaiwen's avatar
lkw  
liukaiwen committed
95
96


赵小蒙's avatar
赵小蒙 committed
97
98
99
100
101
102
103
104
105
106
107
108
109
def merge_lines_to_block(lines):
    """Wrap every line in a block of its own.

    No block merging is performed yet; this only establishes the structure.
    Each returned block holds exactly one line and reuses that line's bbox.

    Args:
        lines: iterable of line dicts carrying a ``"bbox"`` entry.

    Returns:
        A list of ``{"bbox": ..., "lines": [line]}`` dicts, one per line.
    """
    return [{"bbox": entry["bbox"], "lines": [entry]} for entry in lines]


赵小蒙's avatar
赵小蒙 committed
110
111
112
113
114
def sort_blocks_by_layout(all_bboxes, layout_bboxes):
    """Order blocks by layout region, then top-to-bottom within each region.

    For each layout (in the order given) every still-unassigned block whose
    ``calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox)``
    exceeds 0.8 is claimed by that layout. Blocks whose element at index 7 is
    ``'footnote'`` are never claimed. Blocks that match no layout are not
    part of the result.

    Args:
        all_bboxes: list of block sequences where indices 0-3 hold the bbox
            and index 7 holds the block type. Claimed blocks are removed from
            this list in place.
        layout_bboxes: iterable of dicts with a ``'layout_bbox'`` entry.

    Returns:
        The claimed blocks, grouped by layout in layout order and sorted by
        ``block[1]`` (y0) inside each layout.
    """
    sort_blocks = []
    for item in layout_bboxes:
        layout_bbox = item['layout_bbox']

        # Collect the blocks that belong to this layout.
        layout_blocks = []
        for block in all_bboxes:
            # Footnotes are skipped.
            if block[7] == 'footnote':
                continue
            block_bbox = [block[0], block[1], block[2], block[3]]
            if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8:
                layout_blocks.append(block)

        if not layout_blocks:
            continue

        # Remove the claimed blocks from the pool in one pass, keeping the
        # caller-visible in-place mutation of `all_bboxes`.
        claimed = {id(block) for block in layout_blocks}
        all_bboxes[:] = [block for block in all_bboxes if id(block) not in claimed]

        # Inside one layout, blocks are ordered top-to-bottom by y0.
        layout_blocks.sort(key=lambda x: x[1])
        sort_blocks.extend(layout_blocks)

    # sort_blocks now holds every block kept for the page, in final order.
    return sort_blocks


def fill_spans_in_blocks(blocks, spans):
    """Attach to each block the spans that fall inside it.

    Blocks are handled in order. A span goes to the first block for which
    ``calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox)``
    exceeds 0.8, and is then removed from ``spans`` so later blocks cannot
    claim it.

    Args:
        blocks: iterable of block sequences; ``block[0:4]`` is the bbox and
            ``block[7]`` the block type.
        spans: list of span dicts with a ``'bbox'`` entry. Mutated in place:
            assigned spans are removed.

    Returns:
        A list of ``{'block_type', 'bbox', 'spans'}`` dicts, one per block.
    """
    filled = []
    for blk in blocks:
        kind = blk[7]
        region = blk[0:4]
        owned = [
            candidate for candidate in spans
            if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], region) > 0.8
        ]
        filled.append({
            'block_type': kind,
            'bbox': region,
            'spans': owned,
        })
        # Assigned spans leave the pool so they are not matched again.
        for candidate in owned:
            spans.remove(candidate)
    return filled


def fix_block_spans(block_with_spans, img_blocks, table_blocks):
    """Post-process blocks by type and drop blocks of unknown type.

    ``image_block`` entries are replaced by the result of ``fix_image_block``
    and ``table_block`` entries by the result of ``fix_table_block``.
    ``text_block``, ``title_block`` and ``interline_equation_block`` entries
    are kept unchanged. Any other block type is left out of the result.

    Args:
        block_with_spans: iterable of block dicts with a ``'block_type'`` entry.
        img_blocks: forwarded to ``fix_image_block``.
        table_blocks: forwarded to ``fix_table_block``.

    Returns:
        The list of kept (and possibly fixed) blocks, in input order.
    """
    # Types that need no fixing and are passed through as-is.
    untouched_kinds = ('text_block', 'title_block', 'interline_equation_block')

    fixed = []
    for entry in block_with_spans:
        kind = entry['block_type']
        if kind == 'image_block':
            fixed.append(fix_image_block(entry, img_blocks))
        elif kind == 'table_block':
            fixed.append(fix_table_block(entry, table_blocks))
        elif kind in untouched_kinds:
            fixed.append(entry)
        # Anything else is discarded.
    return fixed