ocr_dict_merge.py 7.32 KB
Newer Older
1
2
3
4
from loguru import logger

from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
    calculate_overlap_area_in_bbox1_area_ratio
5
from magic_pdf.libs.drop_tag import DropTag
赵小蒙's avatar
赵小蒙 committed
6
7
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.pre_proc.ocr_fix_block_logic import fix_image_block, fix_table_block, fix_text_block
赵小蒙's avatar
赵小蒙 committed
8
9
from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
赵小蒙's avatar
赵小蒙 committed
10
11


12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def line_sort_spans_by_left_to_right(lines):
    """Sort the spans of every line from left to right and wrap each line.

    Each element of ``lines`` is a list of span dicts; the first four entries
    of a span's ``'bbox'`` are read as ``x0, y0, x1, y1``. Every line list is
    sorted in place by ``x0``.

    Returns a list holding one dict per line: ``"bbox"`` is
    ``[min x0, min y0, max x1, max y1]`` over the line's spans and ``"spans"``
    is the (now sorted) line list itself, not a copy.
    """
    wrapped = []
    for row in lines:
        # In-place sort by x0; the very same list object is stored below.
        row.sort(key=lambda item: item['bbox'][0])
        boxes = [item['bbox'] for item in row]
        x0 = min(box[0] for box in boxes)
        y0 = min(box[1] for box in boxes)
        x1 = max(box[2] for box in boxes)
        y1 = max(box[3] for box in boxes)
        wrapped.append({"bbox": [x0, y0, x1, y1], "spans": row})
    return wrapped

赵小蒙's avatar
赵小蒙 committed
30

赵小蒙's avatar
赵小蒙 committed
31
def merge_spans_to_line(spans):
    """Group spans into lines based on their vertical position.

    ``spans`` is sorted in place by ``bbox[1]`` (y0) and then walked once.
    A span joins the current line when it overlaps the last span of that
    line on the y axis, as judged by ``__is_overlaps_y_exceeds_threshold``;
    otherwise it starts a new line.

    Interline equations, images and tables always sit on a line of their
    own: such a span starts a new line, and so does the span following it.

    Returns a list of lines, each line being a list of span dicts. An empty
    input yields an empty list.
    """
    if not spans:
        return []

    def _is_standalone(item):
        # Content types that are never merged with neighbouring spans.
        return item['type'] in (
            ContentType.InterlineEquation,
            ContentType.Image,
            ContentType.Table,
        )

    # Sort top to bottom by y0 (mutates the caller's list, as before).
    spans.sort(key=lambda span: span['bbox'][1])

    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        # Break when either this span or anything already on the current
        # line must stay alone.
        must_break = _is_standalone(span) or any(
            _is_standalone(s) for s in current_line)
        if not must_break and __is_overlaps_y_exceeds_threshold(
                span['bbox'], current_line[-1]['bbox']):
            current_line.append(span)
        else:
            lines.append(current_line)
            current_line = [span]

    # current_line always holds at least one span here; flush it.
    lines.append(current_line)
    return lines
赵小蒙's avatar
赵小蒙 committed
64

赵小蒙's avatar
赵小蒙 committed
65

66
67
68
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    """Assign spans to layout boxes, merge them into lines, tag the leftovers.

    For every entry of ``layout_bboxes`` (dicts with a ``'layout_bbox'``
    key), the spans for which
    ``calculate_overlap_area_in_bbox1_area_ratio(span_bbox, layout_bbox)``
    exceeds 0.6 are claimed by that layout and removed from ``spans``, so a
    span is assigned to at most one layout (the first that matches).

    The spans of each layout are merged into lines with
    ``merge_spans_to_line`` and all lines are then passed through
    ``line_sort_spans_by_left_to_right``.

    ``spans`` is mutated in place: on return it contains only the spans that
    matched no layout. Those spans get ``span['tag'] = DropTag.NOT_IN_LAYOUT``.

    Returns ``(lines, dropped_spans)``.
    """
    spans_per_layout = []
    for item in layout_bboxes:
        layout_bbox = item['layout_bbox']
        # Single-pass partition instead of repeated list.remove() calls.
        layout_spans = []
        remaining = []
        for span in spans:
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.6:
                layout_spans.append(span)
            else:
                remaining.append(span)
        if layout_spans:
            spans_per_layout.append(layout_spans)
            # Slice assignment keeps the mutation visible to the caller.
            spans[:] = remaining

    lines = []
    for layout_spans in spans_per_layout:
        lines.extend(merge_spans_to_line(layout_spans))

    # Sort the spans inside every line from left to right.
    lines = line_sort_spans_by_left_to_right(lines)

    # Whatever is left matched no layout box and is reported as dropped.
    dropped_spans = []
    for span in spans:
        span['tag'] = DropTag.NOT_IN_LAYOUT
        dropped_spans.append(span)

    return lines, dropped_spans
liukaiwen's avatar
lkw  
liukaiwen committed
97
98


赵小蒙's avatar
赵小蒙 committed
99
100
101
102
103
104
105
106
107
108
109
110
111
def merge_lines_to_block(lines):
    """Wrap every line in its own block.

    No real block merging is done yet; this only establishes the structure.
    Each returned block contains exactly one line and reuses that line's
    ``"bbox"`` as the block bbox.
    """
    return [{"bbox": entry["bbox"], "lines": [entry]} for entry in lines]


赵小蒙's avatar
赵小蒙 committed
112
113
114
115
116
def sort_blocks_by_layout(all_bboxes, layout_bboxes):
    """Order blocks by layout box, then top to bottom inside each layout.

    ``all_bboxes`` holds indexable blocks whose items 0-3 are read as the
    bbox and whose item 7 is the block type. For every entry of
    ``layout_bboxes`` (dicts with a ``'layout_bbox'`` key), the non-footnote
    blocks for which
    ``calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox)``
    exceeds 0.8 are claimed by that layout, removed from ``all_bboxes``, and
    sorted by item 1 (y0).

    ``all_bboxes`` is mutated in place: on return it contains only the
    blocks that were not claimed (footnotes are never claimed).

    Returns the claimed blocks, layout by layout, in reading order.
    """
    sort_blocks = []
    for item in layout_bboxes:
        layout_bbox = item['layout_bbox']

        # Single-pass partition instead of repeated list.remove() calls.
        layout_blocks = []
        remaining = []
        for block in all_bboxes:
            # Footnotes are skipped and therefore stay in all_bboxes.
            if block[7] == BlockType.Footnote:
                remaining.append(block)
                continue
            block_bbox = [block[0], block[1], block[2], block[3]]
            if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8:
                layout_blocks.append(block)
            else:
                remaining.append(block)

        if layout_blocks:
            # Drop the claimed blocks from the caller's list in place.
            all_bboxes[:] = remaining
            # Inside one layout, order the boxes top to bottom by y0.
            layout_blocks.sort(key=lambda x: x[1])
            sort_blocks.extend(layout_blocks)

    # sort_blocks now holds every block kept for this page, already ordered.
    return sort_blocks


def fill_spans_in_blocks(blocks, spans):
    """Place spans into blocks according to their position.

    ``blocks`` holds indexable blocks whose items 0-3 are the bbox and whose
    item 7 is the block type. For each block, the spans for which
    ``calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox)``
    exceeds 0.8 are collected, post-processed by ``modify_y_axis``,
    ``modify_inline_equation`` and ``remove_overlap_between_bbox``, and
    stored under ``'spans'``.

    ``spans`` is mutated in place: every span that ends up in a block is
    removed from it, so a span is given to at most one block.

    Returns a list of dicts with keys ``'block_type'``, ``'bbox'`` and
    ``'spans'``, one per input block and in the same order.
    """
    block_with_spans = []
    for block in blocks:
        block_type = block[7]
        block_bbox = block[0:4]
        block_dict = {
            'block_type': block_type,
            'bbox': block_bbox,
        }
        block_spans = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.8
        ]

        # Inline-equation adjustment: match the height of the text on the
        # same line (left neighbour preferred, then right), per the original
        # author's note.
        displayed_list = []
        text_inline_lines = []
        modify_y_axis(block_spans, displayed_list, text_inline_lines)

        # Interline equations mis-detected by the model are converted to
        # inline equations, per the original author's note.
        block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)

        # Separate bboxes that stick to / overlap each other.
        block_spans = remove_overlap_between_bbox(block_spans)

        block_dict['spans'] = block_spans
        block_with_spans.append(block_dict)

        # Remove the spans that were placed in this block from the pool.
        # NOTE(review): list.remove raises ValueError if a helper above
        # returns a span that no longer compares equal to any element of
        # ``spans`` -- confirm the helpers hand back the same objects.
        for span in block_spans:
            spans.remove(span)

    return block_with_spans


def fix_block_spans(block_with_spans, img_blocks, table_blocks):
    """Dispatch every block to the fixer matching its ``'block_type'``.

    Notes carried over from the original author (the helpers live in
    ``ocr_fix_block_logic`` and are not visible here):

    1. Image and table blocks contain captions and footnotes, so blocks are
       nested; the caption/footnote text spans have to be moved into the
       corresponding caption block / footnote block inside the image or
       table block.
    2. The ``spans`` field of the block is dropped in the process.

    Image blocks go through ``fix_image_block(block, img_blocks)``, table
    blocks through ``fix_table_block(block, table_blocks)``, and text, title
    and interline-equation blocks through ``fix_text_block(block)``. Blocks
    of any other type are left out of the result.

    Returns the list of fixed blocks in input order.
    """
    fix_blocks = []
    for block in block_with_spans:
        block_type = block['block_type']

        if block_type == BlockType.Image:
            block = fix_image_block(block, img_blocks)
        elif block_type == BlockType.Table:
            block = fix_table_block(block, table_blocks)
        elif block_type in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
            block = fix_text_block(block)
        else:
            # Unhandled block types are silently skipped.
            continue
        fix_blocks.append(block)
    return fix_blocks