ocr_dict_merge.py 6.66 KB
Newer Older
1
2
3
4
from loguru import logger

from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
    calculate_overlap_area_in_bbox1_area_ratio
5
from magic_pdf.libs.drop_tag import DropTag
赵小蒙's avatar
赵小蒙 committed
6
7
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.pre_proc.ocr_fix_block_logic import fix_image_block, fix_table_block, fix_text_block
赵小蒙's avatar
赵小蒙 committed
8
9


10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# Order the spans of every line from left to right
def line_sort_spans_by_left_to_right(lines):
    '''
    Turn raw lines (lists of spans) into line objects.

    Each input line is sorted in place by the x0 coordinate of its spans and
    wrapped into a dict holding the union bbox of the spans ("bbox") and the
    very same, now sorted, list ("spans").
    '''

    def _as_line_object(row):
        # In-place sort on x0; the caller's list object is reused below.
        row.sort(key=lambda item: item['bbox'][0])
        boxes = [item['bbox'] for item in row]
        union_bbox = [
            min(box[0] for box in boxes),  # x0
            min(box[1] for box in boxes),  # y0
            max(box[2] for box in boxes),  # x1
            max(box[3] for box in boxes),  # y1
        ]
        return {
            "bbox": union_bbox,
            "spans": row,
        }

    return [_as_line_object(row) for row in lines]

赵小蒙's avatar
赵小蒙 committed
28

赵小蒙's avatar
赵小蒙 committed
29
def merge_spans_to_line(spans):
    '''
    Group spans into lines, walking them from top to bottom.

    The input list is sorted in place by y0. A span joins the line being
    built only when it overlaps the last span of that line on the y axis
    (as decided by __is_overlaps_y_exceeds_threshold). Interline equations,
    images and tables never share a line with any other span.

    Returns a list of lines, each line being a list of spans; an empty
    input yields an empty list.
    '''
    if len(spans) == 0:
        return []

    # Sort by the top edge (y0)
    spans.sort(key=lambda item: item['bbox'][1])

    def _must_stand_alone(candidate):
        # Interline equation, image and table spans always get their own line
        return candidate['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]

    first, *rest = spans
    rows = []
    row = [first]
    for candidate in rest:
        # The candidate extends the current line only if neither it nor any
        # span already on the line must stand alone, and it overlaps the
        # last span of the line vertically.
        joins_row = (
            not _must_stand_alone(candidate)
            and not any(_must_stand_alone(member) for member in row)
            and __is_overlaps_y_exceeds_threshold(candidate['bbox'], row[-1]['bbox'])
        )
        if joins_row:
            row.append(candidate)
        else:
            # Otherwise close the current line and start a new one
            rows.append(row)
            row = [candidate]

    # Flush the last line (it always holds at least one span)
    rows.append(row)
    return rows
赵小蒙's avatar
赵小蒙 committed
62

赵小蒙's avatar
赵小蒙 committed
63

64
65
66
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    '''
    Assign spans to layouts, merge them into lines layout by layout, and
    report the spans that belong to no layout.

    A span is claimed by the first layout (in layout_bboxes order) for which
    calculate_overlap_area_in_bbox1_area_ratio(span bbox, layout bbox)
    exceeds 0.6. Claimed spans are removed from the caller's `spans` list;
    whatever is left is tagged DropTag.NOT_IN_LAYOUT.

    Returns a tuple (lines, dropped_spans) where lines are the line objects
    produced by line_sort_spans_by_left_to_right.
    '''
    per_layout = []
    for layout in layout_bboxes:
        region = layout['layout_bbox']
        # Collect every span that sits (mostly) inside this layout
        inside = [
            candidate for candidate in spans
            if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], region) > 0.6
        ]
        if inside:
            per_layout.append(inside)
            # A claimed span must not be offered to the following layouts
            for claimed in inside:
                spans.remove(claimed)

    merged = []
    for group in per_layout:
        merged.extend(merge_spans_to_line(group))

    # Sort the spans inside each line and wrap the lines into line objects
    lines = line_sort_spans_by_left_to_right(merged)

    # Spans still left in the input list were not matched by any layout
    for orphan in spans:
        orphan['tag'] = DropTag.NOT_IN_LAYOUT
    dropped_spans = list(spans)

    return lines, dropped_spans
liukaiwen's avatar
lkw  
liukaiwen committed
95
96


赵小蒙's avatar
赵小蒙 committed
97
98
99
100
101
102
103
104
105
106
107
108
109
def merge_lines_to_block(lines):
    '''
    Wrap every line into a block of its own.

    No real block merging happens yet; this only establishes the structure:
    each block holds exactly one line and reuses that line's bbox.
    '''
    return [
        {
            "bbox": single_line["bbox"],
            "lines": [single_line],
        }
        for single_line in lines
    ]


赵小蒙's avatar
赵小蒙 committed
110
111
112
113
114
def sort_blocks_by_layout(all_bboxes, layout_bboxes):
    '''
    Order the page's blocks layout by layout, top to bottom inside a layout.

    Each block is an indexable record whose items 0..3 are its bbox and whose
    item 7 is its block type. Footnote blocks are never assigned. A block is
    claimed by the first layout (in layout_bboxes order) for which
    calculate_overlap_area_in_bbox1_area_ratio(block bbox, layout bbox)
    exceeds 0.8, and is then removed from the caller's `all_bboxes` list.

    Returns the claimed blocks as one flat list: layouts in input order,
    blocks of a layout sorted by y0.
    '''

    def _fits_layout(candidate, region):
        # Footnotes are skipped
        if candidate[7] == BlockType.Footnote:
            return False
        candidate_bbox = [candidate[i] for i in range(4)]
        return calculate_overlap_area_in_bbox1_area_ratio(candidate_bbox, region) > 0.8

    grouped = []
    for layout in layout_bboxes:
        region = layout['layout_bbox']

        # Gather the blocks that belong to this layout
        members = [candidate for candidate in all_bboxes if _fits_layout(candidate, region)]

        if members:
            grouped.append(members)
            # Remove the claimed blocks so later layouts cannot claim them again
            for member in members:
                all_bboxes.remove(member)

    # Within one layout, boxes go from top to bottom (by y0)
    ordered = []
    for group in grouped:
        ordered.extend(sorted(group, key=lambda box: box[1]))

    # `ordered` now holds every block kept for the page, in reading order
    return ordered


def fill_spans_in_blocks(blocks, spans):
    '''
    Distribute the spans over the blocks according to their position.

    Each block is an indexable record whose first four items are its bbox and
    whose item 7 is its block type. A span goes to the first block for which
    calculate_overlap_area_in_bbox1_area_ratio(span bbox, block bbox)
    exceeds 0.8 and is then removed from the caller's `spans` list.

    Returns one dict per block with the keys 'block_type', 'bbox' and 'spans'.
    '''
    filled = []
    for entry in blocks:
        kind = entry[7]
        region = entry[0:4]
        captured = [
            candidate for candidate in spans
            if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], region) > 0.8
        ]
        filled.append({
            'block_type': kind,
            'bbox': region,
            'spans': captured,
        })

        # Spans that found a block are taken out of the pool
        for taken in captured:
            spans.remove(taken)

    return filled


def fix_block_spans(block_with_spans, img_blocks, table_blocks):
    '''
    Run every block through the fixer that matches its type.

    1. Image and table blocks nest other blocks because they carry captions
       and footnotes; the caption/footnote text spans have to end up in the
       corresponding caption_block / footnote_block inside the image or
       table block (delegated to fix_image_block / fix_table_block).
    2. The 'spans' field of the block is meant to be dropped along the way
       (NOTE(review): presumably done inside the fix_* helpers - confirm).

    Text, title and interline-equation blocks go through fix_text_block;
    blocks of any other type are left out of the result.
    '''

    def _fixed_blocks():
        for entry in block_with_spans:
            kind = entry['block_type']

            if kind == BlockType.Image:
                yield fix_image_block(entry, img_blocks)
            elif kind == BlockType.Table:
                yield fix_table_block(entry, table_blocks)
            elif kind in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
                yield fix_text_block(entry)
            # any other block type is skipped

    return list(_fixed_blocks())