ocr_dict_merge.py 5.58 KB
Newer Older
1

2
from magic_pdf.config.ocr_content_type import BlockType, ContentType
3
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, calculate_overlap_area_in_bbox1_area_ratio
赵小蒙's avatar
赵小蒙 committed
4
5


6
7
8
9
def line_sort_spans_by_left_to_right(lines):
    """Order the spans of every line left-to-right and wrap each line in a dict.

    Args:
        lines: Iterable of lines; each line is a list of span dicts whose
            ``'bbox'`` value is indexed as ``x0, y0, x1, y1`` (items 0-3).

    Returns:
        A list holding one ``{'bbox': [x0, y0, x1, y1], 'spans': line}`` dict
        per input line, where ``bbox`` is the union of the line's span boxes.

    Note:
        Every line list is sorted in place by span ``x0`` and reused as the
        ``'spans'`` value, so the caller's lists end up reordered.
    """
    def _wrap(line):
        # Sort by the left edge (x0) of each span.
        line.sort(key=lambda span: span['bbox'][0])
        boxes = [span['bbox'] for span in line]
        return {
            'bbox': [
                min(box[0] for box in boxes),  # x0
                min(box[1] for box in boxes),  # y0
                max(box[2] for box in boxes),  # x1
                max(box[3] for box in boxes),  # y1
            ],
            'spans': line,
        }

    return [_wrap(line) for line in lines]

赵小蒙's avatar
赵小蒙 committed
24

25
def merge_spans_to_line(spans, threshold=0.6):
    """Group spans into lines according to their vertical overlap.

    The spans are sorted in place by ``bbox[1]`` (y0) and then walked in that
    order. A span joins the line being built when its bbox and the bbox of
    that line's last span pass ``__is_overlaps_y_exceeds_threshold`` with
    ``threshold``; otherwise it opens a new line. Interline equations, images
    and tables never share a line with another span.

    Args:
        spans: List of span dicts with ``'bbox'`` and ``'type'`` keys.
            Reordered in place.
        threshold: Value forwarded to ``__is_overlaps_y_exceeds_threshold``.

    Returns:
        A list of lines, each a list of spans; ``[]`` when ``spans`` is empty.
    """
    def _is_standalone(item):
        # Interline equations, images and tables always get a line of their own.
        return item['type'] in (
            ContentType.InterlineEquation,
            ContentType.Image,
            ContentType.Table,
        )

    if len(spans) == 0:
        return []

    # Sort top-to-bottom by y0.
    spans.sort(key=lambda item: item['bbox'][1])

    first, *rest = spans
    merged = []
    pending = [first]
    for candidate in rest:
        # The candidate may only join the pending line when neither it nor
        # anything already on that line is a standalone type, and it overlaps
        # the last span of the line on the y axis.
        joins_pending = (
            not _is_standalone(candidate)
            and not any(_is_standalone(member) for member in pending)
            and __is_overlaps_y_exceeds_threshold(
                candidate['bbox'], pending[-1]['bbox'], threshold)
        )
        if joins_pending:
            pending.append(candidate)
        else:
            merged.append(pending)
            pending = [candidate]

    # Flush the final line; it always holds at least one span here.
    merged.append(pending)
    return merged
赵小蒙's avatar
赵小蒙 committed
62

赵小蒙's avatar
赵小蒙 committed
63

64
def fill_spans_in_blocks(blocks, spans, radio):
    """Place each span into the block it overlaps, based on position.

    Blocks are handled in order. A span is assigned to the first block for
    which ``calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox)``
    is greater than ``radio``. Assigned spans are dropped from ``spans`` so
    that later blocks cannot claim them again.

    Args:
        blocks: Sequence of block records. Items ``0:4`` are used as the
            bbox, item ``7`` as the block type and, for image/table body,
            caption and footnote blocks, the last item as ``group_id``.
        spans: List of span dicts with a ``'bbox'`` key. Modified in place:
            assigned spans are removed from it.
        radio: Overlap-ratio threshold a span must exceed to be assigned
            (the parameter name is kept for caller compatibility).

    Returns:
        A ``(block_with_spans, spans)`` tuple: the list of block dicts
        (``'type'``, ``'bbox'``, optional ``'group_id'``, ``'spans'``) and the
        same ``spans`` list object, now holding only the unassigned spans.
    """
    block_with_spans = []
    for block in blocks:
        block_type = block[7]
        block_bbox = block[0:4]
        block_dict = {
            'type': block_type,
            'bbox': block_bbox,
        }
        if block_type in [
            BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
        ]:
            block_dict['group_id'] = block[-1]

        # Partition the remaining spans in a single pass instead of collecting
        # matches and calling list.remove() for each one (an equality-based
        # linear scan per removed span).
        block_spans = []
        leftover = []
        for span in spans:
            overlap = calculate_overlap_area_in_bbox1_area_ratio(
                span['bbox'], block_bbox)
            if overlap > radio:
                block_spans.append(span)
            else:
                leftover.append(span)

        # NOTE: inline-equation height adjustment, interline->inline equation
        # conversion and bbox overlap removal used to run here and are
        # disabled. Per the original author's note, overlap removal changes
        # span bboxes and makes later fill steps go wrong.

        block_dict['spans'] = block_spans
        block_with_spans.append(block_dict)

        # Drop the assigned spans from the caller's list, keeping the same
        # list object so existing references stay valid.
        if block_spans:
            spans[:] = leftover

    return block_with_spans, spans
赵小蒙's avatar
赵小蒙 committed
103
104


105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def fix_block_spans_v2(block_with_spans):
    """Convert the ``spans`` of every supported block into sorted ``lines``.

    Text-like blocks (text, title, image/table caption and footnote) are
    handled by ``fix_text_block``; interline-equation, image-body and
    table-body blocks by ``fix_interline_block``. Blocks of any other type
    are left out of the result.

    Original author's notes:
      1. Image and table blocks nest caption and footnote blocks, so the text
         spans of captions/footnotes must end up in the corresponding
         caption/footnote block inside the image/table block.
      2. The ``spans`` field of each block is deleted along the way.

    Args:
        block_with_spans: Iterable of block dicts carrying ``'type'`` and
            ``'spans'`` keys.

    Returns:
        A list with the fixed blocks, in input order.
    """
    def _fixer_for(block_type):
        # Choose the routine matching the block type; None means "skip".
        if block_type in [
            BlockType.Text, BlockType.Title,
            BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableCaption, BlockType.TableFootnote,
        ]:
            return fix_text_block
        if block_type in [
            BlockType.InterlineEquation, BlockType.ImageBody,
            BlockType.TableBody,
        ]:
            return fix_interline_block
        return None

    fix_blocks = []
    for block in block_with_spans:
        fixer = _fixer_for(block['type'])
        if fixer is not None:
            fix_blocks.append(fixer(block))
    return fix_blocks


126
127
128
129
130
131
132
133
def fix_discarded_block(discarded_block_with_spans):
    """Run ``fix_text_block`` on every discarded block.

    Args:
        discarded_block_with_spans: Iterable of block dicts with a
            ``'spans'`` key.

    Returns:
        A list with the result of ``fix_text_block`` for each block, in
        input order.
    """
    return [fix_text_block(block) for block in discarded_block_with_spans]


134
def fix_text_block(block):
    """Replace a text block's ``spans`` with left-to-right sorted ``lines``.

    Any interline-equation span found in a text block is first retyped as an
    inline equation, since equations inside text blocks should be inline.

    Args:
        block: Block dict with a ``'spans'`` list. Modified in place:
            ``'lines'`` is added and ``'spans'`` is deleted.

    Returns:
        The same ``block`` dict.
    """
    block_spans = block['spans']
    for item in block_spans:
        if item['type'] == ContentType.InterlineEquation:
            item['type'] = ContentType.InlineEquation

    # Group spans into lines, then order each line's spans left-to-right.
    block['lines'] = line_sort_spans_by_left_to_right(
        merge_spans_to_line(block_spans))
    del block['spans']
    return block


def fix_interline_block(block):
    """Replace a block's ``spans`` with left-to-right sorted ``lines``.

    Args:
        block: Block dict with a ``'spans'`` list. Modified in place:
            ``'lines'`` is added and ``'spans'`` is deleted.

    Returns:
        The same ``block`` dict.
    """
    # Group spans into lines, then order each line's spans left-to-right.
    grouped = merge_spans_to_line(block['spans'])
    block['lines'] = line_sort_spans_by_left_to_right(grouped)
    del block['spans']
    return block