ocr_dict_merge.py 5.37 KB
Newer Older
1
from magic_pdf.config.ocr_content_type import BlockType, ContentType
2
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, calculate_overlap_area_in_bbox1_area_ratio
赵小蒙's avatar
赵小蒙 committed
3
4


5
6
7
8
# Sort the spans inside every line from left to right.
def line_sort_spans_by_left_to_right(lines):
    """Sort each line's spans by x0 and wrap the line with its bounding box.

    Args:
        lines: Iterable of lines; each line is a list of span dicts whose
            ``'bbox'`` entry is indexed as ``[x0, y0, x1, y1]``.

    Returns:
        A list with one ``{'bbox': [x0, y0, x1, y1], 'spans': line}`` dict per
        non-empty line. ``'spans'`` is the very list object that was passed
        in, sorted in place by the left edge of each span.
    """
    line_objects = []
    for line in lines:
        if not line:
            # A line without spans has no bounding box: min()/max() over an
            # empty sequence would raise ValueError, so skip it.
            continue
        # Order the spans by their x0 coordinate.
        line.sort(key=lambda span: span['bbox'][0])
        # The line bbox is the union of all span bboxes.
        line_bbox = [
            min(span['bbox'][0] for span in line),  # x0
            min(span['bbox'][1] for span in line),  # y0
            max(span['bbox'][2] for span in line),  # x1
            max(span['bbox'][3] for span in line),  # y1
        ]
        line_objects.append({
            'bbox': line_bbox,
            'spans': line,
        })
    return line_objects

赵小蒙's avatar
赵小蒙 committed
23

24
def _is_standalone_span_type(span_type):
    """Return True for span types that always occupy a line of their own."""
    return span_type in (
        ContentType.InterlineEquation,
        ContentType.Image,
        ContentType.Table,
    )


def merge_spans_to_line(spans, threshold=0.6):
    """Group spans into lines based on their vertical overlap.

    Args:
        spans: List of span dicts with ``'bbox'`` and ``'type'`` entries.
            The list is sorted in place by ``bbox[1]`` (y0).
        threshold: Forwarded to ``__is_overlaps_y_exceeds_threshold`` when
            comparing a span with the last span of the current line
            (presumably the minimum y-overlap ratio; confirm in boxbase).

    Returns:
        A list of lines, each a list of spans in y0 order. Interline-equation,
        image and table spans always end up alone in their line. Returns
        ``[]`` when ``spans`` is empty.
    """
    if not spans:
        return []

    # Sort by the y0 coordinate. This is deliberately in place, so the
    # caller's list is reordered exactly as before.
    spans.sort(key=lambda span: span['bbox'][1])

    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        previous = current_line[-1]
        # A standalone span is only ever the sole element of a line, so
        # checking the last span is equivalent to scanning the whole line.
        if (_is_standalone_span_type(span['type'])
                or _is_standalone_span_type(previous['type'])):
            # Either side must be alone: start a new line.
            lines.append(current_line)
            current_line = [span]
        elif __is_overlaps_y_exceeds_threshold(span['bbox'], previous['bbox'], threshold):
            # Enough y-overlap with the last span of the line: same line.
            current_line.append(span)
        else:
            # Otherwise start a new line.
            lines.append(current_line)
            current_line = [span]

    # Flush the last line (never empty at this point).
    lines.append(current_line)
    return lines
赵小蒙's avatar
赵小蒙 committed
61

赵小蒙's avatar
赵小蒙 committed
62

63
64
65
66
67
68
69
70
71
72
73
74
75
def span_block_type_compatible(span_type, block_type):
    """Tell whether a span of ``span_type`` may be placed in a ``block_type`` block.

    Text and inline-equation spans fit text-like blocks (text, title and the
    image/table captions and footnotes); interline equations, images and
    tables only fit their dedicated block type. Any other span type is
    never compatible.
    """
    # Each rule pairs a group of span types with the block types they accept.
    rules = (
        (
            (ContentType.Text, ContentType.InlineEquation),
            (
                BlockType.Text, BlockType.Title,
                BlockType.ImageCaption, BlockType.ImageFootnote,
                BlockType.TableCaption, BlockType.TableFootnote,
            ),
        ),
        ((ContentType.InterlineEquation,), (BlockType.InterlineEquation,)),
        ((ContentType.Image,), (BlockType.ImageBody,)),
        ((ContentType.Table,), (BlockType.TableBody,)),
    )
    for accepted_span_types, accepted_block_types in rules:
        if span_type in accepted_span_types:
            return block_type in accepted_block_types
    return False


76
def fill_spans_in_blocks(blocks, spans, radio):
    """Distribute spans into blocks according to their position.

    Blocks are processed in order; a span goes to the first block for which
    ``calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox)``
    exceeds ``radio`` and whose type is compatible with the span type.

    Args:
        blocks: Iterable of indexable block records where ``[0:4]`` is the
            bbox, ``[7]`` is the block type and, for image/table body,
            caption and footnote blocks, ``[-1]`` is the group id.
        spans: List of span dicts with ``'bbox'`` and ``'type'`` entries.
            Modified in place: spans assigned to a block are removed.
        radio: Overlap threshold; the overlap must be strictly greater.
            (The parameter name is kept as-is for caller compatibility.)

    Returns:
        A ``(block_with_spans, spans)`` tuple: the list of block dicts
        (``'type'``, ``'bbox'``, optional ``'group_id'``, ``'spans'``) and the
        same ``spans`` list object, now holding only the unassigned spans.
    """
    block_with_spans = []
    for block in blocks:
        block_type = block[7]
        block_bbox = block[0:4]
        block_dict = {
            'type': block_type,
            'bbox': block_bbox,
        }
        if block_type in [
            BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
        ]:
            block_dict['group_id'] = block[-1]

        # Partition the spans in a single pass instead of calling
        # list.remove() (an equality scan) once per matched span.
        block_spans = []
        remaining_spans = []
        for span in spans:
            span_bbox = span['bbox']
            if (calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio
                    and span_block_type_compatible(span['type'], block_type)):
                block_spans.append(span)
            else:
                remaining_spans.append(span)

        block_dict['spans'] = block_spans
        block_with_spans.append(block_dict)

        # Drop the spans that were just assigned. Slice assignment keeps the
        # caller's list object, which is also what gets returned.
        if block_spans:
            spans[:] = remaining_spans

    return block_with_spans, spans
赵小蒙's avatar
赵小蒙 committed
106
107


108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def fix_block_spans_v2(block_with_spans):
    """Convert the spans of every supported block into sorted lines.

    Text-like blocks (text, title, image/table captions and footnotes) go
    through ``fix_text_block``; interline-equation, image-body and
    table-body blocks go through ``fix_interline_block``. Blocks of any
    other type are dropped from the result.
    """
    repaired = []
    for blk in block_with_spans:
        kind = blk['type']
        if kind in (
            BlockType.Text, BlockType.Title,
            BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableCaption, BlockType.TableFootnote,
        ):
            repaired.append(fix_text_block(blk))
        elif kind in (
            BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody,
        ):
            repaired.append(fix_interline_block(blk))
        # Unknown block types are intentionally skipped.
    return repaired


126
127
128
129
130
131
132
133
def fix_discarded_block(discarded_block_with_spans):
    """Run ``fix_text_block`` on every discarded block and return the results in order."""
    return [fix_text_block(blk) for blk in discarded_block_with_spans]


134
def fix_text_block(block):
    """Turn the spans of a text-like block into left-to-right sorted lines.

    Args:
        block: Block dict with a ``'spans'`` list. It is modified in place:
            interline-equation spans are retyped as inline equations,
            ``'lines'`` is added and ``'spans'`` is removed.

    Returns:
        The same ``block`` dict.
    """
    # Any formula span inside a text block must be treated as an inline one.
    for span in block['spans']:
        if span['type'] == ContentType.InterlineEquation:
            span['type'] = ContentType.InlineEquation
    block_lines = merge_spans_to_line(block['spans'])
    sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
    block['lines'] = sort_block_lines
    del block['spans']
    return block


def fix_interline_block(block):
    """Turn the spans of an interline-equation/image/table block into sorted lines.

    Args:
        block: Block dict with a ``'spans'`` list. It is modified in place:
            ``'lines'`` is added and ``'spans'`` is removed.

    Returns:
        The same ``block`` dict.
    """
    block_lines = merge_spans_to_line(block['spans'])
    sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
    block['lines'] = sort_block_lines
    del block['spans']
    return block