ocr_dict_merge.py 5.38 KB
Newer Older
1
from magic_pdf.config.ocr_content_type import BlockType, ContentType
2
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, calculate_overlap_area_in_bbox1_area_ratio
赵小蒙's avatar
赵小蒙 committed
3
4


5
6
7
8
def line_sort_spans_by_left_to_right(lines):
    """Sort the spans of every line from left to right and wrap each line.

    Args:
        lines: Iterable of lines. Each line is a list of span dicts whose
            ``'bbox'`` value is indexable as ``[x0, y0, x1, y1]``. Every
            inner list is sorted in place.

    Returns:
        A list holding one ``{'bbox': [x0, y0, x1, y1], 'spans': line}``
        dict per input line, where ``bbox`` is the union of the span boxes
        and ``spans`` is the very same (now sorted) list object.

    Raises:
        ValueError: If a line holds no spans, because ``min``/``max`` are
            applied to an empty sequence.
    """
    line_objects = []
    for line in lines:
        # Order the spans by their x0 coordinate.
        line.sort(key=lambda span: span['bbox'][0])
        line_bbox = [
            min(span['bbox'][0] for span in line),  # x0
            min(span['bbox'][1] for span in line),  # y0
            max(span['bbox'][2] for span in line),  # x1
            max(span['bbox'][3] for span in line),  # y1
        ]
        line_objects.append({
            'bbox': line_bbox,
            'spans': line,
        })
    return line_objects

赵小蒙's avatar
赵小蒙 committed
23

24
def merge_spans_to_line(spans, threshold=0.6):
    """Group spans into lines based on their vertical position.

    Args:
        spans: List of span dicts with a ``'bbox'`` (``[x0, y0, x1, y1]``)
            and a ``'type'`` entry. The list is sorted in place by ``y0``.
        threshold: Value forwarded to ``__is_overlaps_y_exceeds_threshold``
            when comparing a span with the last span of the current line.

    Returns:
        A list of lines, each a list of spans. An empty input yields ``[]``.
    """
    if not spans:
        return []

    def _is_standalone(item):
        # Interline equations, images and tables always get their own line.
        return item['type'] in (
            ContentType.InterlineEquation,
            ContentType.Image,
            ContentType.Table,
        )

    # Sort by the y0 coordinate (in place, as callers may observe the order).
    spans.sort(key=lambda span: span['bbox'][1])

    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        # Start a new line when the incoming span is a standalone type, or
        # when the current line already contains a standalone span.
        if _is_standalone(span) or any(_is_standalone(s) for s in current_line):
            lines.append(current_line)
            current_line = [span]
            continue

        # Keep the span on the current line when the helper reports enough
        # y-axis overlap with the line's last span; otherwise open a new line.
        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], threshold):
            current_line.append(span)
        else:
            lines.append(current_line)
            current_line = [span]

    # Flush the final line; it always holds at least one span here.
    lines.append(current_line)
    return lines
赵小蒙's avatar
赵小蒙 committed
61

赵小蒙's avatar
赵小蒙 committed
62

63
64
65
66
def span_block_type_compatible(span_type, block_type):
    """Tell whether a span of ``span_type`` may be placed in ``block_type``.

    Args:
        span_type: A ``ContentType`` value describing the span.
        block_type: A ``BlockType`` value describing the candidate block.

    Returns:
        ``True`` when the pairing is allowed, ``False`` otherwise (including
        for any span type not listed below).
    """
    if span_type in (ContentType.Text, ContentType.InlineEquation):
        # Text-like spans fit every textual block, captions and footnotes.
        return block_type in (
            BlockType.Text,
            BlockType.Title,
            BlockType.ImageCaption,
            BlockType.ImageFootnote,
            BlockType.TableCaption,
            BlockType.TableFootnote,
        )
    elif span_type == ContentType.InterlineEquation:
        return block_type in (BlockType.InterlineEquation, BlockType.Text)
    elif span_type == ContentType.Image:
        return block_type in (BlockType.ImageBody,)
    elif span_type == ContentType.Table:
        return block_type in (BlockType.TableBody,)
    else:
        return False


76
def fill_spans_in_blocks(blocks, spans, radio):
    """Distribute the spans into the blocks according to their position.

    Args:
        blocks: Iterable of indexable block records. Items ``0:4`` are used
            as the bounding box, item ``7`` as the block type and, for image
            and table related blocks, the last item as the group id.
        spans: List of span dicts with ``'bbox'`` and ``'type'`` entries.
            Spans assigned to a block are removed from this list in place.
        radio: Overlap threshold (the name is kept for compatibility; it is
            compared against the value returned by
            ``calculate_overlap_area_in_bbox1_area_ratio``). A span is
            assigned only when the returned value is strictly greater.

    Returns:
        A tuple ``(block_with_spans, spans)``: the list of block dicts
        (``type``, ``bbox``, optional ``group_id`` and ``spans``) and the same
        ``spans`` list object, now holding only the unassigned spans.
    """
    block_with_spans = []
    for block in blocks:
        block_type = block[7]
        block_bbox = block[0:4]
        block_dict = {
            'type': block_type,
            'bbox': block_bbox,
        }
        if block_type in (
            BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote,
        ):
            block_dict['group_id'] = block[-1]

        # Partition in a single pass instead of calling spans.remove() for
        # every assigned span, which rescans the list on each removal.
        block_spans = []
        unassigned = []
        for span in spans:
            if (calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > radio
                    and span_block_type_compatible(span['type'], block_type)):
                block_spans.append(span)
            else:
                unassigned.append(span)

        block_dict['spans'] = block_spans
        block_with_spans.append(block_dict)

        # Drop the assigned spans from the caller's list, keeping its identity.
        if block_spans:
            spans[:] = unassigned

    return block_with_spans, spans
赵小蒙's avatar
赵小蒙 committed
106
107


108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def fix_block_spans_v2(block_with_spans):
    """Convert the spans of each supported block into sorted lines.

    Text-like blocks go through ``fix_text_block``; interline equation,
    image body and table body blocks go through ``fix_interline_block``.
    Blocks of any other type are left out of the result.

    Args:
        block_with_spans: Iterable of block dicts carrying a ``'type'`` key.

    Returns:
        The list of processed blocks, in input order.
    """
    fixed = []
    for candidate in block_with_spans:
        kind = candidate['type']

        if kind in (
            BlockType.Text, BlockType.Title,
            BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableCaption, BlockType.TableFootnote,
        ):
            fixer = fix_text_block
        elif kind in (BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody):
            fixer = fix_interline_block
        else:
            # Unsupported block type: skip it.
            continue

        fixed.append(fixer(candidate))
    return fixed


126
127
128
129
130
131
132
133
def fix_discarded_block(discarded_block_with_spans):
    """Run ``fix_text_block`` on every discarded block.

    Args:
        discarded_block_with_spans: Iterable of discarded block dicts.

    Returns:
        A list with the result of ``fix_text_block`` for each block, in
        input order.
    """
    return [fix_text_block(discarded) for discarded in discarded_block_with_spans]


134
def fix_text_block(block):
    """Replace the ``'spans'`` of a text block with sorted ``'lines'``.

    Args:
        block: Block dict with a ``'spans'`` list. It is modified in place:
            interline-equation spans are retyped as inline equations,
            ``'lines'`` is added and ``'spans'`` is deleted.

    Returns:
        The same ``block`` dict.
    """
    # Formula spans inside a text block must all become the inline type.
    for span in block['spans']:
        if span['type'] == ContentType.InterlineEquation:
            span['type'] = ContentType.InlineEquation
    block_lines = merge_spans_to_line(block['spans'])
    sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
    block['lines'] = sort_block_lines
    del block['spans']
    return block


def fix_interline_block(block):
    """Replace the ``'spans'`` of a block with sorted ``'lines'``.

    Unlike ``fix_text_block``, span types are left untouched.

    Args:
        block: Block dict with a ``'spans'`` list. It is modified in place:
            ``'lines'`` is added and ``'spans'`` is deleted.

    Returns:
        The same ``block`` dict.
    """
    block_lines = merge_spans_to_line(block['spans'])
    sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
    block['lines'] = sort_block_lines
    del block['spans']
    return block