ocr_fix_block_logic.py 4.16 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, line_sort_spans_by_left_to_right

def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
    block_spans = []
    # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
    for span in spans:
        if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.8:
            block_spans.append(span)
    block_lines = merge_spans_to_line(block_spans)
    # 对line中的span进行排序
    sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
    block = {
        'bbox': block_bbox,
        'block_type': block_type,
        'lines': sort_block_lines
    }
    return block, block_spans

def make_body_block(span: dict, block_bbox: list, block_type: str):
    # 创建body_block
    body_line = {
        'bbox': block_bbox,
        'spans': [span],
    }
    body_block = {
        'bbox': block_bbox,
        'block_type': block_type,
        'lines': [body_line]
    }
    return body_block


def fix_image_block(block, img_blocks):
    block['blocks'] = []
    # 遍历img_blocks,找到与当前block匹配的img_block
    for img_block in img_blocks:
        if img_block['bbox'] == block['bbox']:
            # 创建img_body_block
            for span in block['spans']:
                if span['type'] == ContentType.Image and span['bbox'] == img_block['img_body_bbox']:
                    # 创建img_body_block
                    img_body_block = make_body_block(span, img_block['img_body_bbox'], 'img_body_block')
                    block['blocks'].append(img_body_block)

                    # 从spans中移除img_body_block中已经放入的span
                    block['spans'].remove(span)
                    break

            # 根据list长度,判断img_block中是否有img_caption
            if len(img_block['img_caption_bbox']) > 0:
                img_caption_block, img_caption_spans = merge_spans_to_block(
                    block['spans'], img_block['img_caption_bbox'], 'img_caption_block'
                )
                block['blocks'].append(img_caption_block)

            break
    del block['spans']
    return block


def fix_table_block(block, table_blocks):
    block['blocks'] = []
    # 遍历table_blocks,找到与当前block匹配的table_block
    for table_block in table_blocks:
        if table_block['bbox'] == block['bbox']:
            # 创建table_body_block
            for span in block['spans']:
                if span['type'] == ContentType.Table and span['bbox'] == table_block['table_body_bbox']:
                    # 创建table_body_block
                    table_body_block = make_body_block(span, table_block['table_body_bbox'], 'table_body_block')
                    block['blocks'].append(table_body_block)

                    # 从spans中移除img_body_block中已经放入的span
                    block['spans'].remove(span)
                    break

            # 根据list长度,判断table_block中是否有caption
            if len(table_block['table_caption_bbox']) > 0:
                table_caption_block, table_caption_spans = merge_spans_to_block(
                    block['spans'], table_block['table_caption_bbox'], 'table_caption_block'
                )
                block['blocks'].append(table_caption_block)

                # 如果table_caption_block_spans不为空
                if len(table_caption_spans) > 0:
                    #  一些span已经放入了caption_block中,需要从block['spans']中删除
                    for span in table_caption_spans:
                        block['spans'].remove(span)

            # 根据list长度,判断table_block中是否有table_note
            if len(table_block['table_footnote_bbox']) > 0:
                table_footnote_block, table_footnote_spans = merge_spans_to_block(
                    block['spans'], table_block['table_footnote_bbox'], 'table_footnote_block'
                )
                block['blocks'].append(table_footnote_block)

            break
    del block['spans']
    return block