"backend/apps/webui/routers/chats.py" did not exist on "e2a8ad5fca0bae1d619cae9b436a9a9c3c0ab50e"
ocr_fix_block_logic.py 4.41 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
赵小蒙's avatar
赵小蒙 committed
2
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
赵小蒙's avatar
赵小蒙 committed
3
4
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, line_sort_spans_by_left_to_right

赵小蒙's avatar
赵小蒙 committed
5

赵小蒙's avatar
赵小蒙 committed
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
    """Build a block of ``block_type`` from the spans that fall inside ``block_bbox``.

    Args:
        spans: Candidate span dicts; each must carry a ``'bbox'`` entry.
        block_bbox: Bounding box of the block being assembled.
        block_type: Value stored under the block's ``'block_type'`` key.

    Returns:
        A ``(block, block_spans)`` tuple. ``block`` is a dict with the keys
        ``'bbox'``, ``'block_type'`` and ``'lines'``; ``block_spans`` is the
        list of spans that were selected for it (same objects, input order).
    """
    # A span is claimed by the block when the overlap ratio reported by the
    # helper exceeds 0.8 (presumably the share of the span's own area that
    # lies inside block_bbox -- confirm against magic_pdf.libs.boxbase).
    block_spans = [
        candidate
        for candidate in spans
        if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], block_bbox) > 0.8
    ]

    # Group the selected spans into lines, then order the spans inside each
    # line (left to right, judging by the helper's name).
    lines = line_sort_spans_by_left_to_right(merge_spans_to_line(block_spans))

    return {
        'bbox': block_bbox,
        'block_type': block_type,
        'lines': lines,
    }, block_spans

赵小蒙's avatar
赵小蒙 committed
22

赵小蒙's avatar
赵小蒙 committed
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def make_body_block(span: dict, block_bbox: list, block_type: str):
    """Wrap a single span in a one-line block.

    Args:
        span: The span that makes up the whole body.
        block_bbox: Bounding box used for both the block and its only line.
        block_type: Value stored under the block's ``'block_type'`` key.

    Returns:
        A dict with ``'bbox'``, ``'block_type'`` and ``'lines'``, where
        ``'lines'`` holds exactly one line containing exactly ``span``.
    """
    return {
        'bbox': block_bbox,
        'block_type': block_type,
        # The body consists of one line that shares the block's bbox.
        'lines': [
            {
                'bbox': block_bbox,
                'spans': [span],
            },
        ],
    }


def fix_image_block(block, img_blocks):
    """Turn an image block's flat span list into nested sub-blocks.

    The block is modified in place: a ``'blocks'`` list is added and the
    ``'spans'`` key is removed. Only the first entry of ``img_blocks`` whose
    ``'bbox'`` equals ``block['bbox']`` is used; if none matches,
    ``'blocks'`` stays empty.

    Args:
        block: Dict with at least ``'bbox'`` and ``'spans'``.
        img_blocks: Iterable of dicts with ``'bbox'``, ``'img_body_bbox'``
            and ``'img_caption_bbox'``.

    Returns:
        The same ``block`` object, now carrying ``'blocks'`` instead of
        ``'spans'``.
    """
    block['blocks'] = []
    # Look for the img_block that describes this block.
    for img_block in img_blocks:
        if img_block['bbox'] == block['bbox']:
            # The image body is the image span whose bbox equals the
            # declared body bbox.
            for span in block['spans']:
                if span['type'] == ContentType.Image and span['bbox'] == img_block['img_body_bbox']:
                    img_body_block = make_body_block(
                        span, img_block['img_body_bbox'], BlockType.ImageBody
                    )
                    block['blocks'].append(img_body_block)

                    # The span now lives in the body block; take it out of
                    # the pool. Mutating the list is safe because we stop
                    # iterating immediately afterwards.
                    block['spans'].remove(span)
                    break

            # A non-empty caption bbox means the image has a caption.
            if len(img_block['img_caption_bbox']) > 0:
                img_caption_block, _ = merge_spans_to_block(
                    block['spans'], img_block['img_caption_bbox'], BlockType.ImageCaption
                )
                block['blocks'].append(img_caption_block)

            break
    del block['spans']
    return block


def fix_table_block(block, table_blocks):
    """Turn a table block's flat span list into nested sub-blocks.

    The block is modified in place: a ``'blocks'`` list is added and the
    ``'spans'`` key is removed. Only the first entry of ``table_blocks``
    whose ``'bbox'`` equals ``block['bbox']`` is used; if none matches,
    ``'blocks'`` stays empty.

    Args:
        block: Dict with at least ``'bbox'`` and ``'spans'``.
        table_blocks: Iterable of dicts with ``'bbox'``,
            ``'table_body_bbox'``, ``'table_caption_bbox'`` and
            ``'table_footnote_bbox'``.

    Returns:
        The same ``block`` object, now carrying ``'blocks'`` instead of
        ``'spans'``.
    """
    block['blocks'] = []
    # Look for the table_block that describes this block.
    for table_block in table_blocks:
        if table_block['bbox'] == block['bbox']:
            # The table body is the table span whose bbox equals the
            # declared body bbox.
            for span in block['spans']:
                if span['type'] == ContentType.Table and span['bbox'] == table_block['table_body_bbox']:
                    table_body_block = make_body_block(
                        span, table_block['table_body_bbox'], BlockType.TableBody
                    )
                    block['blocks'].append(table_body_block)

                    # The span now lives in the body block; take it out of
                    # the pool. Mutating the list is safe because we stop
                    # iterating immediately afterwards.
                    block['spans'].remove(span)
                    break

            # A non-empty caption bbox means the table has a caption.
            if len(table_block['table_caption_bbox']) > 0:
                table_caption_block, table_caption_spans = merge_spans_to_block(
                    block['spans'], table_block['table_caption_bbox'], BlockType.TableCaption
                )
                block['blocks'].append(table_caption_block)

                # Spans claimed by the caption must not be offered to the
                # footnote as well, so drop them from the pool.
                for span in table_caption_spans:
                    block['spans'].remove(span)

            # A non-empty footnote bbox means the table has a footnote.
            if len(table_block['table_footnote_bbox']) > 0:
                table_footnote_block, _ = merge_spans_to_block(
                    block['spans'], table_block['table_footnote_bbox'], BlockType.TableFootnote
                )
                block['blocks'].append(table_footnote_block)

            break
    del block['spans']
    return block
赵小蒙's avatar
赵小蒙 committed
104
105
106
107
108
109
110
111
112
113


def fix_text_block(block):
    """Replace a text block's ``'spans'`` with a ``'lines'`` entry.

    The block is modified in place: its spans are grouped into lines by
    ``merge_spans_to_line``, passed through
    ``line_sort_spans_by_left_to_right``, and stored under ``'lines'``;
    the ``'spans'`` key is then removed.

    Args:
        block: Dict with at least a ``'spans'`` entry.

    Returns:
        The same ``block`` object.
    """
    block['lines'] = line_sort_spans_by_left_to_right(
        merge_spans_to_line(block['spans'])
    )
    # Spans are now reachable through the lines; drop the flat list.
    del block['spans']
    return block