"src/targets/vscode:/vscode.git/clone" did not exist on "bfe935a946278cab536fc6e71f0ee09fe99e1ccc"
ocr_detect_all_bboxes.py 4.84 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
from loguru import logger

赵小蒙's avatar
赵小蒙 committed
3
4
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
    calculate_iou
赵小蒙's avatar
赵小蒙 committed
5
from magic_pdf.libs.drop_tag import DropTag
赵小蒙's avatar
赵小蒙 committed
6
from magic_pdf.libs.ocr_content_type import BlockType
7
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
赵小蒙's avatar
赵小蒙 committed
8
9


赵小蒙's avatar
赵小蒙 committed
10
11
12
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
                                        title_blocks, interline_equation_blocks, page_w, page_h):
    """Merge all detected blocks of a page into flat rows ready for layout splitting.

    Every input block is read through its ``'bbox'`` (four coordinates) and
    ``"score"`` entries and converted to a 13-element row::

        [x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, score]

    Args:
        img_blocks: Image blocks.
        table_blocks: Table blocks.
        discarded_blocks: Blocks flagged as discarded.
        text_blocks: Text blocks.
        title_blocks: Title blocks.
        interline_equation_blocks: Interline-equation blocks.
        page_w: Page width, used by the footnote heuristic.
        page_h: Page height, used by the footnote heuristic.

    Returns:
        A tuple ``(all_bboxes, all_discarded_blocks, drop_reasons)`` where
        ``all_bboxes`` and ``drop_reasons`` are the two values returned by
        ``remove_overlap_between_bbox_for_block``, and ``all_discarded_blocks``
        holds the discarded rows after the nested-box cleanup.
    """

    def _to_row(source, block_type):
        # Row layout: bbox coordinates, three placeholders, the type tag,
        # four more placeholders, then the detection score.
        left, top, right, bottom = source['bbox']
        return [left, top, right, bottom, None, None, None, block_type, None, None, None, None, source["score"]]

    all_bboxes = []
    all_bboxes.extend(_to_row(item, BlockType.Image) for item in img_blocks)
    all_bboxes.extend(_to_row(item, BlockType.Table) for item in table_blocks)
    all_bboxes.extend(_to_row(item, BlockType.Text) for item in text_blocks)
    all_bboxes.extend(_to_row(item, BlockType.Title) for item in title_blocks)
    all_bboxes.extend(_to_row(item, BlockType.InterlineEquation) for item in interline_equation_blocks)

    # Resolve nested blocks.
    # When a text box and a title box overlap, trust the text box.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # When any box overlaps a discarded box, trust the discarded box.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Every discarded block is recorded as a Discarded row. Those that are wider
    # than a third of the page, taller than 10 and start in the lower half of
    # the page are additionally treated as footnotes.
    all_discarded_blocks = []
    for dropped in discarded_blocks:
        discarded_row = _to_row(dropped, BlockType.Discarded)
        all_discarded_blocks.append(discarded_row)
        left, top, right, bottom = discarded_row[:4]
        # Footnotes take part in the layout computation, so add them to all_bboxes.
        if (right - left) > (page_w / 3) and (bottom - top) > 10 and top > (page_h / 2):
            all_bboxes.append(_to_row(dropped, BlockType.Footnote))

    # Large boxes may still enclose small ones at this point; drop the small ones.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
    # Separate the remaining boxes so the later layout split does not fail.
    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)

    return all_bboxes, all_discarded_blocks, drop_reasons
赵小蒙's avatar
赵小蒙 committed
55
56
57
58
59
60
61
62
63
64
65
66


def fix_text_overlap_title_blocks(all_bboxes):
    """Drop title blocks that almost coincide with a text block.

    A title row is removed when its bbox has an IoU greater than 0.8 with the
    bbox of at least one text row; the text row is kept.

    Args:
        all_bboxes: List of block rows; the first four items of a row are the
            bbox and item 7 is the block type. The list is modified in place.

    Returns:
        The same ``all_bboxes`` list, without the overlapping title rows.
    """
    text_blocks = [block for block in all_bboxes if block[7] == BlockType.Text]
    title_blocks = [block for block in all_bboxes if block[7] == BlockType.Title]

    # Decide once per title block. Removing inside a text x title double loop
    # would call list.remove() a second time for a title that matches several
    # text blocks, which raises ValueError.
    for title_block in title_blocks:
        title_bbox = title_block[:4]
        if any(calculate_iou(text_block[:4], title_bbox) > 0.8 for text_block in text_blocks):
            all_bboxes.remove(title_block)

    return all_bboxes


def remove_need_drop_blocks(all_bboxes, discarded_blocks):
    """Remove every block that is mostly covered by a discarded block.

    A row is dropped when ``calculate_overlap_area_in_bbox1_area_ratio`` of its
    bbox against the ``'bbox'`` of any discarded block exceeds 0.6.

    Args:
        all_bboxes: List of block rows whose first four items are the bbox.
            The list is modified in place.
        discarded_blocks: Discarded blocks, each providing a ``'bbox'`` entry.

    Returns:
        The same ``all_bboxes`` list after removal.
    """
    to_drop = []
    for candidate in all_bboxes:
        candidate_bbox = candidate[:4]
        covered = any(
            calculate_overlap_area_in_bbox1_area_ratio(candidate_bbox, dropped['bbox']) > 0.6
            for dropped in discarded_blocks
        )
        # Equal rows are recorded only once.
        if covered and candidate not in to_drop:
            to_drop.append(candidate)

    for candidate in to_drop:
        all_bboxes.remove(candidate)

    return all_bboxes

赵小蒙's avatar
赵小蒙 committed
93
94
95

def remove_overlaps_min_blocks(all_bboxes):
    """Among overlapping blocks, delete the smaller ones.

    Every pair of unequal rows is passed to ``get_minbox_if_overlap_by_ratio``
    with ratio 0.8. When that call returns a box, the first row whose bbox
    equals it is scheduled for removal.

    Args:
        all_bboxes: List of block rows whose first four items are the bbox.
            The list is modified in place.

    Returns:
        The same ``all_bboxes`` list after removal.
    """
    doomed = []
    for outer in all_bboxes:
        for inner in all_bboxes:
            if outer == inner:
                continue

            smaller_box = get_minbox_if_overlap_by_ratio(outer[:4], inner[:4], 0.8)
            if smaller_box is None:
                continue

            # Locate the first row that carries the returned bbox.
            match = None
            for candidate in all_bboxes:
                if candidate[:4] == smaller_box:
                    match = candidate
                    break

            if match is None or match in doomed:
                continue
            doomed.append(match)

    for victim in doomed:
        all_bboxes.remove(victim)

    return all_bboxes