ocr_detect_all_bboxes.py 1.58 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
from magic_pdf.libs.ocr_content_type import BlockType


赵小蒙's avatar
赵小蒙 committed
4
5
6
7
8
9
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
                                        title_blocks, interline_equation_blocks, page_w, page_h):
    all_bboxes = []

    for image in img_blocks:
        x0, y0, x1, y1 = image['bbox']
赵小蒙's avatar
赵小蒙 committed
10
        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None])
赵小蒙's avatar
赵小蒙 committed
11
12
13

    for table in table_blocks:
        x0, y0, x1, y1 = table['bbox']
赵小蒙's avatar
赵小蒙 committed
14
        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None])
赵小蒙's avatar
赵小蒙 committed
15
16
17

    for text in text_blocks:
        x0, y0, x1, y1 = text['bbox']
赵小蒙's avatar
赵小蒙 committed
18
        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None])
赵小蒙's avatar
赵小蒙 committed
19
20
21

    for title in title_blocks:
        x0, y0, x1, y1 = title['bbox']
赵小蒙's avatar
赵小蒙 committed
22
        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None])
赵小蒙's avatar
赵小蒙 committed
23
24
25

    for interline_equation in interline_equation_blocks:
        x0, y0, x1, y1 = interline_equation['bbox']
赵小蒙's avatar
赵小蒙 committed
26
        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None])
赵小蒙's avatar
赵小蒙 committed
27
28
29
30
31

    '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
赵小蒙's avatar
赵小蒙 committed
32
            all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])
赵小蒙's avatar
赵小蒙 committed
33
34
35

    return all_bboxes