pdf_parse_by_ocr.py 3.45 KB
Newer Older
1
2
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
赵小蒙's avatar
赵小蒙 committed
3
4


5
def construct_page_component(page_id, blocks, layout_bboxes):
    """Bundle the parsed results of one page into a single dict.

    Args:
        page_id: Index of the page; stored under ``'page_idx'``.
        blocks: Blocks built for the page; stored under ``'preproc_blocks'``.
        layout_bboxes: Layout boxes of the page; stored under
            ``'layout_bboxes'``.

    Returns:
        A new dict with the keys ``'preproc_blocks'``, ``'page_idx'`` and
        ``'layout_bboxes'`` (in that order). The arguments are stored by
        reference, not copied.
    """
    return {
        'preproc_blocks': blocks,
        'page_idx': page_id,
        'layout_bboxes': layout_bboxes,
    }


def parse_pdf_by_ocr(
赵小蒙's avatar
赵小蒙 committed
15
    ocr_pdf_info,
赵小蒙's avatar
赵小蒙 committed
16
17
18
    start_page_id=0,
    end_page_id=None,
):
赵小蒙's avatar
赵小蒙 committed
19

赵小蒙's avatar
赵小蒙 committed
20
21
22
23
24
25
26
27
    pdf_info_dict = {}
    end_page_id = end_page_id if end_page_id else len(ocr_pdf_info) - 1
    for page_id in range(start_page_id, end_page_id + 1):
        ocr_page_info = ocr_pdf_info[page_id]
        layout_dets = ocr_page_info['layout_dets']
        spans = []
        for layout_det in layout_dets:
            category_id = layout_det['category_id']
赵小蒙's avatar
赵小蒙 committed
28
            allow_category_id_list = [1, 7, 13, 14, 15]
赵小蒙's avatar
赵小蒙 committed
29
30
31
            if category_id in allow_category_id_list:
                x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
                bbox = [int(x0), int(y0), int(x1), int(y1)]
赵小蒙's avatar
赵小蒙 committed
32
33
34
35
36
37
38
39
40
41
42
43
44
45
                '''要删除的'''
                #  3: 'header',      # 页眉
                #  4: 'page number', # 页码
                #  5: 'footnote',    # 脚注
                #  6: 'footer',      # 页脚
                '''当成span拼接的'''
                #  1: 'image', # 图片
                #  7: 'table',       # 表格
                #  13: 'inline_equation',     # 行内公式
                #  14: 'displayed_equation',      # 行间公式
                #  15: 'text',      # ocr识别文本
                '''layout信息'''
                #  11: 'full column',   # 单栏
                #  12: 'sub column',    # 多栏
赵小蒙's avatar
赵小蒙 committed
46
47
48
                span = {
                    'bbox': bbox,
                }
赵小蒙's avatar
赵小蒙 committed
49
50
51
52
53
                if category_id == 1:
                    span['type'] = 'image'
                elif category_id == 7:
                    span['type'] = 'table'
                elif category_id == 13:
赵小蒙's avatar
赵小蒙 committed
54
55
56
57
58
59
60
61
62
63
64
65
66
                    span['content'] = layout_det['latex']
                    span['type'] = 'inline_equation'
                elif category_id == 14:
                    span['content'] = layout_det['latex']
                    span['type'] = 'displayed_equation'
                elif category_id == 15:
                    span['content'] = layout_det['text']
                    span['type'] = 'text'
                # print(span)
                spans.append(span)
            else:
                continue

赵小蒙's avatar
赵小蒙 committed
67
68
69
70
71
72
73
74
75
76
        # 删除重叠spans中较小的那些
        spans = remove_overlaps_min_spans(spans)

        # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0


        # 将spans合并成line(从上到下,从左到右)
        lines = merge_spans_to_line(spans)
        # logger.info(lines)

赵小蒙's avatar
赵小蒙 committed
77
78
79
80
81
82
83
84
        # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
        blocks = []
        for line in lines:
            blocks.append({
                "bbox": line['bbox'],
                "lines": [line],
            })

85
86
87
        # 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
        layout_bboxes = layout_detect(ocr_page_info['subfield_dets'])

赵小蒙's avatar
赵小蒙 committed
88
        # 构造pdf_info_dict
89
        page_info = construct_page_component(page_id, blocks, layout_bboxes)
赵小蒙's avatar
赵小蒙 committed
90
91
92
93
        pdf_info_dict[f"page_{page_id}"] = page_info

    return pdf_info_dict