pdf_parse_by_ocr.py 2.74 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
from magic_pdf.libs.ocr_dict_merge import merge_spans


def construct_page_component(page_id, text_blocks_preproc):
    """Bundle one page's pre-processed blocks together with its page index.

    Args:
        page_id: Index of the page; stored under the ``'page_idx'`` key.
        text_blocks_preproc: Blocks built for the page; stored as-is (not
            copied) under the ``'preproc_blocks'`` key.

    Returns:
        A new dict with the keys ``'preproc_blocks'`` and ``'page_idx'``.
    """
    return dict(preproc_blocks=text_blocks_preproc, page_idx=page_id)


# Layout-detection categories that are converted into spans, mapped to
# (key of the detection holding the recognised content, resulting span type).
#   13: embedded (inline) formula
#   14: isolated (single-line) formula
#   15: OCR-recognised text
_SPAN_SPEC_BY_CATEGORY = {
    13: ('latex', 'inline_equation'),
    14: ('latex', 'displayed_equation'),
    15: ('text', 'text'),
}


def _layout_det_to_span(layout_det):
    """Convert one layout detection into a span dict, or None if its category is not handled."""
    spec = _SPAN_SPEC_BY_CATEGORY.get(layout_det['category_id'])
    if spec is None:
        return None
    content_key, span_type = spec
    # 'poly' carries eight numbers; the 1st/2nd and 5th/6th are used as the
    # bbox corners (presumably top-left and bottom-right of a 4-point
    # polygon -- TODO confirm against the OCR model output format).
    x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
    return {
        'bbox': [int(x0), int(y0), int(x1), int(y1)],
        'content': layout_det[content_key],
        'type': span_type,
    }


def _remove_overlapping_spans(spans):
    """Drop, in place, spans whose bbox is reported as the overlapping min box of a pair."""
    # The outer loop walks a snapshot taken before any removal; the inner loop
    # re-snapshots the current list on every outer iteration.
    # NOTE(review): spans are compared by dict equality (not identity), and a
    # span removed earlier can still act as ``span1`` later on. Kept as-is
    # because the exact contract of get_minbox_if_overlap_by_ratio is not
    # visible here -- worth revisiting.
    for span1 in spans.copy():
        for span2 in spans.copy():
            if span1 == span2:
                continue
            # Returns None or a box that is then matched against span bboxes
            # (presumably the smaller box when the overlap ratio exceeds 0.8
            # -- confirm in magic_pdf.libs.boxbase).
            overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.8)
            if overlap_box is None:
                continue
            span_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
            if span_to_remove is not None:
                spans.remove(span_to_remove)


def parse_pdf_by_ocr(
    ocr_pdf_info,
    start_page_id=0,
    end_page_id=None,
):
    """Build per-page block structures from OCR layout detections.

    For every page in the requested range, the detections with category 13,
    14 or 15 are turned into spans, overlapping spans are pruned, the rest is
    merged into lines with ``merge_spans`` and each line is wrapped in its
    own block.

    Args:
        ocr_pdf_info: Sequence indexed by page id; each item must provide a
            ``'layout_dets'`` list of detection dicts with ``'category_id'``,
            ``'poly'`` and ``'latex'`` or ``'text'`` (depending on category).
        start_page_id: First page index to process (inclusive).
        end_page_id: Last page index to process (inclusive). ``None`` means
            "through the last page". An explicit ``0`` is honoured and limits
            processing to page 0 (previously it was mistaken for "not given").

    Returns:
        A dict mapping ``"page_<id>"`` to the page component produced by
        ``construct_page_component``. Empty if the page range is empty.

    Raises:
        IndexError: If the page range reaches outside ``ocr_pdf_info``.
        KeyError: If a page or detection lacks one of the expected keys.
    """
    pdf_info_dict = {}
    # Compare against None explicitly: 0 is a valid last-page index.
    if end_page_id is None:
        end_page_id = len(ocr_pdf_info) - 1

    for page_id in range(start_page_id, end_page_id + 1):
        ocr_page_info = ocr_pdf_info[page_id]

        spans = []
        for layout_det in ocr_page_info['layout_dets']:
            span = _layout_det_to_span(layout_det)
            if span is not None:
                spans.append(span)

        # Remove overlapping spans.
        _remove_overlapping_spans(spans)

        # Merge the spans into lines.
        lines = merge_spans(spans)

        # Blocks are not stitched together yet; this only sets up the
        # structure: every block holds exactly one line and shares its bbox.
        blocks = [
            {
                "bbox": line['bbox'],
                "lines": [line],
            }
            for line in lines
        ]

        # Assemble the entry for this page.
        pdf_info_dict[f"page_{page_id}"] = construct_page_component(page_id, blocks)

    return pdf_info_dict