# ocr_mkcontent.py - Markdown builders for OCR-parsed PDF page dictionaries.
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
# ContentType supplies the span-type constants compared against span['type'] below.
from magic_pdf.libs.ocr_content_type import ContentType


def ocr_mk_nlp_markdown(pdf_info_dict: dict) -> str:
    """Render the content-bearing spans of every page as Markdown text.

    Each page's ``"preproc_blocks"`` list is walked block by block and one
    Markdown line is emitted per entry of ``block['lines']``.  Spans whose
    ``content`` is missing or empty are skipped, so a line made only of such
    spans still produces an (otherwise empty) output line.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts; the keys are
            not used.  Pages without a non-empty ``"preproc_blocks"`` entry
            are ignored.

    Returns:
        All rendered lines joined with newlines, or an empty string when no
        page contributes a line.

    Raises:
        KeyError: If a block has no ``'lines'`` key or a line has no
            ``'spans'`` key.
    """
    markdown = []

    for page_info in pdf_info_dict.values():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                pieces = []
                for span in line['spans']:
                    if not span.get('content'):
                        continue
                    # Escape characters that Markdown would otherwise interpret.
                    # NOTE(review): the escaping is applied to equation content
                    # as well -- confirm the helper leaves LaTeX source intact.
                    content = ocr_escape_special_markdown_char(span['content'])
                    # .get() keeps a span without a 'type' key as plain text
                    # instead of raising KeyError.
                    span_type = span.get('type')
                    if span_type == ContentType.InlineEquation:
                        content = f"${content}$"
                    elif span_type == ContentType.InterlineEquation:
                        content = f"$$\n{content}\n$$"
                    pieces.append(content)
                # Two trailing spaces force a Markdown hard line break.
                markdown.append(' '.join(pieces).strip() + '  ')
    return '\n'.join(markdown)


def ocr_mk_mm_markdown(pdf_info_dict: dict) -> str:
    """Render every page as Markdown, including image references.

    Each page's ``"preproc_blocks"`` list is walked block by block and one
    Markdown line is emitted per entry of ``block['lines']``.  A span with
    non-empty ``content`` is escaped and, for equation types, wrapped in
    ``$`` / ``$$`` delimiters.  A span without content but with a non-empty
    ``image_path`` becomes a Markdown image reference.  Spans that have
    neither are skipped.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts; the keys are
            not used.  Pages without a non-empty ``"preproc_blocks"`` entry
            are ignored.

    Returns:
        All rendered lines joined with newlines, or an empty string when no
        page contributes a line.

    Raises:
        KeyError: If a block has no ``'lines'`` key or a line has no
            ``'spans'`` key.
    """
    markdown = []

    for page_info in pdf_info_dict.values():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                pieces = []
                for span in line['spans']:
                    if span.get('content'):
                        # Escape characters that Markdown would otherwise interpret.
                        # NOTE(review): the escaping is applied to equation
                        # content as well -- confirm the helper leaves LaTeX
                        # source intact.
                        content = ocr_escape_special_markdown_char(span['content'])
                        # .get() keeps a span without a 'type' key as plain
                        # text instead of raising KeyError.
                        span_type = span.get('type')
                        if span_type == ContentType.InlineEquation:
                            content = f"${content}$"
                        elif span_type == ContentType.InterlineEquation:
                            content = f"$$\n{content}\n$$"
                    elif span.get('image_path'):
                        # No text content: emit the span as an image reference.
                        content = f"![]({span['image_path']})"
                    else:
                        continue
                    pieces.append(content)
                # Two trailing spaces force a Markdown hard line break.
                markdown.append(' '.join(pieces).strip() + '  ')
    return '\n'.join(markdown)

def mk_mm_markdown2(pdf_info_dict: dict):
    """Build paragraph-level Markdown from each page's ``"para_blocks"``.

    Every paragraph (an iterable of line dicts) is flattened into a single
    string by concatenating its spans in order:

    * ``'text'``               -> the span content as-is
    * ``'inline_equation'``    -> `` $content$ `` (space padded on both sides)
    * ``'displayed_equation'`` -> ``$$``, newline, content, newline, ``$$``,
      followed by one space
    * ``'image'``              -> ``![](image_path)`` followed by one space

    Spans of any other type contribute nothing.  A paragraph with no
    recognised spans still yields an empty entry.  Pages without a non-empty
    ``"para_blocks"`` value are skipped.

    Returns the paragraphs joined by a blank line.
    """
    # NOTE(review): span types are matched against literal strings here rather
    # than the ContentType constants -- confirm the two agree.
    paragraphs = []
    for page in pdf_info_dict.values():
        para_blocks = page.get("para_blocks")
        if not para_blocks:
            continue
        for paragraph in para_blocks:
            fragments = []
            for line in paragraph:
                for span in line['spans']:
                    kind = span.get('type')
                    if kind == 'text':
                        fragments.append(span['content'])
                    elif kind == 'inline_equation':
                        fragments.append(f" ${span['content']}$ ")
                    elif kind == 'displayed_equation':
                        fragments.append(f"$$\n{span['content']}\n$$ ")
                    elif kind == 'image':
                        fragments.append(f"![]({span['image_path']}) ")
            paragraphs.append(''.join(fragments))

    return '\n\n'.join(paragraphs)