"template/falcon-instruct.json" did not exist on "23ebbaa46ead40c44c20b707b0e53d954ea51dc5"
ocr_mkcontent.py 6.13 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
from magic_pdf.libs.commons import s3_image_save_path, join_path
赵小蒙's avatar
赵小蒙 committed
2
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
赵小蒙's avatar
赵小蒙 committed
3
4
5
from magic_pdf.libs.ocr_content_type import ContentType


赵小蒙's avatar
赵小蒙 committed
6
def ocr_mk_nlp_markdown(pdf_info_dict: dict):
    """Render the text content of every page as one markdown string.

    Walks each page's ``"preproc_blocks"`` list, then every line of every
    block, and emits one markdown line per source line.  Spans with no
    (or empty) ``content`` are skipped, so image-only spans never appear
    in the output.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts.  Pages
            with a missing or empty ``"preproc_blocks"`` entry are ignored.

    Returns:
        The rendered lines joined with ``"\\n"``; an empty string when no
        line was rendered.
    """
    rendered_lines = []

    for page in pdf_info_dict.values():
        page_blocks = page.get("preproc_blocks")
        if not page_blocks:
            continue
        for blk in page_blocks:
            for ln in blk['lines']:
                pieces = []
                for sp in ln['spans']:
                    raw = sp.get('content')
                    if not raw:
                        continue
                    # Escape special markdown characters.
                    piece = ocr_escape_special_markdown_char(raw)
                    kind = sp['type']
                    if kind == ContentType.InlineEquation:
                        piece = f"${piece}$"
                    elif kind == ContentType.InterlineEquation:
                        piece = f"$$\n{piece}\n$$"
                    pieces.append(piece + ' ')
                # Two trailing spaces force a markdown line break.
                rendered_lines.append(''.join(pieces).strip() + '  ')
    return '\n'.join(rendered_lines)
28

赵小蒙's avatar
赵小蒙 committed
29

赵小蒙's avatar
赵小蒙 committed
30
def ocr_mk_mm_markdown(pdf_info_dict: dict):
    """Render text and image spans of every page as one markdown string.

    Walks each page's ``"preproc_blocks"`` list, then every line of every
    block, and emits one markdown line per source line.  A span with
    non-empty ``content`` is rendered as (escaped) text or as an equation;
    a span without content but with a non-empty ``image_path`` is rendered
    as a markdown image link; any other span is skipped.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts.  Pages
            with a missing or empty ``"preproc_blocks"`` entry are ignored.

    Returns:
        The rendered lines joined with ``"\\n"``; an empty string when no
        line was rendered.
    """
    rendered_lines = []

    for page in pdf_info_dict.values():
        page_blocks = page.get("preproc_blocks")
        if not page_blocks:
            continue
        for blk in page_blocks:
            for ln in blk['lines']:
                pieces = []
                for sp in ln['spans']:
                    if sp.get('content'):
                        # Escape special markdown characters.
                        piece = ocr_escape_special_markdown_char(sp['content'])
                        if sp['type'] == ContentType.InlineEquation:
                            piece = f"${piece}$"
                        elif sp['type'] == ContentType.InterlineEquation:
                            piece = f"$$\n{piece}\n$$"
                    elif sp.get('image_path'):
                        piece = f"![]({join_path(s3_image_save_path, sp['image_path'])})"
                    else:
                        # Neither text nor image: nothing to render.
                        continue
                    pieces.append(piece + ' ')
                # Two trailing spaces force a markdown line break.
                rendered_lines.append(''.join(pieces).strip() + '  ')
    return '\n'.join(rendered_lines)
56

57

赵小蒙's avatar
赵小蒙 committed
58
def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
    """Render every paragraph of every page as one markdown string.

    Reads each page's ``"para_blocks"`` list; each paragraph is an iterable
    of lines, and each line carries a ``'spans'`` list.  Span content is
    concatenated per paragraph according to the span's ``'type'``; spans of
    any other type contribute nothing.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts.  Pages
            with a missing or empty ``"para_blocks"`` entry are ignored.

    Returns:
        The paragraph texts joined with a blank line (``"\\n\\n"``).
    """
    paragraphs = []
    for page in pdf_info_dict.values():
        page_paras = page.get("para_blocks")
        if not page_paras:
            continue
        for para in page_paras:
            fragments = []
            for ln in para:
                for sp in ln['spans']:
                    kind = sp.get('type')
                    if kind == ContentType.Text:
                        fragments.append(sp['content'])
                    elif kind == ContentType.InlineEquation:
                        fragments.append(f" ${sp['content']}$ ")
                    elif kind == ContentType.InterlineEquation:
                        fragments.append(f"$$\n{sp['content']}\n$$ ")
                    elif kind == ContentType.Image:
                        fragments.append(
                            f"![]({join_path(s3_image_save_path, sp['image_path'])})"
                        )
            paragraphs.append(''.join(fragments))

    return '\n\n'.join(paragraphs)
80
81


赵小蒙's avatar
赵小蒙 committed
82
83
84
85
86
87
88
89
90
91
92
93
94
def make_standard_format_with_para(pdf_info_dict: dict):
    """Convert every paragraph line of every page to a standard-format entry.

    Each line found under a page's ``"para_blocks"`` is passed to
    ``line_to_standard_format`` and the results are collected in page,
    paragraph, line order.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts.  Pages
            with a missing or empty ``"para_blocks"`` entry are ignored.

    Returns:
        A list with one entry per line, as returned by
        ``line_to_standard_format``.
    """
    entries = []
    for page in pdf_info_dict.values():
        # A missing/empty "para_blocks" yields nothing to iterate.
        for para in page.get("para_blocks") or ():
            entries.extend(line_to_standard_format(ln) for ln in para)
    return entries


赵小蒙's avatar
赵小蒙 committed
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def line_to_standard_format(line):
    """Convert one line (a dict with a ``'spans'`` list) to a content entry.

    Spans are scanned in order.  The first image span, table span (both
    identified by an empty ``content`` plus a non-empty ``image_path``) or
    interline-equation span ends the scan and is returned on its own; any
    text gathered before it is not included in that result.  Otherwise the
    text and inline-equation spans are concatenated into a single text
    entry.

    Args:
        line: Dict holding the line's spans under ``'spans'``.

    Returns:
        One of:
        ``{'type': 'image', 'img_path': ...}``,
        ``{'type': 'table', 'img_path': ...}``,
        ``{'type': 'equation', 'latex': ...}``, or
        ``{'type': 'text', 'text': ..., 'inline_equation_num': ...}``.
    """
    text_parts = []
    equation_count = 0
    for sp in line['spans']:
        if sp.get('content'):
            kind = sp['type']
            if kind == ContentType.InterlineEquation:
                # Escape special markdown characters.
                latex = ocr_escape_special_markdown_char(sp['content'])
                return {
                    'type': 'equation',
                    'latex': f"$$\n{latex}\n$$"
                }
            if kind == ContentType.InlineEquation:
                # Escape special markdown characters.
                escaped = ocr_escape_special_markdown_char(sp['content'])
                text_parts.append(f"${escaped}$")
                equation_count += 1
            elif kind == ContentType.Text:
                # Escape special markdown characters.
                text_parts.append(ocr_escape_special_markdown_char(sp['content']))
            continue

        if not sp.get('image_path'):
            # No content and no image: nothing to take from this span.
            continue

        kind = sp['type']
        if kind == ContentType.Image:
            return {
                'type': 'image',
                'img_path': join_path(s3_image_save_path, sp['image_path'])
            }
        if kind == ContentType.Table:
            return {
                'type': 'table',
                'img_path': join_path(s3_image_save_path, sp['image_path'])
            }

    return {
        'type': 'text',
        'text': ''.join(text_parts),
        'inline_equation_num': equation_count
    }


def ocr_mk_mm_standard_format(pdf_info_dict: dict):
    """Convert every preprocessed line of every page to a standard-format entry.

    Each line found under a page's ``"preproc_blocks"`` is passed to
    ``line_to_standard_format`` and the results are collected in page,
    block, line order.

    Fields of a content_list entry, as documented by the original author:

        type         string      image/text/table/equation (interline
                                 equations are emitted on their own,
                                 inline ones are merged into text)
        latex        string      LaTeX text field.
        text         string      Text data in plain-text form.
        md           string      Text data in markdown form.
        img_path     string      s3://full/path/to/img.jpg

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts.  Pages
            with a missing or empty ``"preproc_blocks"`` entry are ignored.

    Returns:
        A list with one entry per line, as returned by
        ``line_to_standard_format``.
    """
    entries = []
    for page in pdf_info_dict.values():
        # A missing/empty "preproc_blocks" yields nothing to iterate.
        for blk in page.get("preproc_blocks") or ():
            entries.extend(line_to_standard_format(ln) for ln in blk['lines'])
    return entries