ocr_mkcontent.py 9.6 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
from magic_pdf.libs.commons import s3_image_save_path, join_path
2
from magic_pdf.libs.language import detect_lang
赵小蒙's avatar
赵小蒙 committed
3
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
赵小蒙's avatar
赵小蒙 committed
4
from magic_pdf.libs.ocr_content_type import ContentType
5
6
7
8
9
10
11
12
13
14
15
16
17
import wordninja
import re


def split_long_words(text):
    """Break up over-long word-character runs in *text* using ``wordninja``.

    Every maximal run of word characters (``\\w``) longer than 15 characters
    is replaced by the space-joined pieces returned by ``wordninja.split``.
    Everything else -- shorter words, punctuation and all whitespace -- is
    left exactly as it was.

    Args:
        text: The string to process.

    Returns:
        The text with each long run replaced by space-separated words.
    """

    def _split_run(match):
        run = match.group()
        pieces = wordninja.split(run)
        # Keep the original run if the splitter yields nothing, so text is
        # never silently dropped (presumably possible for non-ASCII runs --
        # TODO confirm against wordninja).
        return ' '.join(pieces) if pieces else run

    # Substituting in place (instead of tokenising and re-joining) preserves
    # tabs, newlines and repeated spaces between words.
    return re.sub(r'\w{16,}', _split_run, text)
赵小蒙's avatar
赵小蒙 committed
18
19


赵小蒙's avatar
赵小蒙 committed
20
def ocr_mk_nlp_markdown(pdf_info_dict: dict):
    """Render the content-bearing spans of every page as one markdown string.

    Walks ``preproc_blocks`` -> ``lines`` -> ``spans`` of each page. Spans
    without ``content`` are skipped; the remaining spans are markdown-escaped,
    inline equations are wrapped in ``$...$`` and interline equations in
    ``$$...$$``.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts; pages with no
            (or empty) ``preproc_blocks`` are ignored.

    Returns:
        One output line per source line, joined with ``'\\n'``. Every output
        line ends with two spaces.
    """
    markdown = []

    for page_info in pdf_info_dict.values():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                pieces = []
                for span in line['spans']:
                    if not span.get('content'):
                        continue
                    # Escape special markdown characters.
                    content = ocr_escape_special_markdown_char(span['content'])
                    if span['type'] == ContentType.InlineEquation:
                        content = f"${content}$"
                    elif span['type'] == ContentType.InterlineEquation:
                        content = f"$$\n{content}\n$$"
                    pieces.append(content)
                # Two trailing spaces force a markdown line break.
                markdown.append(' '.join(pieces).strip() + '  ')
    return '\n'.join(markdown)
42

赵小蒙's avatar
赵小蒙 committed
43

赵小蒙's avatar
赵小蒙 committed
44
def ocr_mk_mm_markdown(pdf_info_dict: dict):
    """Render every page as markdown, including image links.

    Walks ``preproc_blocks`` -> ``lines`` -> ``spans`` of each page. A span
    with ``content`` is markdown-escaped (inline equations wrapped in
    ``$...$``, interline equations in ``$$...$$``). A span without
    ``content`` but with an ``image_path`` becomes a markdown image link
    rooted at ``s3_image_save_path``. Spans with neither are skipped.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts; pages with no
            (or empty) ``preproc_blocks`` are ignored.

    Returns:
        One output line per source line, joined with ``'\\n'``. Every output
        line ends with two spaces.
    """
    markdown = []

    for page_info in pdf_info_dict.values():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                pieces = []
                for span in line['spans']:
                    if span.get('content'):
                        # Escape special markdown characters.
                        content = ocr_escape_special_markdown_char(span['content'])
                        if span['type'] == ContentType.InlineEquation:
                            content = f"${content}$"
                        elif span['type'] == ContentType.InterlineEquation:
                            content = f"$$\n{content}\n$$"
                    elif span.get('image_path'):
                        content = f"![]({join_path(s3_image_save_path, span['image_path'])})"
                    else:
                        # Nothing renderable in this span.
                        continue
                    pieces.append(content)
                # Two trailing spaces force a markdown line break.
                markdown.append(' '.join(pieces).strip() + '  ')
    return '\n'.join(markdown)
70

71

赵小蒙's avatar
赵小蒙 committed
72
def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
    """Render the paragraph blocks of all pages as markdown in ``"mm"`` mode.

    Each page's ``para_blocks`` is rendered by
    ``ocr_mk_mm_markdown_with_para_core`` with mode ``"mm"``.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts.

    Returns:
        The rendered paragraphs of all pages joined with blank lines
        (``'\\n\\n'``).
    """
    markdown = []
    for page_info in pdf_info_dict.values():
        paras_of_layout = page_info.get("para_blocks")
        # A page without paragraph data contributes nothing; without this
        # guard a missing key would hand None to the core renderer.
        if not paras_of_layout:
            continue
        markdown.extend(ocr_mk_mm_markdown_with_para_core(paras_of_layout, "mm"))
    return '\n\n'.join(markdown)
79
80


81
82
83
84
85
86
87
88
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: dict):
    """Render the paragraph blocks of all pages as markdown in ``"nlp"`` mode.

    Each page's ``para_blocks`` is rendered by
    ``ocr_mk_mm_markdown_with_para_core`` with mode ``"nlp"``.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts.

    Returns:
        The rendered paragraphs of all pages joined with blank lines
        (``'\\n\\n'``).
    """
    markdown = []
    for page_info in pdf_info_dict.values():
        paras_of_layout = page_info.get("para_blocks")
        # A page without paragraph data contributes nothing; without this
        # guard a missing key would hand None to the core renderer.
        if not paras_of_layout:
            continue
        markdown.extend(ocr_mk_mm_markdown_with_para_core(paras_of_layout, "nlp"))
    return '\n\n'.join(markdown)

89
90
91
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
    """Render paragraph blocks as ``"mm"`` markdown, one entry per page.

    Args:
        pdf_info_dict: Mapping from a page key to a per-page dict. Pages with
            no (or empty) ``para_blocks`` are omitted from the result.

    Returns:
        A list of dicts with keys ``'page_no'`` (the page's key in
        ``pdf_info_dict``) and ``'md_content'`` (that page's paragraphs
        joined with blank lines).
    """
    markdown_with_para_and_pagination = []
    for page_no, page_info in pdf_info_dict.items():
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout, "mm")
        markdown_with_para_and_pagination.append({
            'page_no': page_no,
            'md_content': '\n\n'.join(page_markdown),
        })
    return markdown_with_para_and_pagination


103
def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
    """Render one page's paragraph blocks into a list of markdown paragraphs.

    Args:
        paras_of_layout: Iterable of paragraph groups; each group is an
            iterable of paragraphs, each paragraph an iterable of line dicts
            carrying a ``'spans'`` list.
        mode: ``'mm'`` emits image links for image and table spans; with any
            other value (e.g. ``'nlp'``) those spans produce no output.

    Returns:
        A list with one string per non-blank paragraph, each stripped and
        terminated with two spaces.
    """
    page_markdown = []
    for paras in paras_of_layout:
        for para in paras:
            para_text = ''
            for line in para:
                for span in line['spans']:
                    span_type = span.get('type')
                    content = ''
                    language = ''
                    if span_type == ContentType.Text:
                        content = span['content']
                        language = detect_lang(content)
                        # Only English text gets long-word splitting;
                        # splitting Chinese text would lose characters.
                        if language == 'en':
                            content = split_long_words(content)
                        content = ocr_escape_special_markdown_char(content)
                    elif span_type == ContentType.InlineEquation:
                        content = f"${ocr_escape_special_markdown_char(span['content'])}$"
                    elif span_type == ContentType.InterlineEquation:
                        content = f"\n$$\n{ocr_escape_special_markdown_char(span['content'])}\n$$\n"
                    elif span_type in [ContentType.Image, ContentType.Table]:
                        if mode == 'mm':
                            content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
                    if content != '':
                        if language == 'en':
                            # English context: pieces are separated by a space.
                            para_text += content + ' '
                        else:
                            # Chinese context: pieces are concatenated directly.
                            # NOTE(review): non-text spans always land here
                            # because language stays '' for them -- confirm
                            # that is intended for equations in English text.
                            para_text += content
            if para_text.strip() == '':
                continue
            page_markdown.append(para_text.strip() + '  ')
    return page_markdown


141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def para_to_standard_format(para):
    """Convert one paragraph (a list of line dicts) to a content dict.

    A single-line paragraph is delegated to ``line_to_standard_format``.
    A multi-line paragraph is flattened into one ``'text'`` item built from
    its text and inline-equation spans, each followed by a space.

    Args:
        para: Sequence of line dicts, each carrying a ``'spans'`` list.

    Returns:
        ``{}`` for an empty paragraph; the result of
        ``line_to_standard_format`` for a single line; otherwise a dict with
        keys ``'type'`` (``'text'``), ``'text'`` and ``'inline_equation_num'``.
    """
    if not para:
        return {}
    if len(para) == 1:
        return line_to_standard_format(para[0])

    para_text = ''
    inline_equation_num = 0
    for line in para:
        for span in line['spans']:
            span_type = span.get('type')
            if span_type == ContentType.Text:
                content = ocr_escape_special_markdown_char(split_long_words(span['content']))
            elif span_type == ContentType.InlineEquation:
                content = f"${ocr_escape_special_markdown_char(span['content'])}$"
                inline_equation_num += 1
            else:
                # Other span types have no text form here. Skipping them
                # avoids appending a stale or unbound ``content``.
                continue
            para_text += content + ' '
    return {
        'type': 'text',
        'text': para_text,
        'inline_equation_num': inline_equation_num,
    }

赵小蒙's avatar
赵小蒙 committed
164
165
166
def make_standard_format_with_para(pdf_info_dict: dict):
    """Collect a content dict for every paragraph of every page.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts; pages with no
            (or empty) ``para_blocks`` are ignored.

    Returns:
        A flat list holding the ``para_to_standard_format`` result for each
        paragraph, in page and paragraph order.
    """
    content_list = []
    for page_info in pdf_info_dict.values():
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        for paras in paras_of_layout:
            for para in paras:
                content_list.append(para_to_standard_format(para))
    return content_list


赵小蒙's avatar
赵小蒙 committed
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
def line_to_standard_format(line):
    """Convert one line dict into a single content dict.

    Spans are scanned in order. The first image span, table span or interline
    equation encountered ends the scan and is returned on its own (any text
    gathered before it is discarded). Otherwise the text and inline-equation
    spans are concatenated into one ``'text'`` item.

    Args:
        line: Dict carrying a ``'spans'`` list.

    Returns:
        One of:
        ``{'type': 'image' | 'table', 'img_path': ...}``,
        ``{'type': 'equation', 'latex': ...}``, or
        ``{'type': 'text', 'text': ..., 'inline_equation_num': ...}``.
    """
    line_text = ""
    inline_equation_num = 0
    for span in line['spans']:
        if not span.get('content'):
            if not span.get('image_path'):
                continue
            if span['type'] == ContentType.Image:
                return {
                    'type': 'image',
                    'img_path': join_path(s3_image_save_path, span['image_path']),
                }
            if span['type'] == ContentType.Table:
                return {
                    'type': 'table',
                    'img_path': join_path(s3_image_save_path, span['image_path']),
                }
            # An image_path on any other span type is ignored.
            continue

        if span['type'] == ContentType.InterlineEquation:
            # Escape special markdown characters.
            interline_equation = ocr_escape_special_markdown_char(span['content'])
            return {
                'type': 'equation',
                'latex': f"$$\n{interline_equation}\n$$",
            }
        elif span['type'] == ContentType.InlineEquation:
            inline_equation = ocr_escape_special_markdown_char(span['content'])
            line_text += f"${inline_equation}$"
            inline_equation_num += 1
        elif span['type'] == ContentType.Text:
            line_text += ocr_escape_special_markdown_char(span['content'])

    return {
        'type': 'text',
        'text': line_text,
        'inline_equation_num': inline_equation_num,
    }


def ocr_mk_mm_standard_format(pdf_info_dict: dict):
    """Build the standard-format content list, one item per line.

    Every line under each page's ``preproc_blocks`` is converted with
    ``line_to_standard_format`` and appended to the result. Pages with no
    (or empty) ``preproc_blocks`` are ignored.

    Fields of a content_list item:
    type         string      image/text/table/equation (interline equations
                             are emitted as separate items; inline equations
                             are merged into text)
    latex        string      LaTeX text field.
    text         string      Plain-text data.
    md           string      Markdown-formatted text data (listed in the
                             schema; not set by the converter visible here).
    img_path     string      s3://full/path/to/img.jpg
    """
    content_list = []
    for page_info in pdf_info_dict.values():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                content_list.append(line_to_standard_format(line))
    return content_list