ocr_mkcontent.py 12.9 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
from loguru import logger

3
from magic_pdf.libs.commons import join_path
4
from magic_pdf.libs.language import detect_lang
赵小蒙's avatar
赵小蒙 committed
5
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
赵小蒙's avatar
赵小蒙 committed
6
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
7
8
9
10
11
12
13
14
15
16
17
18
19
import wordninja
import re


def split_long_words(text):
    """Break over-long word tokens in *text* into shorter words.

    The text is split on single spaces. Each resulting segment is tokenised
    into runs of word characters and single punctuation characters; any token
    longer than 15 characters is replaced by the space-joined result of
    ``wordninja.split``. Tokens are then glued back together without a
    separator, and the segments are re-joined with single spaces.

    Note that whitespace other than ' ' inside a segment (tabs, newlines) is
    not captured by the token pattern and is therefore dropped.
    """

    def _rebuild(segment):
        tokens = re.findall(r'\w+|[^\w\s]', segment, re.UNICODE)
        return ''.join(
            ' '.join(wordninja.split(token)) if len(token) > 15 else token
            for token in tokens
        )

    return ' '.join(_rebuild(segment) for segment in text.split(' '))
赵小蒙's avatar
赵小蒙 committed
20
21


赵小蒙's avatar
赵小蒙 committed
22
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
    """Build one "mm"-mode markdown string for a whole document.

    Args:
        pdf_info_list: iterable of per-page dicts; each page's paragraph
            blocks are read from its ``"para_blocks"`` key.
        img_buket_path: prefix forwarded to the core renderer for building
            image paths.

    Returns:
        The markdown entries of all pages joined with a blank line
        (``'\\n\\n'``). An empty string when no page yields any entry.
    """
    markdown = []
    for page_info in pdf_info_list:
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            # A page without paragraph blocks contributes nothing; skipping it
            # also avoids handing None to the core renderer, which iterates
            # its input.
            continue
        markdown.extend(
            ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
        )
    return '\n\n'.join(markdown)
29
30


赵小蒙's avatar
赵小蒙 committed
31
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
    """Build one "nlp"-mode markdown string for a whole document.

    Args:
        pdf_info_dict: iterable of per-page dicts (a list, despite the name);
            each page's paragraph blocks are read from ``"para_blocks"``.

    Returns:
        The markdown entries of all pages joined with a blank line
        (``'\\n\\n'``). An empty string when no page yields any entry.
    """
    markdown = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            # Nothing to render for this page; also keeps None away from the
            # core renderer, which iterates its input.
            continue
        markdown.extend(ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp"))
    return '\n\n'.join(markdown)

赵小蒙's avatar
赵小蒙 committed
39

赵小蒙's avatar
赵小蒙 committed
40
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path):
    """Render "mm"-mode markdown page by page.

    Args:
        pdf_info_dict: iterable of per-page dicts (a list, despite the name);
            each page's paragraph blocks are read from ``"para_blocks"``.
        img_buket_path: prefix forwarded to the core renderer for building
            image paths.

    Returns:
        A list of ``{'page_no': int, 'md_content': str}`` dicts, one per page
        that has paragraph blocks. ``md_content`` is that page's markdown
        entries joined with ``'\\n\\n'``.
    """
    markdown_with_para_and_pagination = []
    # page_no only advances for pages that are actually emitted, so it is a
    # running index over non-empty pages rather than the position of the page
    # in the input.
    page_no = 0
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
        markdown_with_para_and_pagination.append({
            'page_no': page_no,
            'md_content': '\n\n'.join(page_markdown)
        })
        page_no += 1
    return markdown_with_para_and_pagination


赵小蒙's avatar
赵小蒙 committed
56
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
    """Render nested layout paragraphs to a list of markdown strings.

    Args:
        paras_of_layout: iterable of layouts; each layout is an iterable of
            paragraphs, each paragraph an iterable of line dicts carrying a
            ``'spans'`` list.
        mode: ``'mm'`` emits image/table spans as ``![](path)`` links;
            ``'nlp'`` (or any other value) leaves them out.
        img_buket_path: prefix joined with a span's ``'image_path'`` in
            ``'mm'`` mode.

    Returns:
        One string per non-blank paragraph: the stripped paragraph text
        followed by two spaces.
    """
    page_markdown = []
    for paras in paras_of_layout:
        for para in paras:
            pieces = []
            for line in para:
                for span in line['spans']:
                    span_type = span.get('type')
                    content = ''
                    language = ''
                    if span_type == ContentType.Text:
                        content = span['content']
                        language = detect_lang(content)
                        if language == 'en':
                            # Only English text gets long-word splitting;
                            # splitting Chinese text would lose content.
                            content = ocr_escape_special_markdown_char(split_long_words(content))
                        else:
                            content = ocr_escape_special_markdown_char(content)
                    elif span_type == ContentType.InlineEquation:
                        content = f"${span['content']}$"
                    elif span_type == ContentType.InterlineEquation:
                        content = f"\n$$\n{span['content']}\n$$\n"
                    elif span_type in [ContentType.Image, ContentType.Table]:
                        if mode == 'mm':
                            content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                        # Any other mode (e.g. 'nlp') emits nothing for
                        # images and tables.

                    if content == '':
                        continue
                    # English spans are separated by a space; other languages
                    # (e.g. Chinese) and non-text spans are concatenated
                    # directly.
                    pieces.append(content + ' ' if language == 'en' else content)

            para_text = ''.join(pieces).strip()
            if para_text == '':
                continue
            page_markdown.append(para_text + '  ')
    return page_markdown


赵小蒙's avatar
赵小蒙 committed
94
95
def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
    """Render one page's paragraph blocks to a list of markdown strings.

    Args:
        paras_of_layout: iterable of paragraph-block dicts, each with a
            ``'type'`` key. Text, title and interline-equation blocks carry
            ``'lines'``; image and table blocks carry nested ``'blocks'``.
            ``None`` or an empty iterable yields an empty result.
        mode: ``'mm'`` renders image/table blocks as ``![](path)`` links
            followed by their caption/footnote text; ``'nlp'`` skips image and
            table blocks. Any other value produces no output for them.
        img_buket_path: prefix joined with a span's ``'image_path'``.

    Returns:
        One string per block that produced non-blank text: the stripped text
        followed by two spaces.
    """
    page_markdown = []
    if not paras_of_layout:
        # Callers obtain this value via page_info.get("para_blocks"), which is
        # None when the key is missing.
        return page_markdown

    for para_block in paras_of_layout:
        para_text = ''
        para_type = para_block['type']

        if para_type in (BlockType.Text, BlockType.InterlineEquation):
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Title:
            para_text = f"# {merge_para_with_text(para_block)}"
        elif para_type == BlockType.Image:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                # First pass: the image itself. If several image spans exist,
                # the last one wins (each assignment overwrites para_text).
                for block in para_block['blocks']:
                    if block['type'] == BlockType.ImageBody:
                        for line in block['lines']:
                            for span in line['spans']:
                                if span['type'] == ContentType.Image:
                                    para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                # Second pass: captions are appended after the image link.
                for block in para_block['blocks']:
                    if block['type'] == BlockType.ImageCaption:
                        para_text += merge_para_with_text(block)
        elif para_type == BlockType.Table:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                # First pass: the table image (last matching span wins).
                for block in para_block['blocks']:
                    if block['type'] == BlockType.TableBody:
                        for line in block['lines']:
                            for span in line['spans']:
                                if span['type'] == ContentType.Table:
                                    para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                # Second pass: captions and footnotes, in block order.
                for block in para_block['blocks']:
                    if block['type'] in (BlockType.TableCaption, BlockType.TableFootnote):
                        para_text += merge_para_with_text(block)

        stripped = para_text.strip()
        if stripped == '':
            # Unknown block types and blocks with no content emit nothing.
            continue
        page_markdown.append(stripped + '  ')

    return page_markdown


赵小蒙's avatar
赵小蒙 committed
142
def merge_para_with_text(para_block):
    """Concatenate the spans of a block's lines into one markdown string.

    Args:
        para_block: dict with a ``'lines'`` list; each line is a dict with a
            ``'spans'`` list, and each span has a ``'type'`` and (for text and
            equation spans) a ``'content'``.

    Returns:
        The merged text. Text spans are markdown-escaped, inline equations are
        wrapped in ``$...$`` and interline equations in ``$$`` fences. Spans
        of any other type contribute nothing.
    """
    pieces = []
    for line in para_block['lines']:
        for span in line['spans']:
            span_type = span['type']
            content = ''
            language = ''
            if span_type == ContentType.Text:
                content = span['content']
                language = detect_lang(content)
                if language == 'en':
                    # Only English text gets long-word splitting; splitting
                    # Chinese text would lose content.
                    content = ocr_escape_special_markdown_char(split_long_words(content))
                else:
                    content = ocr_escape_special_markdown_char(content)
            elif span_type == ContentType.InlineEquation:
                content = f"${span['content']}$"
            elif span_type == ContentType.InterlineEquation:
                content = f"\n$$\n{span['content']}\n$$\n"

            if content == '':
                continue
            # English spans are separated by a space; other languages (e.g.
            # Chinese) and equation spans are concatenated directly.
            pieces.append(content + ' ' if language == 'en' else content)
    return ''.join(pieces)


169
def para_to_standard_format(para, img_buket_path):
    """Convert a paragraph (a sequence of line dicts) to a content dict.

    Args:
        para: sequence of line dicts, each with a ``'spans'`` list.
        img_buket_path: prefix forwarded to ``line_to_standard_format`` for
            single-line paragraphs.

    Returns:
        ``{}`` for an empty paragraph; the result of
        ``line_to_standard_format`` for a single-line paragraph; otherwise a
        ``{'type': 'text', 'text': ..., 'inline_equation_num': ...}`` dict
        built from the text and inline-equation spans of all lines.
    """
    para_content = {}
    if len(para) == 1:
        para_content = line_to_standard_format(para[0], img_buket_path)
    elif len(para) > 1:
        pieces = []
        inline_equation_num = 0
        for line in para:
            for span in line['spans']:
                language = ''
                span_type = span.get('type')
                content = ""
                if span_type == ContentType.Text:
                    content = span['content']
                    language = detect_lang(content)
                    if language == 'en':
                        # Only English text gets long-word splitting;
                        # splitting Chinese text would lose content.
                        content = ocr_escape_special_markdown_char(split_long_words(content))
                    else:
                        content = ocr_escape_special_markdown_char(content)
                elif span_type == ContentType.InlineEquation:
                    content = f"${span['content']}$"
                    inline_equation_num += 1

                # English spans are followed by a space; other languages
                # (e.g. Chinese) and equation spans are concatenated directly.
                pieces.append((content + ' ') if language == 'en' else content)
        para_content = {
            'type': 'text',
            'text': ''.join(pieces),
            'inline_equation_num': inline_equation_num
        }
    return para_content

赵小蒙's avatar
赵小蒙 committed
203

赵小蒙's avatar
赵小蒙 committed
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
def para_to_standard_format_v2(para_block, img_buket_path):
    """Convert one paragraph block to a standard-format content dict.

    Args:
        para_block: dict with a ``'type'`` key. Text, title and
            interline-equation blocks are merged via ``merge_para_with_text``;
            image and table blocks carry nested ``'blocks'``.
        img_buket_path: prefix joined with the ``'image_path'`` of the first
            span of the first line of an image/table body block.

    Returns:
        A dict whose ``'type'`` is ``'text'``, ``'equation'``, ``'image'`` or
        ``'table'`` plus the type-specific fields, or ``{}`` when the block
        type is not one of the handled types.
    """
    para_type = para_block['type']
    # Start from an empty dict so an unrecognised block type returns {}
    # instead of raising UnboundLocalError at the return statement.
    para_content = {}
    if para_type == BlockType.Text:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
        }
    elif para_type == BlockType.Title:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
            'text_level': 1
        }
    elif para_type == BlockType.InterlineEquation:
        para_content = {
            'type': 'equation',
            'text': merge_para_with_text(para_block),
            'text_format': "latex"
        }
    elif para_type == BlockType.Image:
        para_content = {
            'type': 'image',
        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.ImageBody:
                # Only the first span of the first line is consulted.
                para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
            if block['type'] == BlockType.ImageCaption:
                para_content['img_caption'] = merge_para_with_text(block)
    elif para_type == BlockType.Table:
        para_content = {
            'type': 'table',
        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.TableBody:
                # Only the first span of the first line is consulted.
                para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
            if block['type'] == BlockType.TableCaption:
                para_content['table_caption'] = merge_para_with_text(block)
            if block['type'] == BlockType.TableFootnote:
                para_content['table_footnote'] = merge_para_with_text(block)

    return para_content


赵小蒙's avatar
赵小蒙 committed
247
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
    """Collect standard-format content dicts for every paragraph block.

    Args:
        pdf_info_dict: iterable of per-page dicts (a list, despite the name);
            each page's paragraph blocks are read from ``"para_blocks"``.
        img_buket_path: prefix forwarded to ``para_to_standard_format_v2``.

    Returns:
        A flat list with one entry per paragraph block, in page order. Pages
        whose ``"para_blocks"`` is missing or empty are skipped.
    """
    content_list = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        content_list.extend(
            para_to_standard_format_v2(para_block, img_buket_path)
            for para_block in paras_of_layout
        )
    return content_list


259
def line_to_standard_format(line, img_buket_path):
    """Convert one line dict to a standard-format content dict.

    Spans are scanned in order. The first image span, table span or interline
    equation span ends the scan and is returned on its own; any text gathered
    before it is discarded. Otherwise text and inline-equation spans are
    accumulated into a single text entry.

    Args:
        line: dict with a ``'spans'`` list.
        img_buket_path: prefix joined with a span's ``'image_path'``.

    Returns:
        One of ``{'type': 'image', 'img_path': ...}``,
        ``{'type': 'table', 'img_path': ...}``,
        ``{'type': 'equation', 'latex': ...}`` or
        ``{'type': 'text', 'text': ..., 'inline_equation_num': ...}``.
    """
    line_text = ""
    inline_equation_num = 0
    for span in line['spans']:
        if not span.get('content'):
            # No textual content: the span is only relevant if it points at an
            # image file.
            if not span.get('image_path'):
                continue
            span_type = span['type']
            if span_type == ContentType.Image:
                return {
                    'type': 'image',
                    'img_path': join_path(img_buket_path, span['image_path'])
                }
            if span_type == ContentType.Table:
                return {
                    'type': 'table',
                    'img_path': join_path(img_buket_path, span['image_path'])
                }
            continue

        span_type = span['type']
        if span_type == ContentType.InterlineEquation:
            interline_equation = span['content']
            return {
                'type': 'equation',
                'latex': f"$$\n{interline_equation}\n$$"
            }
        if span_type == ContentType.InlineEquation:
            inline_equation = span['content']
            line_text += f"${inline_equation}$"
            inline_equation_num += 1
        elif span_type == ContentType.Text:
            # Escape markdown special characters in plain text.
            line_text += ocr_escape_special_markdown_char(span['content'])

    return {
        'type': 'text',
        'text': line_text,
        'inline_equation_num': inline_equation_num
    }


赵小蒙's avatar
赵小蒙 committed
302
def ocr_mk_mm_standard_format(pdf_info_dict: list, img_buket_path=""):
    """Build a flat content list from each page's pre-processed blocks.

    Every line of every block under a page's ``"preproc_blocks"`` key is
    converted with ``line_to_standard_format``. Pages where that key is
    missing or empty are skipped.

    Fields that may appear in the entries of the content list:

    type         string      image/text/table/equation (interline equations
                             are emitted as separate entries; inline ones are
                             merged into the text)
    latex        string      LaTeX text field.
    text         string      Plain-text data.
    md           string      Markdown-formatted text data.
    img_path     string      s3://full/path/to/img.jpg

    Args:
        pdf_info_dict: iterable of per-page dicts (a list, despite the name).
        img_buket_path: prefix forwarded to ``line_to_standard_format`` for
            building image paths. Defaults to ``""`` so existing one-argument
            calls keep working; presumably callers wanting full image paths
            should pass the real prefix (verify against ``join_path``).

    Returns:
        A list with one content dict per line, in page/block/line order.
    """
    content_list = []
    for page_info in pdf_info_dict:
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                # line_to_standard_format requires the image path prefix;
                # omitting it raised TypeError on every non-empty page.
                content = line_to_standard_format(line, img_buket_path)
                content_list.append(content)
    return content_list