ocr_mkcontent.py 15.5 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
from loguru import logger

赵小蒙's avatar
赵小蒙 committed
3
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
4
from magic_pdf.libs.commons import join_path
5
from magic_pdf.libs.language import detect_lang
赵小蒙's avatar
赵小蒙 committed
6
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
赵小蒙's avatar
赵小蒙 committed
7
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
8
9
10
11
12
13
14
import wordninja
import re


def split_long_words(text):
    """Segment over-long tokens in ``text`` using ``wordninja``.

    The text is split on single spaces. Each piece is tokenised into runs of
    word characters and individual non-word characters; any token longer than
    15 characters is replaced by the space-joined output of
    ``wordninja.split``. All other tokens are kept verbatim.

    Args:
        text: The string to process.

    Returns:
        The processed string, with the pieces re-joined by single spaces.
    """
    segments = text.split(' ')
    for idx, segment in enumerate(segments):
        tokens = re.findall(r'\w+|[^\w]', segment, re.UNICODE)
        segments[idx] = ''.join(
            ' '.join(wordninja.split(token)) if len(token) > 15 else token
            for token in tokens
        )
    return ' '.join(segments)
赵小蒙's avatar
赵小蒙 committed
21
22


赵小蒙's avatar
赵小蒙 committed
23
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
    """Build one markdown string for the whole document in "mm" mode.

    Args:
        pdf_info_list: Per-page dicts; each page's ``"para_blocks"`` entry is
            rendered with ``ocr_mk_markdown_with_para_core_v2``.
        img_buket_path: Prefix forwarded to the core renderer for image paths.

    Returns:
        All rendered paragraphs of all pages joined with blank lines.
    """
    markdown = []
    for page_info in pdf_info_list:
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            # A missing/empty "para_blocks" contributes nothing; skipping it
            # also avoids handing None to the core renderer.
            continue
        markdown.extend(
            ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
        )
    return '\n\n'.join(markdown)
30
31


赵小蒙's avatar
赵小蒙 committed
32
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
    """Build one markdown string for the whole document in "nlp" mode.

    Args:
        pdf_info_dict: Per-page dicts; each page's ``"para_blocks"`` entry is
            rendered with ``ocr_mk_markdown_with_para_core_v2``.

    Returns:
        All rendered paragraphs of all pages joined with blank lines.
    """
    markdown = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            # A missing/empty "para_blocks" contributes nothing; skipping it
            # also avoids handing None to the core renderer.
            continue
        markdown.extend(ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp"))
    return '\n\n'.join(markdown)

赵小蒙's avatar
赵小蒙 committed
40

赵小蒙's avatar
赵小蒙 committed
41
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path):
    """Render "mm" markdown page by page.

    Args:
        pdf_info_dict: Per-page dicts carrying a ``"para_blocks"`` entry.
        img_buket_path: Prefix forwarded to the core renderer for image paths.

    Returns:
        A list of ``{'page_no': int, 'md_content': str}`` dicts, one per page
        that has paragraph blocks.
    """
    markdown_with_para_and_pagination = []
    # NOTE(review): page_no advances only for pages that are emitted, so it is
    # the index among non-empty pages rather than the index in pdf_info_dict.
    # Kept as-is because callers may rely on it - confirm intent.
    page_no = 0
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
        markdown_with_para_and_pagination.append({
            'page_no': page_no,
            'md_content': '\n\n'.join(page_markdown),
        })
        page_no += 1
    return markdown_with_para_and_pagination


赵小蒙's avatar
赵小蒙 committed
57
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
    """Render nested layout paragraphs (layout -> para -> line -> span) to markdown.

    Args:
        paras_of_layout: Iterable of layouts; each layout is an iterable of
            paragraphs, each paragraph an iterable of line dicts with a
            ``'spans'`` list.
        mode: ``'mm'`` emits image/table spans as markdown image links; any
            other value (e.g. ``'nlp'``) drops them.
        img_buket_path: Prefix joined with each span's ``'image_path'``.

    Returns:
        A list with one markdown string per non-blank paragraph, each ending
        in two spaces.
    """
    page_markdown = []
    for paras in paras_of_layout:
        for para in paras:
            para_text = ''
            for line in para:
                for span in line['spans']:
                    span_type = span.get('type')
                    content = ''
                    language = ''
                    if span_type == ContentType.Text:
                        content = span['content']
                        language = detect_lang(content)
                        # Only split long words for English text; splitting
                        # Chinese text would lose content.
                        if language == 'en':
                            content = split_long_words(content)
                        content = ocr_escape_special_markdown_char(content)
                    elif span_type == ContentType.InlineEquation:
                        content = f"${span['content']}$"
                    elif span_type == ContentType.InterlineEquation:
                        content = f"\n$$\n{span['content']}\n$$\n"
                    elif span_type in [ContentType.Image, ContentType.Table]:
                        if mode == 'mm':
                            content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                    if content == '':
                        continue
                    if language == 'en':
                        # English context: separate pieces with a space.
                        para_text += content + ' '
                    else:
                        # Otherwise (e.g. Chinese) pieces are concatenated directly.
                        para_text += content
            stripped = para_text.strip()
            if stripped:
                page_markdown.append(stripped + '  ')
    return page_markdown


赵小蒙's avatar
赵小蒙 committed
95
96
def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
    """Render a page's paragraph blocks to a list of markdown strings.

    Args:
        paras_of_layout: Iterable of block dicts with a ``'type'`` key; text-like
            blocks carry ``'lines'``, image/table blocks carry ``'blocks'``.
        mode: ``'mm'`` renders image and table blocks; ``'nlp'`` skips them.
        img_buket_path: Prefix joined with each span's ``'image_path'``.

    Returns:
        One markdown string per block that produced non-blank text, each
        stripped and suffixed with two spaces.
    """
    page_markdown = []
    for para_block in paras_of_layout:
        para_text = ''
        para_type = para_block['type']
        if para_type in (BlockType.Text, BlockType.InterlineEquation):
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Title:
            para_text = f"# {merge_para_with_text(para_block)}"
        elif para_type == BlockType.Image:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                # 1st: image body
                for block in para_block['blocks']:
                    if block['type'] == BlockType.ImageBody:
                        for line in block['lines']:
                            for span in line['spans']:
                                if span['type'] == ContentType.Image:
                                    para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
                # 2nd: image caption
                for block in para_block['blocks']:
                    if block['type'] == BlockType.ImageCaption:
                        para_text += merge_para_with_text(block)
        elif para_type == BlockType.Table:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                # 1st: table caption
                for block in para_block['blocks']:
                    if block['type'] == BlockType.TableCaption:
                        para_text += merge_para_with_text(block)
                # 2nd: table body
                for block in para_block['blocks']:
                    if block['type'] == BlockType.TableBody:
                        for line in block['lines']:
                            for span in line['spans']:
                                if span['type'] == ContentType.Table:
                                    # A non-empty 'content' means the table model
                                    # produced text; otherwise fall back to the image.
                                    if span.get('content', ''):
                                        para_text += f"\n {span['content']}  \n"
                                    else:
                                        para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
                # 3rd: table footnote
                for block in para_block['blocks']:
                    if block['type'] == BlockType.TableFootnote:
                        para_text += merge_para_with_text(block)

        stripped = para_text.strip()
        if stripped:
            page_markdown.append(stripped + '  ')

    return page_markdown


赵小蒙's avatar
赵小蒙 committed
148
def merge_para_with_text(para_block):
    """Concatenate the text and equation spans of a block into one string.

    Args:
        para_block: Dict with a ``'lines'`` list; each line has a ``'spans'``
            list of dicts with ``'type'`` and ``'content'``.

    Returns:
        The merged text. Pieces are appended directly when the language
        detected for the whole line contains ``'zh'``; otherwise each piece is
        followed by a single space.
    """
    pieces = []
    for line in para_block['lines']:
        spans = line['spans']
        # Some documents put a single character in each span, and per-span
        # language detection is unreliable there, so the spacing decision is
        # made on the text of the whole line.
        line_text = ''.join(
            span['content'].strip()
            for span in spans
            if span['type'] == ContentType.Text
        )
        line_lang = detect_lang(line_text) if line_text != "" else ""

        for span in spans:
            span_type = span['type']
            content = ''
            if span_type == ContentType.Text:
                content = span['content']
                # Only split long words for English text; splitting Chinese
                # text would lose content.
                if detect_lang(content) == 'en':
                    content = split_long_words(content)
                content = ocr_escape_special_markdown_char(content)
            elif span_type == ContentType.InlineEquation:
                content = f"${span['content']}$"
            elif span_type == ContentType.InterlineEquation:
                content = f"\n$$\n{span['content']}\n$$\n"

            if content == '':
                continue
            pieces.append(content)
            if 'zh' not in line_lang:
                # Non-Chinese context: separate pieces with a space.
                pieces.append(' ')
    return ''.join(pieces)


182
def para_to_standard_format(para, img_buket_path):
    """Convert a paragraph (a list of line dicts) to a standard-format dict.

    Args:
        para: List of line dicts, each with a ``'spans'`` list.
        img_buket_path: Prefix for image paths; used only for single-line
            paragraphs, which are delegated to ``line_to_standard_format``.

    Returns:
        ``{}`` for an empty paragraph, the result of
        ``line_to_standard_format`` for a single line, or a
        ``{'type': 'text', 'text': ..., 'inline_equation_num': ...}`` dict
        for multiple lines.
    """
    if not para:
        return {}
    if len(para) == 1:
        return line_to_standard_format(para[0], img_buket_path)

    para_text = ''
    inline_equation_num = 0
    for line in para:
        for span in line['spans']:
            language = ''
            span_type = span.get('type')
            content = ""
            if span_type == ContentType.Text:
                content = span['content']
                language = detect_lang(content)
                # Only split long words for English text; splitting Chinese
                # text would lose content.
                if language == 'en':
                    content = split_long_words(content)
                content = ocr_escape_special_markdown_char(content)
            elif span_type == ContentType.InlineEquation:
                content = f"${span['content']}$"
                inline_equation_num += 1

            if language == 'en':
                # English context: separate pieces with a space.
                para_text += content + ' '
            else:
                # Otherwise (e.g. Chinese) pieces are concatenated directly.
                para_text += content
    return {
        'type': 'text',
        'text': para_text,
        'inline_equation_num': inline_equation_num,
    }

赵小蒙's avatar
赵小蒙 committed
216

赵小蒙's avatar
赵小蒙 committed
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
def para_to_standard_format_v2(para_block, img_buket_path):
    """Convert one paragraph block to a standard-format content dict.

    Args:
        para_block: Block dict with a ``'type'`` key; text-like blocks carry
            ``'lines'``, image/table blocks carry ``'blocks'``.
        img_buket_path: Prefix joined with the block's ``'image_path'``.

    Returns:
        A dict whose ``'type'`` is ``'text'``, ``'equation'``, ``'image'`` or
        ``'table'`` plus type-specific fields, or ``{}`` when the block type is
        not one of the handled ones.
    """
    para_type = para_block['type']
    # Start from an empty dict so an unhandled block type returns {} instead
    # of raising UnboundLocalError at the return statement.
    para_content = {}
    if para_type == BlockType.Text:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
        }
    elif para_type == BlockType.Title:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
            'text_level': 1,
        }
    elif para_type == BlockType.InterlineEquation:
        para_content = {
            'type': 'equation',
            'text': merge_para_with_text(para_block),
            'text_format': "latex",
        }
    elif para_type == BlockType.Image:
        para_content = {
            'type': 'image',
        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.ImageBody:
                para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
            if block['type'] == BlockType.ImageCaption:
                para_content['img_caption'] = merge_para_with_text(block)
    elif para_type == BlockType.Table:
        para_content = {
            'type': 'table',
        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.TableBody:
                # Only the first span of the first line is consulted here
                # (marked TODO in the original code).
                first_span = block["lines"][0]["spans"][0]
                if first_span.get('content', ''):
                    para_content['table_body'] = f"\n {first_span['content']}  \n"
                para_content['img_path'] = join_path(img_buket_path, first_span['image_path'])
            if block['type'] == BlockType.TableCaption:
                para_content['table_caption'] = merge_para_with_text(block)
            if block['type'] == BlockType.TableFootnote:
                para_content['table_footnote'] = merge_para_with_text(block)

    return para_content


赵小蒙's avatar
赵小蒙 committed
263
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
    """Convert every paragraph block of every page to standard format.

    Args:
        pdf_info_dict: Per-page dicts carrying a ``"para_blocks"`` entry.
        img_buket_path: Prefix forwarded to ``para_to_standard_format_v2``.

    Returns:
        A flat list with one content dict per paragraph block, in page order.
        Pages without paragraph blocks are skipped.
    """
    content_list = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        content_list.extend(
            para_to_standard_format_v2(para_block, img_buket_path)
            for para_block in paras_of_layout
        )
    return content_list


275
def line_to_standard_format(line, img_buket_path):
    """Convert a single line dict to a standard-format content dict.

    The first image span, table span (both without ``'content'`` but with an
    ``'image_path'``) or interline-equation span encountered determines the
    result and ends processing. Otherwise text and inline-equation spans are
    accumulated into one text entry.

    Args:
        line: Dict with a ``'spans'`` list.
        img_buket_path: Prefix joined with a span's ``'image_path'``.

    Returns:
        A dict of type ``'image'``, ``'table'``, ``'equation'`` or ``'text'``.
    """
    line_text = ""
    inline_equation_num = 0
    for span in line['spans']:
        if not span.get('content'):
            if not span.get('image_path'):
                continue
            span_type = span['type']
            if span_type == ContentType.Image:
                return {
                    'type': 'image',
                    'img_path': join_path(img_buket_path, span['image_path']),
                }
            if span_type == ContentType.Table:
                return {
                    'type': 'table',
                    'img_path': join_path(img_buket_path, span['image_path']),
                }
            continue

        span_type = span['type']
        if span_type == ContentType.InterlineEquation:
            interline_equation = span['content']
            return {
                'type': 'equation',
                'latex': f"$$\n{interline_equation}\n$$",
            }
        if span_type == ContentType.InlineEquation:
            inline_equation = span['content']
            line_text += f"${inline_equation}$"
            inline_equation_num += 1
        elif span_type == ContentType.Text:
            # Escape special markdown characters.
            line_text += ocr_escape_special_markdown_char(span['content'])
    return {
        'type': 'text',
        'text': line_text,
        'inline_equation_num': inline_equation_num,
    }


赵小蒙's avatar
赵小蒙 committed
318
def ocr_mk_mm_standard_format(pdf_info_dict: list, img_buket_path: str = ""):
    """Build a standard-format content list from each page's ``"preproc_blocks"``.

    Fields that may appear in the content_list entries:

    type         string      image/text/table/equation (interline equations
                             are emitted separately, inline ones are merged
                             into text)
    latex        string      LaTeX text field.
    text         string      Plain-text data.
    md           string      Markdown-formatted text data.
    img_path     string      s3://full/path/to/img.jpg

    Args:
        pdf_info_dict: Per-page dicts carrying a ``"preproc_blocks"`` entry.
        img_buket_path: Prefix forwarded to ``line_to_standard_format`` for
            image paths. Defaults to ``""`` so existing one-argument callers
            keep working.

    Returns:
        A flat list with one content dict per line of every block.
    """
    content_list = []
    for page_info in pdf_info_dict:
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                # line_to_standard_format requires the image path prefix; the
                # previous one-argument call raised TypeError.
                content_list.append(line_to_standard_format(line, img_buket_path))
    return content_list
赵小蒙's avatar
赵小蒙 committed
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370


def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_path: str = ""):
    """Produce markdown or standard-format output for a document.

    Args:
        pdf_info_dict: Per-page dicts; reads ``"need_drop"``, ``"drop_reason"``
            and ``"para_blocks"``.
        make_mode: One of the ``MakeMode`` values handled below.
        drop_mode: One of the ``DropMode`` values; consulted only for pages
            whose ``"need_drop"`` is truthy.
        img_buket_path: Prefix for image paths (unused for ``NLP_MD``).

    Returns:
        A markdown string for ``MM_MD`` / ``NLP_MD``, a list of content dicts
        for ``STANDARD_FORMAT``, and ``None`` for any other ``make_mode``.

    Raises:
        Exception: If a page is flagged for dropping and ``drop_mode`` is
            ``WHOLE_PDF`` or is not a recognised value.
    """
    output_content = []
    for page_info in pdf_info_dict:
        if page_info.get("need_drop", False):
            drop_reason = page_info.get("drop_reason")
            if drop_mode == DropMode.NONE:
                # Keep the page despite the drop flag.
                pass
            elif drop_mode == DropMode.WHOLE_PDF:
                raise Exception(f"drop_mode is {DropMode.WHOLE_PDF} , drop_reason is {drop_reason}")
            elif drop_mode == DropMode.SINGLE_PAGE:
                logger.warning(f"drop_mode is {DropMode.SINGLE_PAGE} , drop_reason is {drop_reason}")
                continue
            else:
                raise Exception("drop_mode can not be null")

        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        if make_mode == MakeMode.MM_MD:
            output_content.extend(
                ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
            )
        elif make_mode == MakeMode.NLP_MD:
            output_content.extend(
                ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
            )
        elif make_mode == MakeMode.STANDARD_FORMAT:
            for para_block in paras_of_layout:
                output_content.append(
                    para_to_standard_format_v2(para_block, img_buket_path)
                )

    if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
        return '\n\n'.join(output_content)
    elif make_mode == MakeMode.STANDARD_FORMAT:
        return output_content
    # Unrecognised make_mode: nothing was collected and no format applies.
    return None