ocr_mkcontent.py 8.98 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
from magic_pdf.libs.commons import s3_image_save_path, join_path
赵小蒙's avatar
赵小蒙 committed
2
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
赵小蒙's avatar
赵小蒙 committed
3
from magic_pdf.libs.ocr_content_type import ContentType
4
5
6
7
8
9
10
11
12
13
14
15
16
import wordninja
import re


def split_long_words(text):
    """Break up over-long tokens in *text* so they read as separate words.

    The text is cut on single space characters. Each piece is tokenised into
    runs of word characters and individual punctuation characters; a token
    longer than 15 characters is passed to ``wordninja.split`` and its parts
    are joined with spaces. Tokens are then glued back together without a
    separator and the pieces are re-joined with single spaces.

    Note: whitespace other than the space character (tabs, newlines) matches
    neither token pattern and is therefore dropped from the result.
    """

    def _rebuild(piece):
        # Tokens are either a run of word characters or one punctuation mark.
        tokens = re.findall(r'\w+|[^\w\s]', piece, re.UNICODE)
        return ''.join(
            ' '.join(wordninja.split(token)) if len(token) > 15 else token
            for token in tokens
        )

    return ' '.join(_rebuild(piece) for piece in text.split(' '))
赵小蒙's avatar
赵小蒙 committed
17
18


赵小蒙's avatar
赵小蒙 committed
19
def ocr_mk_nlp_markdown(pdf_info_dict: dict):
    """Render the textual spans of every page as line-oriented Markdown.

    For each page the function walks ``preproc_blocks`` -> ``lines`` ->
    ``spans``. Spans without a truthy ``content`` are skipped, so spans that
    only carry an image contribute nothing. Inline equations are wrapped in
    ``$...$`` and interline equations in a ``$$`` fence.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Pages with a
            missing or empty ``preproc_blocks`` entry are skipped.

    Returns:
        A single string with one entry per source line, joined by newlines.
        Every entry ends with two spaces, including entries for lines that
        produced no text.
    """
    markdown = []
    for page_info in pdf_info_dict.values():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                parts = []
                for span in line['spans']:
                    if not span.get('content'):
                        continue
                    # Escape characters that have a special meaning in Markdown.
                    content = ocr_escape_special_markdown_char(span['content'])
                    span_type = span.get('type')
                    if span_type == ContentType.InlineEquation:
                        content = f"${content}$"
                    elif span_type == ContentType.InterlineEquation:
                        content = f"$$\n{content}\n$$"
                    parts.append(content)
                # Two trailing spaces force a Markdown hard line break.
                markdown.append(' '.join(parts).strip() + '  ')
    return '\n'.join(markdown)
41

赵小蒙's avatar
赵小蒙 committed
42

赵小蒙's avatar
赵小蒙 committed
43
def ocr_mk_mm_markdown(pdf_info_dict: dict):
    """Render every page as line-oriented Markdown, including image links.

    For each page the function walks ``preproc_blocks`` -> ``lines`` ->
    ``spans``. A span with truthy ``content`` is escaped and, for equations,
    wrapped in ``$...$`` (inline) or a ``$$`` fence (interline). A span
    without content but with a truthy ``image_path`` becomes a Markdown image
    link whose target is ``join_path(s3_image_save_path, image_path)``. Spans
    with neither are skipped.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Pages with a
            missing or empty ``preproc_blocks`` entry are skipped.

    Returns:
        A single string with one entry per source line, joined by newlines.
        Every entry ends with two spaces, including entries for lines that
        produced no output.
    """
    markdown = []
    for page_info in pdf_info_dict.values():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                parts = []
                for span in line['spans']:
                    if span.get('content'):
                        # Escape characters that have a special meaning in Markdown.
                        content = ocr_escape_special_markdown_char(span['content'])
                        span_type = span.get('type')
                        if span_type == ContentType.InlineEquation:
                            content = f"${content}$"
                        elif span_type == ContentType.InterlineEquation:
                            content = f"$$\n{content}\n$$"
                    elif span.get('image_path'):
                        content = f"![]({join_path(s3_image_save_path, span['image_path'])})"
                    else:
                        # Nothing renderable on this span.
                        continue
                    parts.append(content)
                # Two trailing spaces force a Markdown hard line break.
                markdown.append(' '.join(parts).strip() + '  ')
    return '\n'.join(markdown)
69

70

赵小蒙's avatar
赵小蒙 committed
71
def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
    """Render all pages as paragraph-level Markdown in multimodal ("mm") mode.

    Each page's ``para_blocks`` are rendered by
    ``ocr_mk_mm_markdown_with_para_core`` with mode ``"mm"``, which emits
    image links for image/table spans.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts.

    Returns:
        The paragraphs of all pages joined by blank lines. An empty string is
        returned when no page yields a paragraph.
    """
    markdown = []
    for page_info in pdf_info_dict.values():
        paras_of_layout = page_info.get("para_blocks")
        # A page may have no paragraph data; passing None on would make the
        # core renderer fail when it tries to iterate.
        if not paras_of_layout:
            continue
        markdown.extend(ocr_mk_mm_markdown_with_para_core(paras_of_layout, "mm"))
    return '\n\n'.join(markdown)
78
79


80
81
82
83
84
85
86
87
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: dict):
    """Render all pages as paragraph-level Markdown in text-only ("nlp") mode.

    Each page's ``para_blocks`` are rendered by
    ``ocr_mk_mm_markdown_with_para_core`` with mode ``"nlp"``, in which
    image/table spans produce no output.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts.

    Returns:
        The paragraphs of all pages joined by blank lines. An empty string is
        returned when no page yields a paragraph.
    """
    markdown = []
    for page_info in pdf_info_dict.values():
        paras_of_layout = page_info.get("para_blocks")
        # A page may have no paragraph data; passing None on would make the
        # core renderer fail when it tries to iterate.
        if not paras_of_layout:
            continue
        markdown.extend(ocr_mk_mm_markdown_with_para_core(paras_of_layout, "nlp"))
    return '\n\n'.join(markdown)

88
89
90
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
    """Render paragraph-level "mm" Markdown, kept separate per page.

    Args:
        pdf_info_dict: Mapping from a page key to a per-page dict. Pages with
            a missing or empty ``para_blocks`` entry are omitted from the
            result.

    Returns:
        A list with one dict per rendered page, in the mapping's iteration
        order. Each dict has ``page_no`` (the mapping key) and ``md_content``
        (that page's paragraphs joined by blank lines).
    """
    markdown_with_para_and_pagination = []
    for page_no, page_info in pdf_info_dict.items():
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        page_markdown = ocr_mk_mm_markdown_with_para_core(paras_of_layout, "mm")
        markdown_with_para_and_pagination.append({
            'page_no': page_no,
            'md_content': '\n\n'.join(page_markdown),
        })
    return markdown_with_para_and_pagination


102
def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
    """Render one page's paragraphs as a list of Markdown strings.

    Args:
        paras_of_layout: Nested iterable: layouts -> paragraphs -> lines,
            where each line is a dict with a ``spans`` list. ``None`` or an
            empty value yields an empty result.
        mode: ``'mm'`` renders image/table spans as Markdown image links;
            with any other value (callers in this file pass ``'nlp'``) those
            spans produce no output.

    Returns:
        A list with one string per non-empty paragraph. Span outputs are
        separated by single spaces and each paragraph ends with two spaces.
    """
    page_markdown = []
    # Pages without paragraph data may reach this function as None.
    if not paras_of_layout:
        return page_markdown

    for paras in paras_of_layout:
        for para in paras:
            pieces = []
            for line in para:
                for span in line['spans']:
                    span_type = span.get('type')
                    content = ''
                    if span_type == ContentType.Text:
                        content = ocr_escape_special_markdown_char(split_long_words(span['content']))
                    elif span_type == ContentType.InlineEquation:
                        content = f"${ocr_escape_special_markdown_char(span['content'])}$"
                    elif span_type == ContentType.InterlineEquation:
                        content = f"\n$$\n{ocr_escape_special_markdown_char(span['content'])}\n$$\n"
                    elif span_type in [ContentType.Image, ContentType.Table]:
                        # Images and tables are only emitted in multimodal mode.
                        if mode == 'mm':
                            content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
                    if content != '':
                        pieces.append(content)
            para_text = ' '.join(pieces).strip()
            if para_text:
                # Two trailing spaces force a Markdown hard line break.
                page_markdown.append(para_text + '  ')
    return page_markdown


131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
def para_to_standard_format(para):
    """Convert one paragraph (a list of lines) into a standard-format dict.

    Args:
        para: List of line dicts, each with a ``spans`` list.

    Returns:
        * ``{}`` for an empty paragraph.
        * The result of ``line_to_standard_format`` for a single-line
          paragraph.
        * For a multi-line paragraph, a ``'text'`` entry whose ``text`` is
          the escaped text and inline-equation spans, each followed by one
          space, and whose ``inline_equation_num`` counts the inline
          equations. Spans of any other type are ignored.
    """
    para_content = {}
    if len(para) == 1:
        para_content = line_to_standard_format(para[0])
    elif len(para) > 1:
        pieces = []
        inline_equation_num = 0
        for line in para:
            for span in line['spans']:
                span_type = span.get('type')
                if span_type == ContentType.Text:
                    content = ocr_escape_special_markdown_char(split_long_words(span['content']))
                elif span_type == ContentType.InlineEquation:
                    content = f"${ocr_escape_special_markdown_char(span['content'])}$"
                    inline_equation_num += 1
                else:
                    # Unsupported span type: skip it. Without this branch
                    # `content` would be unbound on the first span or would
                    # still hold the previous span's text and be added twice.
                    continue
                pieces.append(content + ' ')
        para_content = {
            'type': 'text',
            'text': ''.join(pieces),
            'inline_equation_num': inline_equation_num
        }
    return para_content

赵小蒙's avatar
赵小蒙 committed
154
155
156
def make_standard_format_with_para(pdf_info_dict: dict):
    """Build a flat standard-format content list from every page's paragraphs.

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Pages with a
            missing or empty ``para_blocks`` entry are skipped.

    Returns:
        A list with one ``para_to_standard_format`` result per paragraph, in
        page, layout and paragraph order.
    """
    content_list = []
    for page_info in pdf_info_dict.values():
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        for paras in paras_of_layout:
            content_list.extend(para_to_standard_format(para) for para in paras)
    return content_list


赵小蒙's avatar
赵小蒙 committed
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
def line_to_standard_format(line):
    """Convert one line (a dict with a ``spans`` list) into a standard-format dict.

    Spans are scanned in order and the first image, table or interline
    equation span ends the scan: its entry is returned immediately and any
    text gathered so far is discarded. Otherwise the escaped text and inline
    equation spans are concatenated without separators into one text entry.

    Args:
        line: Dict with a ``spans`` list of span dicts.

    Returns:
        One of:
        * ``{'type': 'image' | 'table', 'img_path': ...}``
        * ``{'type': 'equation', 'latex': '$$\\n...\\n$$'}``
        * ``{'type': 'text', 'text': ..., 'inline_equation_num': ...}``
    """
    line_text = ""
    inline_equation_num = 0
    for span in line['spans']:
        span_type = span.get('type')
        if not span.get('content'):
            if not span.get('image_path'):
                continue
            if span_type == ContentType.Image:
                kind = 'image'
            elif span_type == ContentType.Table:
                kind = 'table'
            else:
                # An image path on any other span type is ignored.
                continue
            return {
                'type': kind,
                'img_path': join_path(s3_image_save_path, span['image_path'])
            }

        if span_type == ContentType.InterlineEquation:
            # Escape characters that have a special meaning in Markdown.
            interline_equation = ocr_escape_special_markdown_char(span['content'])
            return {
                'type': 'equation',
                'latex': f"$$\n{interline_equation}\n$$"
            }
        if span_type == ContentType.InlineEquation:
            inline_equation = ocr_escape_special_markdown_char(span['content'])
            line_text += f"${inline_equation}$"
            inline_equation_num += 1
        elif span_type == ContentType.Text:
            line_text += ocr_escape_special_markdown_char(span['content'])

    return {
        'type': 'text',
        'text': line_text,
        'inline_equation_num': inline_equation_num
    }


def ocr_mk_mm_standard_format(pdf_info_dict: dict):
    """Build a flat standard-format content list, one entry per line.

    Fields of a content_list entry:

    type         string      image/text/table/equation (interline equations
                             are emitted as separate entries; inline
                             equations are merged into the text)
    latex        string      LaTeX text field.
    text         string      Text data in plain-text format.
    md           string      Text data in markdown format.
    img_path     string      s3://full/path/to/img.jpg

    Args:
        pdf_info_dict: Mapping whose values are per-page dicts. Pages with a
            missing or empty ``preproc_blocks`` entry are skipped.

    Returns:
        A list with one ``line_to_standard_format`` result per line, in page,
        block and line order.
    """
    content_list = []
    for page_info in pdf_info_dict.values():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            content_list.extend(
                line_to_standard_format(line) for line in block['lines']
            )
    return content_list