import math
import warnings

from loguru import logger

from magic_pdf.config.ocr_content_type import ContentType
from magic_pdf.libs.boxbase import (find_bottom_nearest_text_bbox,
                                    find_top_nearest_text_bbox)
from magic_pdf.libs.commons import join_path

# Module-level aliases for the ContentType members naming inline and interline equations.
TYPE_INLINE_EQUATION = ContentType.InlineEquation
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
# Content-node 'type' values treated as text: plain text plus headings h1-h6.
UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']


def mk_nlp_markdown_1(para_dict: dict):
    """Join the paragraph texts of all pages into one Markdown string.

    Deprecated: a ``DeprecationWarning`` is emitted on every call.

    Args:
        para_dict: Mapping of page key -> page info dict. A page may carry
            ``'para_blocks'``, a list of blocks whose ``'paras'`` dict holds
            paragraphs with ``'para_text'``, ``'is_para_title'`` and
            ``'para_title_level'``. Pages without paragraph blocks are
            skipped.

    Returns:
        The paragraphs separated by blank lines; title paragraphs are
        prefixed with ``para_title_level`` ``#`` characters.
    """
    # Decorating with the DeprecationWarning class would rebind this name to
    # an exception instance, which is not callable; warn at call time instead.
    warnings.warn(
        'mk_nlp_markdown_1 is deprecated', DeprecationWarning, stacklevel=2
    )

    content_lst = []
    for page_info in para_dict.values():
        para_blocks = page_info.get('para_blocks')
        if not para_blocks:
            continue

        for block in para_blocks:
            for p in block['paras'].values():
                para_text = p['para_text']
                if p['is_para_title']:
                    # The level is read only for titles, so a non-title
                    # paragraph with no usable level cannot fail here.
                    md_title_prefix = '#' * p['para_title_level']
                    content_lst.append(f'{md_title_prefix} {para_text}')
                else:
                    content_lst.append(para_text)

    return '\n\n'.join(content_lst)


# Locate the target string inside the paragraph.
def __find_index(paragraph, target):
    """Return the offset of the first occurrence of target, or None if absent."""
    position = paragraph.find(target)
    return None if position == -1 else position


def __insert_string(paragraph, target, position):
    """Return paragraph with target spliced in at index position.

    Slicing semantics apply, so a position past the end appends target.
    """
    return paragraph[:position] + target + paragraph[position:]


def __insert_after(content, image_content, target):
    """Insert image_content right after the first occurrence of target.

    The inserted text is surrounded by blank lines. When target does not
    occur in content, an error is logged and content is returned unchanged.
    """
    index = content.find(target)
    if index == -1:
        logger.error(
            f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
        )
        return content

    end = index + len(target)
    return content[:end] + '\n\n' + image_content + '\n\n' + content[end:]

def __insert_before(content, image_content, target):
    """Insert image_content right before the first occurrence of target.

    The inserted text is surrounded by blank lines. When target does not
    occur in content, an error is logged and content is returned unchanged.
    """
    index = content.find(target)
    if index == -1:
        logger.error(
            f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
        )
        return content

    return content[:index] + '\n\n' + image_content + '\n\n' + content[index:]


def _insert_image_into_page_md(page_md, img, pymu_raw_blocks):
    """Return page_md with the Markdown link of img placed next to nearby text.

    The (x0, y0) corner of the image bbox is tested against each raw block
    and, inside the first matching block, against each of its lines.
    """
    imgbox = img['bbox']
    img_content = f"![]({img['image_path']})"
    x0, y0 = imgbox[0], imgbox[1]

    def holds_origin(box):
        # The image corner lies in box, with a tolerance of 1 on every side.
        return box[0] - 1 <= x0 < box[2] + 1 and box[1] - 1 <= y0 < box[3] + 1

    def text_of(line):
        return ''.join([s['text'] for s in line['spans']])

    for block in pymu_raw_blocks:
        if not holds_origin(block['bbox']):
            continue

        lines = block['lines']
        for line in lines:
            if holds_origin(line['bbox']):
                # The corner is inside this line: put the image before it.
                return __insert_before(page_md, img_content, text_of(line))

        if not lines:
            logger.error(
                f"Can't find the location of image {img['image_path']} in the markdown file  #1"
            )
            continue

        # Between lines: anchor on the line whose (x0, y0) is closest.
        min_distance, min_line = min(
            (
                (math.hypot(ln['bbox'][0] - x0, ln['bbox'][1] - y0), ln)
                for ln in lines
            ),
            key=lambda pair: pair[0],
        )
        line_txt = text_of(min_line)
        img_h = imgbox[3] - imgbox[1]
        if min_distance < img_h:  # the text comes before the image
            return __insert_after(page_md, img_content, line_txt)
        return __insert_before(page_md, img_content, line_txt)

    # No block holds the corner, so the image should sit between two blocks:
    # use the nearest text block above, else the nearest one below.
    top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
    if top_txt_block:
        return __insert_after(
            page_md, img_content, text_of(top_txt_block['lines'][-1])
        )

    bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, imgbox)
    if bottom_txt_block:
        return __insert_before(
            page_md, img_content, text_of(bottom_txt_block['lines'][0])
        )

    logger.error(
        f"Can't find the location of image {img['image_path']} in the markdown file  #2"
    )
    return page_md


def mk_mm_markdown_1(para_dict: dict):
    """Assemble multimodal Markdown (text plus image links) for all pages.

    Deprecated: a ``DeprecationWarning`` is emitted on every call.

    Args:
        para_dict: Mapping of page key -> page info dict. The keys read are
            ``'para_blocks'``, ``'preproc_blocks'``, ``'images'``,
            ``'image_backup'``, ``'tables'`` and ``'table_backup'``.

    Returns:
        The Markdown of every page, joined by blank lines. A page lacking
        paragraph blocks or raw blocks yields only its image links.
    """
    # Decorating with the DeprecationWarning class would rebind this name to
    # an exception instance, which is not callable; warn at call time instead.
    warnings.warn(
        'mk_mm_markdown_1 is deprecated', DeprecationWarning, stacklevel=2
    )

    content_lst = []
    for page_info in para_dict.values():
        para_blocks = page_info.get('para_blocks')
        pymu_raw_blocks = page_info.get('preproc_blocks')

        # Tables are handled as images here.
        all_page_images = []
        all_page_images.extend(page_info.get('images', []))
        all_page_images.extend(page_info.get('image_backup', []))
        all_page_images.extend(page_info.get('tables', []))
        all_page_images.extend(page_info.get('table_backup', []))

        if not para_blocks or not pymu_raw_blocks:
            # Image-only page.  TODO: image order
            page_md = '\n\n'.join(
                [f"![]({img['image_path']})" for img in all_page_images]
            )
        else:
            page_lst = []  # paragraphs of this page
            for block in para_blocks:
                for p in block['paras'].values():
                    para_text = p['para_text']
                    is_title = p['is_para_title']
                    title_level = p['para_title_level']
                    md_title_prefix = '#' * title_level
                    if is_title:
                        page_lst.append(f'{md_title_prefix} {para_text}')
                    else:
                        page_lst.append(para_text)

            # Join the page text, then place every image inside it.
            page_md = '\n\n'.join(page_lst)
            for img in all_page_images:
                page_md = _insert_image_into_page_md(
                    page_md, img, pymu_raw_blocks
                )

        content_lst.append(page_md)

    # Join all pages.
    return '\n\n'.join(content_lst)


def __insert_after_para(text, type, element, content_list):
    """Insert an image/table node after the first text node containing text.

    Args:
        text: Substring searched for in the ``'text'`` of nodes whose
            ``'type'`` is listed in ``UNI_FORMAT_TEXT_TYPE``.
        type: ``'image'`` or ``'table'``; selects the shape of the new node.
        element: Source dict; ``'image_path'`` is read for both kinds, and
            ``'text'`` and ``'quality'`` additionally for tables.
        content_list: List of content nodes, modified in place.

    Raises:
        ValueError: If a matching node is found but type is neither
            ``'image'`` nor ``'table'``.

    When no node matches, an error is logged and the list is left unchanged.
    """
    for i, c in enumerate(content_list):
        if c.get('type') not in UNI_FORMAT_TEXT_TYPE:
            continue
        # A node whose text is None is treated as empty rather than crashing.
        if text not in (c.get('text') or ''):
            continue

        if type == 'image':
            content_node = {
                'type': 'image',
                'img_path': element.get('image_path'),
                'img_alt': '',
                'img_title': '',
                'img_caption': '',
            }
        elif type == 'table':
            content_node = {
                'type': 'table',
                'img_path': element.get('image_path'),
                'table_latex': element.get('text'),
                'table_title': '',
                'table_caption': '',
                'table_quality': element.get('quality'),
            }
        else:
            # Without this branch an unknown type left content_node unbound.
            raise ValueError(f'unsupported element type: {type!r}')

        content_list.insert(i + 1, content_node)
        break
    else:
        logger.error(
            f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
        )


def __insert_before_para(text, type, element, content_list):
    """Insert an image/table node before the first text node containing text.

    Args:
        text: Substring searched for in the ``'text'`` of nodes whose
            ``'type'`` is listed in ``UNI_FORMAT_TEXT_TYPE``.
        type: ``'image'`` or ``'table'``; selects the shape of the new node.
        element: Source dict; ``'image_path'`` is read for both kinds, and
            ``'text'`` and ``'quality'`` additionally for tables.
        content_list: List of content nodes, modified in place.

    Raises:
        ValueError: If a matching node is found but type is neither
            ``'image'`` nor ``'table'``.

    When no node matches, an error is logged and the list is left unchanged.
    """
    for i, c in enumerate(content_list):
        if c.get('type') not in UNI_FORMAT_TEXT_TYPE:
            continue
        # A node whose text is None is treated as empty rather than crashing.
        if text not in (c.get('text') or ''):
            continue

        if type == 'image':
            content_node = {
                'type': 'image',
                'img_path': element.get('image_path'),
                'img_alt': '',
                'img_title': '',
                'img_caption': '',
            }
        elif type == 'table':
            content_node = {
                'type': 'table',
                'img_path': element.get('image_path'),
                'table_latex': element.get('text'),
                'table_title': '',
                'table_caption': '',
                'table_quality': element.get('quality'),
            }
        else:
            # Without this branch an unknown type left content_node unbound.
            raise ValueError(f'unsupported element type: {type!r}')

        content_list.insert(i, content_node)
        break
    else:
        logger.error(
            f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
        )

def mk_universal_format(pdf_info_list: list, img_buket_path):
    """Build the unified content list.

    Format reference: https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY.

    Args:
        pdf_info_list: One page-info dict per page. The keys read are
            ``'para_blocks'``, ``'preproc_blocks'``, ``'images'``,
            ``'image_backup'`` and ``'tables'``.
        img_buket_path: Prefix joined onto image paths for pages that have
            no text (see the note in the image-only branch).

    Returns:
        A flat list of content nodes (dicts with a ``'type'`` key) covering
        all pages.
    """
    content_lst = []
    for page_info in pdf_info_list:
        page_lst = []  # nodes of this page
        para_blocks = page_info.get('para_blocks')
        pymu_raw_blocks = page_info.get('preproc_blocks')

        all_page_images = []
        all_page_images.extend(page_info.get('images', []))
        all_page_images.extend(page_info.get('image_backup', []))
        # Only 'tables' is collected; 'table_backup' is not read here.
        all_page_tables = []
        all_page_tables.extend(page_info.get('tables', []))

        if not para_blocks or not pymu_raw_blocks:
            # Image-only page: the nodes are appended directly.
            # NOTE(review): paths are joined with img_buket_path only in this
            # branch; nodes inserted next to text keep the raw 'image_path'.
            for img in all_page_images:
                content_node = {
                    'type': 'image',
                    'img_path': join_path(img_buket_path, img['image_path']),
                    'img_alt': '',
                    'img_title': '',
                    'img_caption': '',
                }
                page_lst.append(content_node)  # TODO: image order
            for table in all_page_tables:
                content_node = {
                    'type': 'table',
                    'img_path': join_path(img_buket_path, table['image_path']),
                    'table_latex': table.get('text'),
                    'table_title': '',
                    'table_caption': '',
                    'table_quality': table.get('quality'),
                }
                page_lst.append(content_node)  # TODO: image order
            content_lst.extend(page_lst)
            # Nothing left to place: running the insert loops here would add
            # the same elements twice and fails when there are no raw blocks.
            continue

        for block in para_blocks:
            for p in block['paras'].values():
                # A paragraph is either regular text or an interline equation.
                font_type = p['para_font_type']
                if font_type == TYPE_INTERLINE_EQUATION:
                    content_node = {'type': 'equation', 'latex': p['para_text']}
                elif p['is_para_title']:
                    content_node = {
                        'type': f"h{p['para_title_level']}",
                        'text': p['para_text'],
                    }
                else:
                    content_node = {'type': 'text', 'text': p['para_text']}
                page_lst.append(content_node)

        content_lst.extend(page_lst)

        # Insert images, then tables, next to their anchoring text.
        for img in all_page_images:
            insert_img_or_table('image', img, pymu_raw_blocks, content_lst)
        for table in all_page_tables:
            insert_img_or_table('table', table, pymu_raw_blocks, content_lst)

    return content_lst


def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
    """Insert an image/table node into content_lst next to nearby text.

    The (x0, y0) corner of the element bbox is tested against each raw block
    and, inside the first matching block, against each of its lines. The
    element is placed exactly once; if no anchor is found an error is
    logged.

    Args:
        type: ``'image'`` or ``'table'``, forwarded to the insert helpers.
        element: Dict with ``'bbox'``; also forwarded to the insert helpers.
        pymu_raw_blocks: Raw text blocks, each with ``'bbox'`` and
            ``'lines'``; every line has ``'bbox'`` and ``'spans'``.
        content_lst: List of content nodes, modified in place.
    """
    element_bbox = element['bbox']
    x0, y0 = element_bbox[0], element_bbox[1]

    def holds_origin(box):
        # The element corner lies in box, with a tolerance of 1 on every side.
        return box[0] - 1 <= x0 < box[2] + 1 and box[1] - 1 <= y0 < box[3] + 1

    def text_of(line):
        return ''.join([s['text'] for s in line['spans']])

    for block in pymu_raw_blocks:
        if not holds_origin(block['bbox']):
            continue

        # Inside this block: compare against every line, not only the first.
        lines = block['lines']
        for line in lines:
            if holds_origin(line['bbox']):
                # The corner is inside this line: insert before it.
                __insert_before_para(text_of(line), type, element, content_lst)
                return

        if not lines:
            logger.error(
                f"Can't find the location of image {element.get('image_path')} in the markdown file  #1"
            )
            continue

        # Between lines: anchor on the line whose (x0, y0) is closest.
        min_distance, min_line = min(
            (
                (math.hypot(ln['bbox'][0] - x0, ln['bbox'][1] - y0), ln)
                for ln in lines
            ),
            key=lambda pair: pair[0],
        )
        line_txt = text_of(min_line)
        img_h = element_bbox[3] - element_bbox[1]
        if min_distance < img_h:  # the text comes before the element
            __insert_after_para(line_txt, type, element, content_lst)
        else:
            __insert_before_para(line_txt, type, element, content_lst)
        # Placed: stop, so the between-blocks fallback does not add it again.
        return

    # No block holds the corner, so the element should sit between two
    # blocks: use the nearest text block above, else the nearest one below.
    top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
    if top_txt_block:
        line_txt = text_of(top_txt_block['lines'][-1])
        __insert_after_para(line_txt, type, element, content_lst)
        return

    bottom_txt_block = find_bottom_nearest_text_bbox(
        pymu_raw_blocks, element_bbox
    )
    if bottom_txt_block:
        line_txt = text_of(bottom_txt_block['lines'][0])
        __insert_before_para(line_txt, type, element, content_lst)
        return

    # TODO: the element may occupy a whole column with nothing above or below.
    logger.error(
        f"Can't find the location of image {element.get('image_path')} in the markdown file  #2"
    )


def mk_mm_markdown(content_list):
    """Build Markdown, including image links, from a unified content list.

    Args:
        content_list: Content nodes (dicts with a ``'type'`` key). Handled
            types are ``'text'``, ``'equation'``, ``'image'`` and the heading
            types in ``UNI_FORMAT_TEXT_TYPE``; other nodes (tables, for
            instance) produce no output.

    Returns:
        The rendered pieces joined by blank lines.
    """
    content_md = []
    for c in content_list:
        content_type = c.get('type')
        if content_type == 'text':
            content_md.append(c.get('text'))
        elif content_type == 'equation':
            latex = c.get('latex')
            if latex.startswith('$$') and latex.endswith('$$'):
                # Already delimited: emit as-is.
                content_md.append(latex)
            else:
                content_md.append(f'\n$$\n{latex}\n$$\n')
        elif content_type == 'image':
            content_md.append(f"![]({c.get('img_path')})")
        elif content_type in UNI_FORMAT_TEXT_TYPE:
            # 'text' was handled above, so this is 'h1'..'h6': the digit
            # gives the number of '#' characters.
            content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
    return '\n\n'.join(content_md)


def mk_nlp_markdown(content_list):
    """Build Markdown without images from a unified content list.

    Args:
        content_list: Content nodes (dicts with a ``'type'`` key). Handled
            types are ``'text'``, ``'equation'``, ``'table'`` and the heading
            types in ``UNI_FORMAT_TEXT_TYPE``; other nodes (images, for
            instance) produce no output.

    Returns:
        The rendered pieces joined by blank lines. Equations are wrapped in
        ``$$`` lines and table LaTeX in ``$$$`` lines.
    """
    content_md = []
    for c in content_list:
        content_type = c.get('type')
        if content_type == 'text':
            content_md.append(c.get('text'))
        elif content_type == 'equation':
            content_md.append(f"$$\n{c.get('latex')}\n$$")
        elif content_type == 'table':
            content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
        elif content_type in UNI_FORMAT_TEXT_TYPE:
            # 'text' was handled above, so this is 'h1'..'h6': the digit
            # gives the number of '#' characters.
            content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
    return '\n\n'.join(content_md)