import json
import os
import sys
from pathlib import Path

import click
from loguru import logger

from magic_pdf.libs.commons import join_path, read_file
from magic_pdf.dict2md.mkcontent import mk_mm_markdown, mk_universal_format
from magic_pdf.pipeline import parse_pdf_by_model



def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_profile: str, start_page_num=0, debug_mode=True):
    """Parse one PDF with its layout-model output and write the result to ``book.md``.

    The PDF bytes are loaded with ``read_file`` and handed to
    ``parse_pdf_by_model``. Unless the returned dict is flagged
    ``need_drop``, it is rendered with ``mk_universal_format`` followed by
    ``mk_mm_markdown``; otherwise its ``drop_reason`` is written instead.

    Output is written below ``tmp/unittest`` (resolved three levels above
    this file's directory), at ``<md dir>/<pdf file name>/book.md``.

    Args:
        s3_pdf_path: Location of the PDF; passed to ``read_file``. Its final
            path component (including the extension) names the output folder.
        s3_pdf_profile: Profile passed to ``read_file`` together with the path.
        pdf_model_path: Forwarded unchanged to ``parse_pdf_by_model``.
            NOTE(review): ``main_shell2`` passes an already-loaded JSON
            object here rather than a path -- confirm the accepted types.
        pdf_model_profile: Forwarded unchanged to ``parse_pdf_by_model``.
        start_page_num: Forwarded unchanged to ``parse_pdf_by_model``.
        debug_mode: Forwarded to ``parse_pdf_by_model`` as ``debug_mode``.

    Returns:
        None. Exceptions raised while parsing or writing are reported on
        stderr and through ``logger.exception``; they are not re-raised.
        A failure in ``read_file`` itself does propagate.
    """
    pth = Path(s3_pdf_path)
    book_name = pth.name
    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "..", "tmp", "unittest")
    save_path = join_path(save_tmp_path, "md")
    text_content_save_path = f"{save_path}/{book_name}/book.md"

    pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)

    try:
        paras_dict = parse_pdf_by_model(
            pdf_bytes, pdf_model_path, save_path, book_name, pdf_model_profile, start_page_num, debug_mode=debug_mode
        )

        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(os.path.dirname(text_content_save_path), exist_ok=True)

        if not paras_dict.get('need_drop'):
            content_list = mk_universal_format(paras_dict)
            markdown_content = mk_mm_markdown(content_list)
        else:
            # The document was rejected: record why instead of its content.
            markdown_content = paras_dict['drop_reason']

        with open(text_content_save_path, "w", encoding="utf-8") as f:
            f.write(markdown_content)

    except Exception as e:
        # Best-effort boundary: callers loop over many PDFs, so one bad file
        # is reported and skipped rather than aborting the whole run.
        print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
        logger.exception(e)


@click.command()
@click.option("--pdf-file-path", help="s3上pdf文件的路径")
@click.option("--save-path", help="解析出来的图片,文本的保存父目录")
def main_shell(pdf_file_path: str, save_path: str):
    """CLI entry: parse a single PDF stored on S3 using fixed buckets and profiles.

    ``pdf_file_path`` is split into its parent and file name; both are
    appended to the hard-coded PDF bucket prefix and to the hard-coded
    layout-model prefix, and the two resulting locations are handed to
    ``main``.

    Args:
        pdf_file_path: Path of the PDF relative to the bucket prefixes.
        save_path: Accepted for command-line compatibility but not used:
            ``main`` has no output-directory parameter and always writes to
            its own fixed location.
    """
    pdf_bin_file_parent_path = "s3://llm-raw-snew/llm-raw-scihub/"
    pdf_bin_file_profile = "s2"
    pdf_model_parent_dir = "s3://llm-pdf-text/eval_1k/layout_res/"
    pdf_model_profile = "langchao"

    p = Path(pdf_file_path)
    pdf_parent_path = p.parent
    pdf_file_name = p.name  # PDF file name, including its extension
    pdf_bin_file_path = join_path(pdf_bin_file_parent_path, pdf_parent_path)
    pdf_model_dir = join_path(pdf_model_parent_dir, pdf_parent_path)

    if save_path is not None:
        # Previously save_path was passed positionally and landed in main()'s
        # start_page_num slot; make the fact that it is unused explicit.
        logger.warning("--save-path is ignored; main() writes to a fixed output directory")

    main(
        join_path(pdf_bin_file_path, pdf_file_name),
        pdf_bin_file_profile,
        join_path(pdf_model_dir, pdf_file_name),
        pdf_model_profile,
    )


@click.command()
@click.option("--pdf-dir", help="本地pdf文件的路径")
@click.option("--model-dir", help="本地模型文件的路径")
@click.option("--start-page-num", default=0, help="从第几页开始解析")
def main_shell2(pdf_dir: str, model_dir: str, start_page_num: int):
    """CLI entry: parse local PDF files using local model JSON files.

    ``pdf_dir`` may be a single PDF file or a directory; for a directory
    every ``*.pdf`` directly inside it is processed. For each PDF the model
    data is read from ``<model_dir>/<pdf stem>.json``, loaded with
    ``json.load`` and passed to ``main`` together with the PDF path.

    Args:
        pdf_dir: A PDF file, or a directory containing PDF files.
        model_dir: Directory holding one ``.json`` model file per PDF.
        start_page_num: Forwarded unchanged to ``main``.
    """
    # Collect the PDF file names to process first.
    pdf_dir = Path(pdf_dir)
    model_dir = Path(model_dir)

    if pdf_dir.is_file():
        pdf_file_names = [pdf_dir.name]
        pdf_dir = pdf_dir.parent
    else:
        pdf_file_names = [f.name for f in pdf_dir.glob("*.pdf")]

    for pdf_file in pdf_file_names:
        pdf_file_path = os.path.join(pdf_dir, pdf_file)
        # Use the stem rather than str.rstrip(".pdf"): rstrip removes any
        # trailing '.', 'p', 'd', 'f' characters, so "app.pdf" became "a".
        model_file_path = model_dir / f"{Path(pdf_file).stem}.json"
        with open(model_file_path, "r", encoding="utf-8") as json_file:
            model_list = json.load(json_file)
        main(pdf_file_path, None, model_list, None, start_page_num)



# When executed as a script, run the local-files command (main_shell2).
if __name__ == "__main__":
    main_shell2()