pdf2md.py 3.28 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
5
6
7
import os
import sys
from pathlib import Path

import click
from loguru import logger

赵小蒙's avatar
赵小蒙 committed
8
9
10
from libs.commons import join_path
from dict2md.mkcontent import mk_mm_markdown
from pipeline.pdf_parse_by_model import parse_pdf_by_model
赵小蒙's avatar
赵小蒙 committed
11
12
13
14
15
16
17
18



def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_profile: str, start_page_num=0, debug_mode=True):
    """Parse one PDF with its model output and save the result as ``book.md``.

    The markdown is written to ``<save_path>/<pdf file name>/book.md`` where
    ``save_path`` is built from ``tmp/unittest`` three directory levels above
    this file plus an ``md`` component (combined via ``join_path``).

    Args:
        s3_pdf_path: Location of the PDF; forwarded to ``parse_pdf_by_model``.
            Its final path component (extension included) names the output
            sub-directory.
        s3_pdf_profile: Forwarded unchanged to ``parse_pdf_by_model``.
        pdf_model_path: Location of the model output for the PDF; forwarded
            unchanged to ``parse_pdf_by_model``.
        pdf_model_profile: Forwarded unchanged to ``parse_pdf_by_model``.
        start_page_num: Forwarded unchanged to ``parse_pdf_by_model``.
        debug_mode: Forwarded to ``parse_pdf_by_model`` as ``debug_mode``.

    Returns:
        None. Any ``Exception`` raised while parsing or writing is reported on
        stderr and logged with its traceback; it is not re-raised, so a caller
        looping over many PDFs keeps going after a failure.
    """
    # Path.name keeps the suffix, so the output directory is e.g. "foo.pdf".
    book_name = Path(s3_pdf_path).name

    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "..", "tmp", "unittest")
    save_path = join_path(save_tmp_path, "md")
    text_content_save_path = f"{save_path}/{book_name}/book.md"

    try:
        paras_dict = parse_pdf_by_model(
            s3_pdf_path, s3_pdf_profile, pdf_model_path, save_path, book_name, pdf_model_profile, start_page_num, debug_mode=debug_mode
        )

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(os.path.dirname(text_content_save_path), exist_ok=True)

        if paras_dict.get('need_drop'):
            # A dropped document gets its drop reason written instead of markdown.
            markdown_content = paras_dict['drop_reason']
        else:
            markdown_content = mk_mm_markdown(paras_dict)

        with open(text_content_save_path, "w", encoding="utf-8") as f:
            f.write(markdown_content)

    except Exception as e:
        # Top-level boundary for a single PDF: report and continue.
        print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
        logger.exception(e)


@click.command()
@click.option("--pdf-file-path", required=True, help="s3上pdf文件的路径")
@click.option("--save-path", help="解析出来的图片,文本的保存父目录")
def main_shell(pdf_file_path: str, save_path: str):
    """Parse a single PDF addressed relative to fixed S3 prefixes.

    The parent part of ``pdf_file_path`` is appended to both the PDF prefix
    and the model-output prefix below, and the file name is appended to each
    result; the two resulting locations are handed to ``main`` together with
    the hard-coded profiles.

    Args:
        pdf_file_path: Path of the PDF relative to the S3 prefixes.
        save_path: Accepted for command-line compatibility. ``main`` has no
            output-directory parameter and always writes to its own fixed
            location, so this value is not used.
    """
    # Example of a full parent path:
    # "s3://llm-raw-snew/llm-raw-scihub/scimag07865000-07865999/10.1007/"
    pdf_bin_file_parent_path = "s3://llm-raw-snew/llm-raw-scihub/"
    pdf_bin_file_profile = "s2"
    pdf_model_parent_dir = "s3://llm-pdf-text/eval_1k/layout_res/"
    pdf_model_profile = "langchao"

    p = Path(pdf_file_path)
    pdf_parent_path = p.parent
    pdf_file_name = p.name  # file name of the PDF, extension included
    pdf_bin_file_path = join_path(pdf_bin_file_parent_path, pdf_parent_path)
    pdf_model_dir = join_path(pdf_model_parent_dir, pdf_parent_path)

    if save_path is not None:
        # Previously save_path was passed positionally and landed in main()'s
        # start_page_num parameter; make the fact that it is unused explicit.
        logger.warning(f"--save-path ({save_path}) is ignored: main() writes to a fixed output directory")

    main(
        join_path(pdf_bin_file_path, pdf_file_name),
        pdf_bin_file_profile,
        join_path(pdf_model_dir, pdf_file_name),
        pdf_model_profile,
    )


@click.command()
@click.option("--pdf-dir", required=True, help="s3上pdf文件的路径")
@click.option("--model-dir", required=True, help="模型输出的父目录,其中的条目与pdf文件同名")
@click.option("--start-page-num", default=0, help="从第几页开始解析")
def main_shell2(pdf_dir: str, model_dir: str, start_page_num: int):
    """Parse one PDF, or every ``*.pdf`` directly inside a directory.

    ``pdf_dir`` is examined with ``pathlib``: if it is an existing file, only
    that file is processed; otherwise it is globbed (non-recursively) for
    ``*.pdf``. For each PDF, the model path is ``model_dir`` joined with the
    PDF's file name, and ``main`` is called with both profiles set to ``None``.

    Args:
        pdf_dir: A PDF file or a directory containing PDF files.
        model_dir: Directory holding one entry per PDF, named like the PDF.
        start_page_num: Forwarded to ``main`` for every PDF.
    """
    pdf_dir = Path(pdf_dir)
    model_dir = Path(model_dir)

    # First collect the names of the PDF files to process.
    if pdf_dir.is_file():
        pdf_file_names = [pdf_dir.name]
        pdf_dir = pdf_dir.parent
    else:
        # glob() order is arbitrary; sort so runs are reproducible.
        pdf_file_names = sorted(f.name for f in pdf_dir.glob("*.pdf"))

    for pdf_file in pdf_file_names:
        pdf_file_path = os.path.join(pdf_dir, pdf_file)
        model_file_path = os.path.join(model_dir, pdf_file)
        main(pdf_file_path, None, model_file_path, None, start_page_num)



# Script entry point: only the main_shell2 command is invoked here;
# main_shell is defined above but not wired up.
if __name__ == "__main__":
    main_shell2()