Commit 7f0c734f authored by 赵小蒙

pipeline refactor

parent 872cd73f
@@ -5,7 +5,7 @@ from pathlib import Path
 import click
 from loguru import logger
-from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.commons import join_path, read_file
 from magic_pdf.dict2md.mkcontent import mk_mm_markdown
 from magic_pdf.pipeline import parse_pdf_by_model
@@ -21,9 +21,11 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
     text_content_save_path = f"{save_path}/{book_name}/book.md"
     # metadata_save_path = f"{save_path}/{book_name}/metadata.json"
+    pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
     try:
         paras_dict = parse_pdf_by_model(
-            s3_pdf_path, s3_pdf_profile, pdf_model_path, save_path, book_name, pdf_model_profile, start_page_num, debug_mode=debug_mode
+            pdf_bytes, pdf_model_path, save_path, book_name, pdf_model_profile, start_page_num, debug_mode=debug_mode
         )
         parent_dir = os.path.dirname(text_content_save_path)
         if not os.path.exists(parent_dir):
...
@@ -71,8 +71,7 @@ paraMergeException_msg = ParaMergeException().message
 def parse_pdf_by_model(
-    s3_pdf_path,
-    s3_pdf_profile,
+    pdf_bytes,
     pdf_model_output,
     save_path,
     book_name,
@@ -83,7 +82,7 @@ def parse_pdf_by_model(
     junk_img_bojids=[],
     debug_mode=False,
 ):
-    pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
     save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
     md_bookname_save_path = ""
     book_name = sanitize_filename(book_name)
...
@@ -304,6 +304,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
     # Start of the main logic
     s3_pdf_path = jso.get("file_location")
     s3_config = get_s3_config(s3_pdf_path)
+    pdf_bytes = read_file(s3_pdf_path, s3_config)
     model_output_json_list = jso.get("doc_layout_result")
     data_source = get_data_source(jso)
     file_id = jso.get("file_id")
@@ -341,8 +342,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
             file=sys.stderr,
         )
         pdf_info_dict = parse_pdf_by_model(
-            s3_pdf_path,
-            s3_config,
+            pdf_bytes,
             model_output_json_list,
             save_path,
             book_name,
@@ -373,18 +373,6 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
     return jso
-"""
-Unified processing logic:
-1. First call parse_pdf to handle text-based PDFs.
-2. Then call ocr_dropped_parse_pdf to handle the PDFs that were dropped earlier.
-"""
-def uni_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
-    jso = parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
-    jso = ocr_dropped_parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
-    return jso
 def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> dict:
     # Check the debug switch
     if debug_mode:
@@ -465,6 +453,19 @@ def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> dict:
     return jso
+"""
+Unified processing logic:
+1. First call parse_pdf to handle text-based PDFs.
+2. Then call ocr_dropped_parse_pdf to handle the PDFs that were dropped earlier.
+"""
+def uni_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
+    jso = parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
+    jso = ocr_dropped_parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
+    return jso
 # Used specifically to run the dropped PDFs; after running, the need_drop field must be set to False
 def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
     if not jso.get("need_drop", False):
...
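
In short, this refactor moves the S3 read out of parse_pdf_by_model: callers now fetch the PDF bytes with read_file and pass them in, so the function no longer takes the s3_pdf_path/s3_pdf_profile pair. A minimal sketch of the new calling convention, mirroring the CLI call in the first file above; every value below is a hypothetical placeholder for illustration, not taken from the commit:

from magic_pdf.libs.commons import read_file
from magic_pdf.pipeline import parse_pdf_by_model

# Placeholder inputs (hypothetical); in the CLI these come from the command-line options.
s3_pdf_path = "s3://bucket/example.pdf"
s3_pdf_profile = "default"
pdf_model_path = "s3://bucket/example_layout_output"
pdf_model_profile = "default"
save_path = "/tmp/magic_pdf_out"
book_name = "example"
start_page_num = 0

# The caller reads the PDF once...
pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)

# ...and passes the raw bytes instead of the S3 path/profile pair.
paras_dict = parse_pdf_by_model(
    pdf_bytes, pdf_model_path, save_path, book_name, pdf_model_profile, start_page_num, debug_mode=True
)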