Commit 1fc053d5 authored by myhloli
Browse files

refactor(magic_pdf_parse_main): optimize model data handling and JSON output

- Add orig_model_list parameter to maintain original model data
- Deep copy model_json and pipe.model_list to preserve data integrity
- Update json_md_dump function call to include orig_model_list
- Improve condition check for empty model_json
parent dd8da7bf
...@@ -19,9 +19,10 @@ def json_md_dump( ...@@ -19,9 +19,10 @@ def json_md_dump(
pdf_name, pdf_name,
content_list, content_list,
md_content, md_content,
orig_model_list,
): ):
# 写入模型结果到 model.json # 写入模型结果到 model.json
orig_model_list = copy.deepcopy(pipe.model_list)
md_writer.write( md_writer.write(
content=json.dumps(orig_model_list, ensure_ascii=False, indent=4), content=json.dumps(orig_model_list, ensure_ascii=False, indent=4),
path=f"{pdf_name}_model.json" path=f"{pdf_name}_model.json"
...@@ -87,9 +88,12 @@ def pdf_parse_main( ...@@ -87,9 +88,12 @@ def pdf_parse_main(
pdf_bytes = open(pdf_path, "rb").read() # 读取 pdf 文件的二进制数据 pdf_bytes = open(pdf_path, "rb").read() # 读取 pdf 文件的二进制数据
orig_model_list = []
if model_json_path: if model_json_path:
# 读取已经被模型解析后的pdf文件的 json 原始数据,list 类型 # 读取已经被模型解析后的pdf文件的 json 原始数据,list 类型
model_json = json.loads(open(model_json_path, "r", encoding="utf-8").read()) model_json = json.loads(open(model_json_path, "r", encoding="utf-8").read())
orig_model_list = copy.deepcopy(model_json)
else: else:
model_json = [] model_json = []
...@@ -115,8 +119,9 @@ def pdf_parse_main( ...@@ -115,8 +119,9 @@ def pdf_parse_main(
pipe.pipe_classify() pipe.pipe_classify()
# 如果没有传入模型数据,则使用内置模型解析 # 如果没有传入模型数据,则使用内置模型解析
if not model_json: if len(model_json) == 0:
pipe.pipe_analyze() # 解析 pipe.pipe_analyze() # 解析
orig_model_list = copy.deepcopy(pipe.model_list)
# 执行解析 # 执行解析
pipe.pipe_parse() pipe.pipe_parse()
...@@ -126,7 +131,7 @@ def pdf_parse_main( ...@@ -126,7 +131,7 @@ def pdf_parse_main(
md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none") md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")
if is_json_md_dump: if is_json_md_dump:
json_md_dump(pipe, md_writer, pdf_name, content_list, md_content) json_md_dump(pipe, md_writer, pdf_name, content_list, md_content, orig_model_list)
if is_draw_visualization_bbox: if is_draw_visualization_bbox:
draw_visualization_bbox(pipe.pdf_mid_data['pdf_info'], pdf_bytes, output_path, pdf_name) draw_visualization_bbox(pipe.pdf_mid_data['pdf_info'], pdf_bytes, output_path, pdf_name)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment