Commit 51393aa8 authored by myhloli
Browse files

refactor: update union_make import and adjust middle JSON structure for consistency

parent 6b1df419
......@@ -117,7 +117,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr=False):
middle_json = {"pdf_info": [], "_backend":"vlm", "_version_name": __version__}
middle_json = {"pdf_info": [], "_backend":"pipeline", "_version_name": __version__}
for page_index, page_model_info in enumerate(model_list):
page = pdf_doc[page_index]
image_dict = images_list[page_index]
......
......@@ -5,8 +5,6 @@ import torch
from .model_init import MineruPipelineModel
from .config_reader import get_local_models_dir, get_device, get_formula_config, get_table_recog_config
from .model_json_to_middle_json import result_to_middle_json
from ...data.data_reader_writer import DataWriter
from ...utils.pdf_classify import classify
from ...utils.pdf_image_tools import load_images_from_pdf
......
......@@ -2,13 +2,14 @@
import io
import json
import os
import copy
from pathlib import Path
import pypdfium2 as pdfium
from loguru import logger
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.api.vlm_middle_json_mkcontent import union_make
from mineru.api.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.data.data_reader_writer import FileBasedDataWriter
......@@ -98,8 +99,8 @@ def do_parse(
infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(pdf_bytes_list, p_lang_list, parse_method=parse_method, formula_enable=p_formula_enable,table_enable=p_table_enable)
for idx, model_list in enumerate(infer_results):
model_json = copy.deepcopy(model_list)
pdf_file_name = pdf_file_names[idx]
model_json = infer_results[idx]
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
......@@ -124,21 +125,21 @@ def do_parse(
pdf_bytes,
)
if f_dump_md:
image_dir = str(os.path.basename(local_image_dir))
md_content_str = union_make(pdf_info, f_make_md_mode, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
md_content_str,
)
if f_dump_content_list:
image_dir = str(os.path.basename(local_image_dir))
content_list = union_make(pdf_info, MakeMode.STANDARD_FORMAT, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
# if f_dump_md:
# image_dir = str(os.path.basename(local_image_dir))
# md_content_str = union_make(pdf_info, f_make_md_mode, image_dir)
# md_writer.write_string(
# f"{pdf_file_name}.md",
# md_content_str,
# )
# if f_dump_content_list:
# image_dir = str(os.path.basename(local_image_dir))
# content_list = union_make(pdf_info, MakeMode.STANDARD_FORMAT, image_dir)
# md_writer.write_string(
# f"{pdf_file_name}_content_list.json",
# json.dumps(content_list, ensure_ascii=False, indent=4),
# )
if f_dump_middle_json:
md_writer.write_string(
......@@ -179,7 +180,7 @@ def do_parse(
if f_dump_md:
image_dir = str(os.path.basename(local_image_dir))
md_content_str = union_make(pdf_info, f_make_md_mode, image_dir)
md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
md_content_str,
......@@ -187,7 +188,7 @@ def do_parse(
if f_dump_content_list:
image_dir = str(os.path.basename(local_image_dir))
content_list = union_make(pdf_info, MakeMode.STANDARD_FORMAT, image_dir)
content_list = vlm_union_make(pdf_info, MakeMode.STANDARD_FORMAT, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment