Commit 91defbb0 authored by myhloli
Browse files

feat: enhance PDF parsing functionality with new backend options and improved output handling

parent ae9fd9ad
import os
from pathlib import Path
from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.tools.common import batch_do_parse
def batch(pdf_dir, output_dir, method, lang):
    """Parse every PDF found directly under *pdf_dir* and write results to *output_dir*.

    Args:
        pdf_dir: Directory scanned (non-recursively) for ``*.pdf`` files.
        output_dir: Directory for parsing results; created if missing.
        method: Parsing method forwarded to ``batch_do_parse`` (e.g. 'auto', 'txt', 'ocr').
        lang: Language hint forwarded to dataset building; empty string means unspecified.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Collect only .pdf files; other files in the directory are ignored.
    doc_paths = [p for p in Path(pdf_dir).glob('*') if p.suffix == '.pdf']

    # Build datasets with 4 parallel workers.
    datasets = batch_build_dataset(doc_paths, 4, lang)
    # os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "200"  # every 200 pages will be parsed in one batch

    # Output names are the PDF file stems (filename without extension).
    batch_do_parse(output_dir, [str(p.stem) for p in doc_paths], datasets, method)
if __name__ == '__main__':
    # Parse all PDFs under ./pdfs into ./output; 'auto' method, no language hint.
    batch("pdfs", "output", "auto", "")
# Copyright (c) Opendatalab. All rights reserved.
import copy
import json
import os
from pathlib import Path

from loguru import logger

from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.enum_class import MakeMode
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
def do_parse(
    output_dir,  # Output directory for storing parsing results
    pdf_file_names: list[str],  # List of PDF file names to be parsed
    pdf_bytes_list: list[bytes],  # List of PDF bytes to be parsed
    p_lang_list: list[str],  # List of languages for each PDF, default is 'ch' (Chinese)
    backend="pipeline",  # The backend for parsing PDF, default is 'pipeline'
    parse_method="auto",  # The method for parsing PDF, default is 'auto'
    p_formula_enable=True,  # Enable formula parsing
    p_table_enable=True,  # Enable table parsing
    server_url=None,  # Server URL for vlm-sglang-client backend
    f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
    f_draw_span_bbox=True,  # Whether to draw span bounding boxes
    f_dump_md=True,  # Whether to dump markdown files
    f_dump_middle_json=True,  # Whether to dump middle JSON files
    f_dump_model_output=True,  # Whether to dump model output files
    f_dump_orig_pdf=True,  # Whether to dump original PDF files
    f_dump_content_list=True,  # Whether to dump content list files
    f_make_md_mode=MakeMode.MM_MD,  # The mode for making markdown content, default is MM_MD
    start_page_id=0,  # Start page ID for parsing, default is 0
    end_page_id=None,  # End page ID for parsing, default is None (parse all pages until the end of the document)
):
    """Parse a batch of PDFs and dump the requested artifacts per document.

    Two code paths exist: the 'pipeline' backend analyzes all documents in one
    batched call, while every other backend (the 'vlm-*' family) analyzes
    documents one at a time.  Output files are written under
    ``output_dir/<pdf_file_name>/<parse_method>/`` via ``prepare_env``.
    NOTE(review): ``pdf_bytes_list`` is mutated in place on the pipeline path
    (pages are clipped to [start_page_id, end_page_id]).
    """
    if backend == "pipeline":
        # Clip each PDF to the requested page range before batch analysis.
        for idx, pdf_bytes in enumerate(pdf_bytes_list):
            new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
            pdf_bytes_list[idx] = new_pdf_bytes

        # Single batched inference over all documents.
        infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(
            pdf_bytes_list, p_lang_list, parse_method=parse_method,
            formula_enable=p_formula_enable, table_enable=p_table_enable)

        for idx, model_list in enumerate(infer_results):
            # Deep-copy the raw model output before it is consumed downstream,
            # so the dumped *_model.json reflects the original inference result.
            model_json = copy.deepcopy(model_list)
            pdf_file_name = pdf_file_names[idx]
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

            images_list = all_image_lists[idx]
            pdf_doc = all_pdf_docs[idx]
            _lang = lang_list[idx]
            _ocr_enable = ocr_enabled_list[idx]
            middle_json = pipeline_result_to_middle_json(
                model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, p_formula_enable)

            pdf_info = middle_json["pdf_info"]

            pdf_bytes = pdf_bytes_list[idx]
            if f_draw_layout_bbox:
                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")

            if f_draw_span_bbox:
                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")

            if f_dump_orig_pdf:
                md_writer.write(
                    f"{pdf_file_name}_origin.pdf",
                    pdf_bytes,
                )

            if f_dump_md:
                # Markdown references images relative to the output dir.
                image_dir = str(os.path.basename(local_image_dir))
                md_content_str = pipeline_union_make(pdf_info, f_make_md_mode, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}.md",
                    md_content_str,
                )

            if f_dump_content_list:
                image_dir = str(os.path.basename(local_image_dir))
                content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}_content_list.json",
                    json.dumps(content_list, ensure_ascii=False, indent=4),
                )

            if f_dump_middle_json:
                md_writer.write_string(
                    f"{pdf_file_name}_middle.json",
                    json.dumps(middle_json, ensure_ascii=False, indent=4),
                )

            if f_dump_model_output:
                md_writer.write_string(
                    f"{pdf_file_name}_model.json",
                    json.dumps(model_json, ensure_ascii=False, indent=4),
                )

            logger.info(f"local output dir is {local_md_dir}")
    else:
        # vlm-* backends: strip the prefix for the analyzer API; span drawing
        # is unsupported on this path, and outputs go under a 'vlm' subdir.
        if backend.startswith("vlm-"):
            backend = backend[4:]

        f_draw_span_bbox = False
        parse_method = "vlm"

        for idx, pdf_bytes in enumerate(pdf_bytes_list):
            pdf_file_name = pdf_file_names[idx]
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
            model_path = auto_download_and_get_model_root_path('/', 'vlm')
            middle_json, infer_result = vlm_doc_analyze(
                pdf_bytes, image_writer=image_writer, backend=backend,
                model_path=model_path, server_url=server_url)

            pdf_info = middle_json["pdf_info"]

            if f_draw_layout_bbox:
                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")

            if f_draw_span_bbox:
                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")

            if f_dump_orig_pdf:
                md_writer.write(
                    f"{pdf_file_name}_origin.pdf",
                    pdf_bytes,
                )

            if f_dump_md:
                image_dir = str(os.path.basename(local_image_dir))
                md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}.md",
                    md_content_str,
                )

            if f_dump_content_list:
                image_dir = str(os.path.basename(local_image_dir))
                content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}_content_list.json",
                    json.dumps(content_list, ensure_ascii=False, indent=4),
                )

            if f_dump_middle_json:
                md_writer.write_string(
                    f"{pdf_file_name}_middle.json",
                    json.dumps(middle_json, ensure_ascii=False, indent=4),
                )

            if f_dump_model_output:
                # VLM model output is a list of raw strings; join with a divider.
                model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
                md_writer.write_string(
                    f"{pdf_file_name}_model_output.txt",
                    model_output,
                )

            logger.info(f"local output dir is {local_md_dir}")
def parse_doc(
    path_list: list[Path],
    output_dir,
    lang="ch",
    backend="pipeline",
    method="auto",
    server_url=None,
    start_page_id=0,  # Start page ID for parsing, default is 0
    end_page_id=None  # End page ID for parsing, default is None (parse all pages until the end of the document)
):
    """
    Parameter description:
    path_list: List of document paths to be parsed, can be PDF or image files.
    output_dir: Output directory for storing parsing results.
    lang: Language option, default is 'ch', optional values include ['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka'].
        Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
        Adapted only for the case where the backend is set to "pipeline".
    backend: the backend for parsing pdf:
        pipeline: More general.
        vlm-transformers: More general.
        vlm-sglang-engine: Faster(engine).
        vlm-sglang-client: Faster(client).
        Without method specified, pipeline will be used by default.
    method: the method for parsing pdf:
        auto: Automatically determine the method based on the file type.
        txt: Use text extraction method.
        ocr: Use OCR method for image-based PDFs.
        Without method specified, 'auto' will be used by default.
        Adapted only for the case where the backend is set to "pipeline".
    server_url: When the backend is `vlm-sglang-client`, you need to specify the server_url, for example: `http://127.0.0.1:30000`
    """
    try:
        file_name_list = []
        pdf_bytes_list = []
        lang_list = []
        # Read every document up front; the same language hint applies to all.
        for path in path_list:
            file_name = str(Path(path).stem)
            pdf_bytes = read_fn(path)
            file_name_list.append(file_name)
            pdf_bytes_list.append(pdf_bytes)
            lang_list.append(lang)
        do_parse(
            output_dir=output_dir,
            pdf_file_names=file_name_list,
            pdf_bytes_list=pdf_bytes_list,
            p_lang_list=lang_list,
            backend=backend,
            parse_method=method,
            server_url=server_url,
            start_page_id=start_page_id,
            end_page_id=end_page_id
        )
    except Exception as e:
        # Best-effort demo entry point: log the full traceback instead of crashing.
        logger.exception(e)
if __name__ == '__main__':
    # args
    __dir__ = os.path.dirname(os.path.abspath(__file__))
    pdf_files_dir = os.path.join(__dir__, "pdfs")
    output_dir = os.path.join(__dir__, "output")
    pdf_suffixes = [".pdf"]
    image_suffixes = [".png", ".jpeg", ".jpg"]

    # Collect every supported document (PDFs and images) directly under ./pdfs.
    doc_path_list = []
    for doc_path in Path(pdf_files_dir).glob('*'):
        if doc_path.suffix in pdf_suffixes + image_suffixes:
            doc_path_list.append(doc_path)

    # Parse with defaults: 'ch' language, 'pipeline' backend, 'auto' method.
    parse_doc(doc_path_list, output_dir)
...@@ -41,7 +41,8 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
            auto: Automatically determine the method based on the file type.
            txt: Use text extraction method.
            ocr: Use OCR method for image-based PDFs.
            Without method specified, 'auto' will be used by default.
            Adapted only for the case where the backend is set to "pipeline".""",
    default='auto',
)
@click.option(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment