Commit 6be1453c authored by Sidney233

test: Update test_e2e.py

parent da048cbf
@@ -78,6 +78,9 @@ plugins:
   - search
   - i18n:
       docs_structure: folder
+      fallback_to_default: true
+      reconfigure_material: true
+      reconfigure_search: true
       languages:
         - locale: en
           default: true
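The three new options match the documented settings of the mkdocs-static-i18n plugin: `fallback_to_default` serves the default-language page when a translation is missing, while `reconfigure_material` and `reconfigure_search` let the plugin adapt the Material theme and the search plugin to the active locale. A minimal sketch of how the resulting config could be sanity-checked, assuming PyYAML is available; this check is illustrative and not part of the commit:

```python
# Hypothetical check, not part of the commit: parse the new i18n options
# with PyYAML and confirm they are set as the hunk above intends.
import yaml

fragment = """
plugins:
  - search
  - i18n:
      docs_structure: folder
      fallback_to_default: true
      reconfigure_material: true
      reconfigure_search: true
      languages:
        - locale: en
          default: true
"""

i18n = yaml.safe_load(fragment)["plugins"][1]["i18n"]
assert i18n["fallback_to_default"] is True
assert i18n["reconfigure_material"] is True
assert i18n["reconfigure_search"] is True
```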
@@ -43,7 +43,8 @@ test = [
     "pytest",
     "pytest-cov",
     "coverage",
-    "beautifulsoup4"
+    "beautifulsoup4",
+    "fuzzywuzzy"
 ]
 vlm = [
     "transformers>=4.51.1",
@@ -150,7 +151,11 @@ omit = [
     "*/cli_parser.py",
     "*/run_async.py"
 ]
 [tool.coverage.html]
+directory = "htmlcov"
+
+[tool.coverage.report]
 exclude_also = [
     'def __repr__',
     'if self.debug:',
@@ -162,5 +167,4 @@ exclude_also = [
     'if TYPE_CHECKING:',
     'class .*\bProtocol\):',
     '@(abc\.)?abstractmethod',
-]
-directory = "htmlcov"
\ No newline at end of file
+]
\ No newline at end of file
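Besides adding fuzzywuzzy to the test extras, the coverage change is a structural fix as much as an addition: previously `exclude_also` sat directly under `[tool.coverage.html]` and `directory = "htmlcov"` dangled at the end of the file, so each option lived in the wrong TOML table. With the `[tool.coverage.report]` header added and `directory` moved under `[tool.coverage.html]`, both land in the table coverage.py reads them from. A minimal sketch, assuming Python 3.11+ for the stdlib `tomllib`; the check is illustrative and not part of the commit:

```python
# Hypothetical check, not part of the commit: parse a shortened version of
# the corrected fragment and confirm each option sits in the right table.
import tomllib

fragment = """
[tool.coverage.html]
directory = "htmlcov"

[tool.coverage.report]
exclude_also = [
    'def __repr__',
    'if TYPE_CHECKING:',
]
"""

config = tomllib.loads(fragment)["tool"]["coverage"]
assert config["html"]["directory"] == "htmlcov"
assert "exclude_also" in config["report"]
```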
@@ -3,17 +3,15 @@ import copy
 import json
 import os
 from pathlib import Path
-from cryptography.hazmat.backends.openssl import backend
 from loguru import logger
+from bs4 import BeautifulSoup
+from fuzzywuzzy import fuzz
 from mineru.cli.common import (
     convert_pdf_bytes_to_bytes_by_pypdfium2,
     prepare_env,
     read_fn,
 )
 from mineru.data.data_reader_writer import FileBasedDataWriter
-from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
 from mineru.utils.enum_class import MakeMode
 from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
 from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
@@ -24,313 +22,272 @@ from mineru.backend.pipeline.model_json_to_middle_json import (
     result_to_middle_json as pipeline_result_to_middle_json,
 )
 from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
-from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
-
-
-class TestE2E:
-    def test_pipeline_with_two_config(self):
-        def do_parse(
-            output_dir,  # Output directory for storing parsing results
-            pdf_file_names: list[str],  # List of PDF file names to be parsed
-            pdf_bytes_list: list[bytes],  # List of PDF bytes to be parsed
-            p_lang_list: list[str],  # List of languages for each PDF, default is 'ch' (Chinese)
-            parse_method="auto",  # The method for parsing PDF, default is 'auto'
-            formula_enable=True,  # Enable formula parsing
-            table_enable=True,  # Enable table parsing
-            f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
-            f_draw_span_bbox=True,  # Whether to draw span bounding boxes
-            f_dump_md=True,  # Whether to dump markdown files
-            f_dump_middle_json=True,  # Whether to dump middle JSON files
-            f_dump_model_output=True,  # Whether to dump model output files
-            f_dump_orig_pdf=True,  # Whether to dump original PDF files
-            f_dump_content_list=True,  # Whether to dump content list files
-            f_make_md_mode=MakeMode.MM_MD,  # The mode for making markdown content, default is MM_MD
-            start_page_id=0,  # Start page ID for parsing, default is 0
-            end_page_id=None,  # End page ID for parsing, default is None (parse all pages until the end of the document)
-        ):
-            for idx, pdf_bytes in enumerate(pdf_bytes_list):
-                new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
-                    pdf_bytes, start_page_id, end_page_id
-                )
-                pdf_bytes_list[idx] = new_pdf_bytes
-            (
-                infer_results,
-                all_image_lists,
-                all_pdf_docs,
-                lang_list,
-                ocr_enabled_list,
-            ) = pipeline_doc_analyze(
-                pdf_bytes_list,
-                p_lang_list,
-                parse_method=parse_method,
-                formula_enable=formula_enable,
-                table_enable=table_enable,
-            )
-            for idx, model_list in enumerate(infer_results):
-                model_json = copy.deepcopy(model_list)
-                pdf_file_name = pdf_file_names[idx]
-                local_image_dir, local_md_dir = prepare_env(
-                    output_dir, pdf_file_name, parse_method
-                )
-                image_writer, md_writer = FileBasedDataWriter(
-                    local_image_dir
-                ), FileBasedDataWriter(local_md_dir)
-                images_list = all_image_lists[idx]
-                pdf_doc = all_pdf_docs[idx]
-                _lang = lang_list[idx]
-                _ocr_enable = ocr_enabled_list[idx]
-                middle_json = pipeline_result_to_middle_json(
-                    model_list,
-                    images_list,
-                    pdf_doc,
-                    image_writer,
-                    _lang,
-                    _ocr_enable,
-                    formula_enable,
-                )
-                pdf_info = middle_json["pdf_info"]
-                pdf_bytes = pdf_bytes_list[idx]
-                if f_draw_layout_bbox:
-                    draw_layout_bbox(
-                        pdf_info,
-                        pdf_bytes,
-                        local_md_dir,
-                        f"{pdf_file_name}_layout.pdf",
-                    )
-                if f_draw_span_bbox:
-                    draw_span_bbox(
-                        pdf_info,
-                        pdf_bytes,
-                        local_md_dir,
-                        f"{pdf_file_name}_span.pdf",
-                    )
-                if f_dump_orig_pdf:
-                    md_writer.write(
-                        f"{pdf_file_name}_origin.pdf",
-                        pdf_bytes,
-                    )
-                if f_dump_md:
-                    image_dir = str(os.path.basename(local_image_dir))
-                    md_content_str = pipeline_union_make(
-                        pdf_info, f_make_md_mode, image_dir
-                    )
-                    md_writer.write_string(
-                        f"{pdf_file_name}.md",
-                        md_content_str,
-                    )
-                if f_dump_content_list:
-                    image_dir = str(os.path.basename(local_image_dir))
-                    content_list = pipeline_union_make(
-                        pdf_info, MakeMode.CONTENT_LIST, image_dir
-                    )
-                    md_writer.write_string(
-                        f"{pdf_file_name}_content_list.json",
-                        json.dumps(content_list, ensure_ascii=False, indent=4),
-                    )
-                if f_dump_middle_json:
-                    md_writer.write_string(
-                        f"{pdf_file_name}_middle.json",
-                        json.dumps(middle_json, ensure_ascii=False, indent=4),
-                    )
-                if f_dump_model_output:
-                    md_writer.write_string(
-                        f"{pdf_file_name}_model.json",
-                        json.dumps(model_json, ensure_ascii=False, indent=4),
-                    )
-                logger.info(f"local output dir is {local_md_dir}")
-
-        def parse_doc(
-            path_list: list[Path],
-            output_dir,
-            lang="ch",
-            method="auto",
-            start_page_id=0,
-            end_page_id=None,
-        ):
-            file_name_list = []
-            pdf_bytes_list = []
-            lang_list = []
-            for path in path_list:
-                file_name = str(Path(path).stem)
-                pdf_bytes = read_fn(path)
-                file_name_list.append(file_name)
-                pdf_bytes_list.append(pdf_bytes)
-                lang_list.append(lang)
-            # Run do_parse twice: once with formula and table parsing enabled, once with both disabled
-            do_parse(
-                output_dir=output_dir,
-                pdf_file_names=file_name_list,
-                pdf_bytes_list=pdf_bytes_list,
-                p_lang_list=lang_list,
-                parse_method=method,
-                start_page_id=start_page_id,
-                end_page_id=end_page_id,
-            )
-            do_parse(
-                output_dir=output_dir,
-                pdf_file_names=file_name_list,
-                pdf_bytes_list=pdf_bytes_list,
-                p_lang_list=lang_list,
-                parse_method=method,
-                table_enable=False,
-                formula_enable=False,
-                start_page_id=start_page_id,
-                end_page_id=end_page_id,
-            )
-
-        __dir__ = os.path.dirname(os.path.abspath(__file__))
-        pdf_files_dir = os.path.join(__dir__, "pdfs")
-        output_dir = os.path.join(__dir__, "output")
-        pdf_suffixes = [".pdf"]
-        image_suffixes = [".png", ".jpeg", ".jpg"]
-        doc_path_list = []
-        for doc_path in Path(pdf_files_dir).glob("*"):
-            if doc_path.suffix in pdf_suffixes + image_suffixes:
-                doc_path_list.append(doc_path)
-        # os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
-        parse_doc(doc_path_list, output_dir)
-
-    def test_vlm_transformers_with_default_config(self):
-        def do_parse(
-            output_dir,  # Output directory for storing parsing results
-            pdf_file_names: list[str],  # List of PDF file names to be parsed
-            pdf_bytes_list: list[bytes],  # List of PDF bytes to be parsed
-            server_url=None,  # Server URL for vlm-sglang-client backend
-            f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
-            f_dump_md=True,  # Whether to dump markdown files
-            f_dump_middle_json=True,  # Whether to dump middle JSON files
-            f_dump_model_output=True,  # Whether to dump model output files
-            f_dump_orig_pdf=True,  # Whether to dump original PDF files
-            f_dump_content_list=True,  # Whether to dump content list files
-            f_make_md_mode=MakeMode.MM_MD,  # The mode for making markdown content, default is MM_MD
-            start_page_id=0,  # Start page ID for parsing, default is 0
-            end_page_id=None,  # End page ID for parsing, default is None (parse all pages until the end of the document)
-        ):
-            backend = "transformers"
-            f_draw_span_bbox = False
-            parse_method = "vlm"
-            for idx, pdf_bytes in enumerate(pdf_bytes_list):
-                pdf_file_name = pdf_file_names[idx]
-                pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
-                    pdf_bytes, start_page_id, end_page_id
-                )
-                local_image_dir, local_md_dir = prepare_env(
-                    output_dir, pdf_file_name, parse_method
-                )
-                image_writer, md_writer = FileBasedDataWriter(
-                    local_image_dir
-                ), FileBasedDataWriter(local_md_dir)
-                middle_json, infer_result = vlm_doc_analyze(
-                    pdf_bytes,
-                    image_writer=image_writer,
-                    backend=backend,
-                    server_url=server_url,
-                )
-                pdf_info = middle_json["pdf_info"]
-                if f_draw_layout_bbox:
-                    draw_layout_bbox(
-                        pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf"
-                    )
-                if f_draw_span_bbox:
-                    draw_span_bbox(
-                        pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf"
-                    )
-                if f_dump_orig_pdf:
-                    md_writer.write(
-                        f"{pdf_file_name}_origin.pdf",
-                        pdf_bytes,
-                    )
-                if f_dump_md:
-                    image_dir = str(os.path.basename(local_image_dir))
-                    md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
-                    md_writer.write_string(
-                        f"{pdf_file_name}.md",
-                        md_content_str,
-                    )
-                if f_dump_content_list:
-                    image_dir = str(os.path.basename(local_image_dir))
-                    content_list = vlm_union_make(
-                        pdf_info, MakeMode.CONTENT_LIST, image_dir
-                    )
-                    md_writer.write_string(
-                        f"{pdf_file_name}_content_list.json",
-                        json.dumps(content_list, ensure_ascii=False, indent=4),
-                    )
-                if f_dump_middle_json:
-                    md_writer.write_string(
-                        f"{pdf_file_name}_middle.json",
-                        json.dumps(middle_json, ensure_ascii=False, indent=4),
-                    )
-                if f_dump_model_output:
-                    model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
-                    md_writer.write_string(
-                        f"{pdf_file_name}_model_output.txt",
-                        model_output,
-                    )
-                logger.info(f"local output dir is {local_md_dir}")
-
-        def parse_doc(
-            path_list: list[Path],
-            output_dir,
-            lang="ch",
-            server_url=None,
-            start_page_id=0,
-            end_page_id=None,
-        ):
-            file_name_list = []
-            pdf_bytes_list = []
-            lang_list = []
-            for path in path_list:
-                file_name = str(Path(path).stem)
-                pdf_bytes = read_fn(path)
-                file_name_list.append(file_name)
-                pdf_bytes_list.append(pdf_bytes)
-                lang_list.append(lang)
-            do_parse(
-                output_dir=output_dir,
-                pdf_file_names=file_name_list,
-                pdf_bytes_list=pdf_bytes_list,
-                server_url=server_url,
-                start_page_id=start_page_id,
-                end_page_id=end_page_id,
-            )
-
-        __dir__ = os.path.dirname(os.path.abspath(__file__))
-        pdf_files_dir = os.path.join(__dir__, "pdfs")
-        output_dir = os.path.join(__dir__, "output")
-        pdf_suffixes = [".pdf"]
-        image_suffixes = [".png", ".jpeg", ".jpg"]
-        doc_path_list = []
-        for doc_path in Path(pdf_files_dir).glob("*"):
-            if doc_path.suffix in pdf_suffixes + image_suffixes:
-                doc_path_list.append(doc_path)
-        # os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
-        parse_doc(doc_path_list, output_dir)
+
+
+def test_pipeline_with_two_config():
+    __dir__ = os.path.dirname(os.path.abspath(__file__))
+    pdf_files_dir = os.path.join(__dir__, "pdfs")
+    output_dir = os.path.join(__dir__, "output")
+    pdf_suffixes = [".pdf"]
+    image_suffixes = [".png", ".jpeg", ".jpg"]
+
+    doc_path_list = []
+    for doc_path in Path(pdf_files_dir).glob("*"):
+        if doc_path.suffix in pdf_suffixes + image_suffixes:
+            doc_path_list.append(doc_path)
+
+    os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
+
+    pdf_file_names = []
+    pdf_bytes_list = []
+    p_lang_list = []
+    for path in doc_path_list:
+        file_name = str(Path(path).stem)
+        pdf_bytes = read_fn(path)
+        pdf_file_names.append(file_name)
+        pdf_bytes_list.append(pdf_bytes)
+        p_lang_list.append("en")
+
+    for idx, pdf_bytes in enumerate(pdf_bytes_list):
+        new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
+        pdf_bytes_list[idx] = new_pdf_bytes
+
+    # Get the pipeline analysis results, testing both the txt and ocr parse methods
+    infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
+        pipeline_doc_analyze(
+            pdf_bytes_list,
+            p_lang_list,
+            parse_method="txt",
+        )
+    )
+    write_infer_result(
+        infer_results,
+        all_image_lists,
+        all_pdf_docs,
+        lang_list,
+        ocr_enabled_list,
+        pdf_file_names,
+        output_dir,
+        parse_method="txt",
+    )
+    assert_content("tests/unittest/output/test/txt/test_content_list.json")
+    infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
+        pipeline_doc_analyze(
+            pdf_bytes_list,
+            p_lang_list,
+            parse_method="ocr",
+        )
+    )
+    write_infer_result(
+        infer_results,
+        all_image_lists,
+        all_pdf_docs,
+        lang_list,
+        ocr_enabled_list,
+        pdf_file_names,
+        output_dir,
+        parse_method="ocr",
+    )
+    assert_content("tests/unittest/output/test/ocr/test_content_list.json")
+
+
+def test_vlm_transformers_with_default_config():
+    __dir__ = os.path.dirname(os.path.abspath(__file__))
+    pdf_files_dir = os.path.join(__dir__, "pdfs")
+    output_dir = os.path.join(__dir__, "output")
+    pdf_suffixes = [".pdf"]
+    image_suffixes = [".png", ".jpeg", ".jpg"]
+
+    doc_path_list = []
+    for doc_path in Path(pdf_files_dir).glob("*"):
+        if doc_path.suffix in pdf_suffixes + image_suffixes:
+            doc_path_list.append(doc_path)
+
+    os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
+
+    pdf_file_names = []
+    pdf_bytes_list = []
+    p_lang_list = []
+    for path in doc_path_list:
+        file_name = str(Path(path).stem)
+        pdf_bytes = read_fn(path)
+        pdf_file_names.append(file_name)
+        pdf_bytes_list.append(pdf_bytes)
+        p_lang_list.append("en")
+
+    for idx, pdf_bytes in enumerate(pdf_bytes_list):
+        pdf_file_name = pdf_file_names[idx]
+        pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
+        local_image_dir, local_md_dir = prepare_env(
+            output_dir, pdf_file_name, parse_method="vlm"
+        )
+        image_writer, md_writer = FileBasedDataWriter(
+            local_image_dir
+        ), FileBasedDataWriter(local_md_dir)
+        middle_json, infer_result = vlm_doc_analyze(
+            pdf_bytes, image_writer=image_writer, backend="transformers"
+        )
+        pdf_info = middle_json["pdf_info"]
+        image_dir = str(os.path.basename(local_image_dir))
+        md_content_str = vlm_union_make(pdf_info, MakeMode.MM_MD, image_dir)
+        md_writer.write_string(
+            f"{pdf_file_name}.md",
+            md_content_str,
+        )
+        content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
+        md_writer.write_string(
+            f"{pdf_file_name}_content_list.json",
+            json.dumps(content_list, ensure_ascii=False, indent=4),
+        )
+        md_writer.write_string(
+            f"{pdf_file_name}_middle.json",
+            json.dumps(middle_json, ensure_ascii=False, indent=4),
+        )
+        model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
+        md_writer.write_string(
+            f"{pdf_file_name}_model_output.txt",
+            model_output,
+        )
+        logger.info(f"local output dir is {local_md_dir}")
+    assert_content("tests/unittest/output/test/vlm/test_content_list.json")
+
+
+def write_infer_result(
+    infer_results,
+    all_image_lists,
+    all_pdf_docs,
+    lang_list,
+    ocr_enabled_list,
+    pdf_file_names,
+    output_dir,
+    parse_method,
+):
+    for idx, model_list in enumerate(infer_results):
+        model_json = copy.deepcopy(model_list)
+        pdf_file_name = pdf_file_names[idx]
+        local_image_dir, local_md_dir = prepare_env(
+            output_dir, pdf_file_name, parse_method
+        )
+        image_writer, md_writer = FileBasedDataWriter(
+            local_image_dir
+        ), FileBasedDataWriter(local_md_dir)
+        images_list = all_image_lists[idx]
+        pdf_doc = all_pdf_docs[idx]
+        _lang = lang_list[idx]
+        _ocr_enable = ocr_enabled_list[idx]
+        middle_json = pipeline_result_to_middle_json(
+            model_list,
+            images_list,
+            pdf_doc,
+            image_writer,
+            _lang,
+            _ocr_enable,
+            True,
+        )
+        pdf_info = middle_json["pdf_info"]
+        image_dir = str(os.path.basename(local_image_dir))
+        # Write the markdown file
+        md_content_str = pipeline_union_make(pdf_info, MakeMode.MM_MD, image_dir)
+        md_writer.write_string(
+            f"{pdf_file_name}.md",
+            md_content_str,
+        )
+        content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
+        md_writer.write_string(
+            f"{pdf_file_name}_content_list.json",
+            json.dumps(content_list, ensure_ascii=False, indent=4),
+        )
+        md_writer.write_string(
+            f"{pdf_file_name}_middle.json",
+            json.dumps(middle_json, ensure_ascii=False, indent=4),
+        )
+        md_writer.write_string(
+            f"{pdf_file_name}_model.json",
+            json.dumps(model_json, ensure_ascii=False, indent=4),
+        )
+        logger.info(f"local output dir is {local_md_dir}")
+
+
+def validate_html(html_content):
+    try:
+        soup = BeautifulSoup(html_content, "html.parser")
+        return True
+    except Exception as e:
+        return False
+
+
+def assert_content(content_path):
+    content_list = []
+    with open(content_path, "r", encoding="utf-8") as file:
+        content_list = json.load(file)
+    type_set = set()
+    for content_dict in content_list:
+        match content_dict["type"]:
+            # Image check: only validate the caption
+            case "image":
+                type_set.add("image")
+                assert (
+                    content_dict["image_caption"][0].strip().lower()
+                    == "Figure 1: Figure Caption".lower()
+                )
+            # Table check: validate the caption, table format, and table content
+            case "table":
+                type_set.add("table")
+                assert (
+                    content_dict["table_caption"][0].strip().lower()
+                    == "Table 1: Table Caption".lower()
+                )
+                assert validate_html(content_dict["table_body"])
+                target_str_list = [
+                    "Linear Regression",
+                    "0.98740",
+                    "1321.2",
+                    "2-order Polynomial",
+                    "0.99906",
+                    "26.4",
+                    "3-order Polynomial",
+                    "0.99913",
+                    "101.2",
+                    "4-order Polynomial",
+                    "0.99914",
+                    "94.1",
+                    "Gray Prediction",
+                    "0.00617",
+                    "687",
+                ]
+                correct_count = 0
+                for target_str in target_str_list:
+                    if target_str in content_dict["table_body"]:
+                        correct_count += 1
+                assert correct_count > 0.9 * len(target_str_list)
+            # Equation check: verify that formula elements are present
+            case "equation":
+                type_set.add("equation")
+                target_str_list = ["$$", "lambda", "frac", "bar"]
+                for target_str in target_str_list:
+                    assert target_str in content_dict["text"]
+            # Text check: text similarity must exceed 90
+            case "text":
+                type_set.add("text")
+                assert (
+                    fuzz.ratio(
+                        content_dict["text"],
+                        "Trump graduated from the Wharton School of the University of Pennsylvania with a bachelor's degree in 1968. He became president of his father's real estate business in 1971 and renamed it The Trump Organization.",
+                    )
+                    > 90
+                )
+    assert len(type_set) >= 4
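The rewritten `assert_content` checks the text block with a fuzzy match rather than string equality, so OCR noise does not make the test brittle: `fuzz.ratio` from fuzzywuzzy returns an integer similarity score from 0 to 100 based on edit distance, and the test requires a score above 90. A minimal sketch of that tolerance; the sentences below are illustrative, not from the test fixture:

```python
# Hypothetical example, not part of the commit: fuzz.ratio tolerates small
# OCR slips but rejects genuinely different text at the > 90 threshold.
from fuzzywuzzy import fuzz

expected = "He became president of his father's real estate business in 1971."
ocr_noisy = "He becarne president of his father's real estate business in 1971."

assert fuzz.ratio(ocr_noisy, expected) > 90   # one garbled word: still passes
assert fuzz.ratio("A completely different sentence.", expected) < 90
```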