Commit 6be1453c authored by Sidney233

test: Update test_e2e.py

parent da048cbf
@@ -78,6 +78,9 @@ plugins:
- search
- i18n:
docs_structure: folder
fallback_to_default: true
reconfigure_material: true
reconfigure_search: true
languages:
- locale: en
default: true
@@ -43,7 +43,8 @@ test = [
"pytest",
"pytest-cov",
"coverage",
"beautifulsoup4"
"beautifulsoup4",
"fuzzywuzzy"
]
vlm = [
"transformers>=4.51.1",
@@ -150,7 +151,11 @@ omit = [
"*/cli_parser.py",
"*/run_async.py"
]
[tool.coverage.html]
directory = "htmlcov"
[tool.coverage.report]
exclude_also = [
'def __repr__',
'if self.debug:',
@@ -162,5 +167,4 @@ exclude_also = [
'if TYPE_CHECKING:',
'class .*\bProtocol\):',
'@(abc\.)?abstractmethod',
]
directory = "htmlcov"
\ No newline at end of file
]
\ No newline at end of file
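The coverage HTML settings now live in an explicit [tool.coverage.html] table instead of a stray directory key at the end of [tool.coverage.report], so the report still lands in htmlcov/. A minimal sketch of rendering it programmatically with coverage.py's API, assuming a .coverage data file already produced by pytest --cov:

import coverage

# Load the recorded .coverage data and write the HTML report into
# the directory configured above (htmlcov/).
cov = coverage.Coverage()
cov.load()
cov.html_report(directory="htmlcov")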
@@ -3,17 +3,15 @@ import copy
import json
import os
from pathlib import Path
from cryptography.hazmat.backends.openssl import backend
from loguru import logger
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from mineru.cli.common import (
convert_pdf_bytes_to_bytes_by_pypdfium2,
prepare_env,
read_fn,
)
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.enum_class import MakeMode
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
@@ -24,313 +22,272 @@ from mineru.backend.pipeline.model_json_to_middle_json import (
result_to_middle_json as pipeline_result_to_middle_json,
)
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
class TestE2E:
def test_pipeline_with_two_config(self):
def do_parse(
output_dir, # Output directory for storing parsing results
pdf_file_names: list[str], # List of PDF file names to be parsed
pdf_bytes_list: list[bytes], # List of PDF bytes to be parsed
p_lang_list: list[str], # List of languages for each PDF, default is 'ch' (Chinese)
parse_method="auto", # The method for parsing PDF, default is 'auto'
formula_enable=True, # Enable formula parsing
table_enable=True, # Enable table parsing
f_draw_layout_bbox=True, # Whether to draw layout bounding boxes
f_draw_span_bbox=True, # Whether to draw span bounding boxes
f_dump_md=True, # Whether to dump markdown files
f_dump_middle_json=True, # Whether to dump middle JSON files
f_dump_model_output=True, # Whether to dump model output files
f_dump_orig_pdf=True, # Whether to dump original PDF files
f_dump_content_list=True, # Whether to dump content list files
f_make_md_mode=MakeMode.MM_MD, # The mode for making markdown content, default is MM_MD
start_page_id=0, # Start page ID for parsing, default is 0
end_page_id=None, # End page ID for parsing, default is None (parse all pages until the end of the document)
):
for idx, pdf_bytes in enumerate(pdf_bytes_list):
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
pdf_bytes, start_page_id, end_page_id
)
pdf_bytes_list[idx] = new_pdf_bytes
(
infer_results,
all_image_lists,
all_pdf_docs,
lang_list,
ocr_enabled_list,
) = pipeline_doc_analyze(
pdf_bytes_list,
p_lang_list,
parse_method=parse_method,
formula_enable=formula_enable,
table_enable=table_enable,
)
for idx, model_list in enumerate(infer_results):
model_json = copy.deepcopy(model_list)
pdf_file_name = pdf_file_names[idx]
local_image_dir, local_md_dir = prepare_env(
output_dir, pdf_file_name, parse_method
)
image_writer, md_writer = FileBasedDataWriter(
local_image_dir
), FileBasedDataWriter(local_md_dir)
images_list = all_image_lists[idx]
pdf_doc = all_pdf_docs[idx]
_lang = lang_list[idx]
_ocr_enable = ocr_enabled_list[idx]
middle_json = pipeline_result_to_middle_json(
model_list,
images_list,
pdf_doc,
image_writer,
_lang,
_ocr_enable,
formula_enable,
)
pdf_info = middle_json["pdf_info"]
pdf_bytes = pdf_bytes_list[idx]
if f_draw_layout_bbox:
draw_layout_bbox(
pdf_info,
pdf_bytes,
local_md_dir,
f"{pdf_file_name}_layout.pdf",
)
if f_draw_span_bbox:
draw_span_bbox(
pdf_info,
pdf_bytes,
local_md_dir,
f"{pdf_file_name}_span.pdf",
)
if f_dump_orig_pdf:
md_writer.write(
f"{pdf_file_name}_origin.pdf",
pdf_bytes,
)
if f_dump_md:
image_dir = str(os.path.basename(local_image_dir))
md_content_str = pipeline_union_make(
pdf_info, f_make_md_mode, image_dir
)
md_writer.write_string(
f"{pdf_file_name}.md",
md_content_str,
)
if f_dump_content_list:
image_dir = str(os.path.basename(local_image_dir))
content_list = pipeline_union_make(
pdf_info, MakeMode.CONTENT_LIST, image_dir
)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
if f_dump_middle_json:
md_writer.write_string(
f"{pdf_file_name}_middle.json",
json.dumps(middle_json, ensure_ascii=False, indent=4),
)
if f_dump_model_output:
md_writer.write_string(
f"{pdf_file_name}_model.json",
json.dumps(model_json, ensure_ascii=False, indent=4),
)
logger.info(f"local output dir is {local_md_dir}")
def parse_doc(
path_list: list[Path],
output_dir,
lang="ch",
method="auto",
start_page_id=0,
end_page_id=None,
):
file_name_list = []
pdf_bytes_list = []
lang_list = []
for path in path_list:
file_name = str(Path(path).stem)
pdf_bytes = read_fn(path)
file_name_list.append(file_name)
pdf_bytes_list.append(pdf_bytes)
lang_list.append(lang)
# Run do_parse twice: once with formula and table parsing enabled, once with both disabled
do_parse(
output_dir=output_dir,
pdf_file_names=file_name_list,
pdf_bytes_list=pdf_bytes_list,
p_lang_list=lang_list,
parse_method=method,
start_page_id=start_page_id,
end_page_id=end_page_id,
)
do_parse(
output_dir=output_dir,
pdf_file_names=file_name_list,
pdf_bytes_list=pdf_bytes_list,
p_lang_list=lang_list,
parse_method=method,
table_enable=False,
formula_enable=False,
start_page_id=start_page_id,
end_page_id=end_page_id,
)
__dir__ = os.path.dirname(os.path.abspath(__file__))
pdf_files_dir = os.path.join(__dir__, "pdfs")
output_dir = os.path.join(__dir__, "output")
pdf_suffixes = [".pdf"]
image_suffixes = [".png", ".jpeg", ".jpg"]
doc_path_list = []
for doc_path in Path(pdf_files_dir).glob("*"):
if doc_path.suffix in pdf_suffixes + image_suffixes:
doc_path_list.append(doc_path)
# os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
parse_doc(doc_path_list, output_dir)
def test_vlm_transformers_with_default_config(self):
def do_parse(
output_dir, # Output directory for storing parsing results
pdf_file_names: list[str], # List of PDF file names to be parsed
pdf_bytes_list: list[bytes], # List of PDF bytes to be parsed
server_url=None, # Server URL for vlm-sglang-client backend
f_draw_layout_bbox=True, # Whether to draw layout bounding boxes
f_dump_md=True, # Whether to dump markdown files
f_dump_middle_json=True, # Whether to dump middle JSON files
f_dump_model_output=True, # Whether to dump model output files
f_dump_orig_pdf=True, # Whether to dump original PDF files
f_dump_content_list=True, # Whether to dump content list files
f_make_md_mode=MakeMode.MM_MD, # The mode for making markdown content, default is MM_MD
start_page_id=0, # Start page ID for parsing, default is 0
end_page_id=None, # End page ID for parsing, default is None (parse all pages until the end of the document)
):
backend = "transformers"
f_draw_span_bbox = False
parse_method = "vlm"
for idx, pdf_bytes in enumerate(pdf_bytes_list):
pdf_file_name = pdf_file_names[idx]
pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
pdf_bytes, start_page_id, end_page_id
)
local_image_dir, local_md_dir = prepare_env(
output_dir, pdf_file_name, parse_method
)
image_writer, md_writer = FileBasedDataWriter(
local_image_dir
), FileBasedDataWriter(local_md_dir)
middle_json, infer_result = vlm_doc_analyze(
pdf_bytes,
image_writer=image_writer,
backend=backend,
server_url=server_url,
)
pdf_info = middle_json["pdf_info"]
if f_draw_layout_bbox:
draw_layout_bbox(
pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf"
)
if f_draw_span_bbox:
draw_span_bbox(
pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf"
)
if f_dump_orig_pdf:
md_writer.write(
f"{pdf_file_name}_origin.pdf",
pdf_bytes,
)
if f_dump_md:
image_dir = str(os.path.basename(local_image_dir))
md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
md_content_str,
)
if f_dump_content_list:
image_dir = str(os.path.basename(local_image_dir))
content_list = vlm_union_make(
pdf_info, MakeMode.CONTENT_LIST, image_dir
)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
if f_dump_middle_json:
md_writer.write_string(
f"{pdf_file_name}_middle.json",
json.dumps(middle_json, ensure_ascii=False, indent=4),
)
if f_dump_model_output:
model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
md_writer.write_string(
f"{pdf_file_name}_model_output.txt",
model_output,
)
logger.info(f"local output dir is {local_md_dir}")
def parse_doc(
path_list: list[Path],
output_dir,
lang="ch",
server_url=None,
start_page_id=0,
end_page_id=None,
):
file_name_list = []
pdf_bytes_list = []
lang_list = []
for path in path_list:
file_name = str(Path(path).stem)
pdf_bytes = read_fn(path)
file_name_list.append(file_name)
pdf_bytes_list.append(pdf_bytes)
lang_list.append(lang)
do_parse(
output_dir=output_dir,
pdf_file_names=file_name_list,
pdf_bytes_list=pdf_bytes_list,
server_url=server_url,
start_page_id=start_page_id,
end_page_id=end_page_id,
)
__dir__ = os.path.dirname(os.path.abspath(__file__))
pdf_files_dir = os.path.join(__dir__, "pdfs")
output_dir = os.path.join(__dir__, "output")
pdf_suffixes = [".pdf"]
image_suffixes = [".png", ".jpeg", ".jpg"]
doc_path_list = []
for doc_path in Path(pdf_files_dir).glob("*"):
if doc_path.suffix in pdf_suffixes + image_suffixes:
doc_path_list.append(doc_path)
# os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
parse_doc(doc_path_list, output_dir)
def test_pipeline_with_two_config():
__dir__ = os.path.dirname(os.path.abspath(__file__))
pdf_files_dir = os.path.join(__dir__, "pdfs")
output_dir = os.path.join(__dir__, "output")
pdf_suffixes = [".pdf"]
image_suffixes = [".png", ".jpeg", ".jpg"]
doc_path_list = []
for doc_path in Path(pdf_files_dir).glob("*"):
if doc_path.suffix in pdf_suffixes + image_suffixes:
doc_path_list.append(doc_path)
os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
pdf_file_names = []
pdf_bytes_list = []
p_lang_list = []
for path in doc_path_list:
file_name = str(Path(path).stem)
pdf_bytes = read_fn(path)
pdf_file_names.append(file_name)
pdf_bytes_list.append(pdf_bytes)
p_lang_list.append("en")
for idx, pdf_bytes in enumerate(pdf_bytes_list):
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
pdf_bytes_list[idx] = new_pdf_bytes
# Get pipeline analysis results, testing both the txt and ocr parse methods
infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
pipeline_doc_analyze(
pdf_bytes_list,
p_lang_list,
parse_method="txt",
)
)
write_infer_result(
infer_results,
all_image_lists,
all_pdf_docs,
lang_list,
ocr_enabled_list,
pdf_file_names,
output_dir,
parse_method="txt",
)
assert_content("tests/unittest/output/test/txt/test_content_list.json")
infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
pipeline_doc_analyze(
pdf_bytes_list,
p_lang_list,
parse_method="ocr",
)
)
write_infer_result(
infer_results,
all_image_lists,
all_pdf_docs,
lang_list,
ocr_enabled_list,
pdf_file_names,
output_dir,
parse_method="ocr",
)
assert_content("tests/unittest/output/test/ocr/test_content_list.json")
def test_vlm_transformers_with_default_config():
__dir__ = os.path.dirname(os.path.abspath(__file__))
pdf_files_dir = os.path.join(__dir__, "pdfs")
output_dir = os.path.join(__dir__, "output")
pdf_suffixes = [".pdf"]
image_suffixes = [".png", ".jpeg", ".jpg"]
doc_path_list = []
for doc_path in Path(pdf_files_dir).glob("*"):
if doc_path.suffix in pdf_suffixes + image_suffixes:
doc_path_list.append(doc_path)
os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
pdf_file_names = []
pdf_bytes_list = []
p_lang_list = []
for path in doc_path_list:
file_name = str(Path(path).stem)
pdf_bytes = read_fn(path)
pdf_file_names.append(file_name)
pdf_bytes_list.append(pdf_bytes)
p_lang_list.append("en")
for idx, pdf_bytes in enumerate(pdf_bytes_list):
pdf_file_name = pdf_file_names[idx]
pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes)
local_image_dir, local_md_dir = prepare_env(
output_dir, pdf_file_name, parse_method="vlm"
)
image_writer, md_writer = FileBasedDataWriter(
local_image_dir
), FileBasedDataWriter(local_md_dir)
middle_json, infer_result = vlm_doc_analyze(
pdf_bytes, image_writer=image_writer, backend="transformers"
)
pdf_info = middle_json["pdf_info"]
image_dir = str(os.path.basename(local_image_dir))
md_content_str = vlm_union_make(pdf_info, MakeMode.MM_MD, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
md_content_str,
)
content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
md_writer.write_string(
f"{pdf_file_name}_middle.json",
json.dumps(middle_json, ensure_ascii=False, indent=4),
)
model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
md_writer.write_string(
f"{pdf_file_name}_model_output.txt",
model_output,
)
logger.info(f"local output dir is {local_md_dir}")
assert_content("tests/unittest/output/test/vlm/test_content_list.json")
def write_infer_result(
infer_results,
all_image_lists,
all_pdf_docs,
lang_list,
ocr_enabled_list,
pdf_file_names,
output_dir,
parse_method,
):
for idx, model_list in enumerate(infer_results):
model_json = copy.deepcopy(model_list)
pdf_file_name = pdf_file_names[idx]
local_image_dir, local_md_dir = prepare_env(
output_dir, pdf_file_name, parse_method
)
image_writer, md_writer = FileBasedDataWriter(
local_image_dir
), FileBasedDataWriter(local_md_dir)
images_list = all_image_lists[idx]
pdf_doc = all_pdf_docs[idx]
_lang = lang_list[idx]
_ocr_enable = ocr_enabled_list[idx]
middle_json = pipeline_result_to_middle_json(
model_list,
images_list,
pdf_doc,
image_writer,
_lang,
_ocr_enable,
True,
)
pdf_info = middle_json["pdf_info"]
image_dir = str(os.path.basename(local_image_dir))
# Write the markdown file
md_content_str = pipeline_union_make(pdf_info, MakeMode.MM_MD, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
md_content_str,
)
content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
md_writer.write_string(
f"{pdf_file_name}_middle.json",
json.dumps(middle_json, ensure_ascii=False, indent=4),
)
md_writer.write_string(
f"{pdf_file_name}_model.json",
json.dumps(model_json, ensure_ascii=False, indent=4),
)
logger.info(f"local output dir is {local_md_dir}")
def validate_html(html_content):
try:
BeautifulSoup(html_content, "html.parser")
return True
except Exception:
return False
def assert_content(content_path):
content_list = []
with open(content_path, "r", encoding="utf-8") as file:
content_list = json.load(file)
type_set = set()
for content_dict in content_list:
match content_dict["type"]:
# Image check: only validate the caption
case "image":
type_set.add("image")
assert (
content_dict["image_caption"][0].strip().lower()
== "Figure 1: Figure Caption".lower()
)
# Table check: validate the caption, table format, and table content
case "table":
type_set.add("table")
assert (
content_dict["table_caption"][0].strip().lower()
== "Table 1: Table Caption".lower()
)
assert validate_html(content_dict["table_body"])
target_str_list = [
"Linear Regression",
"0.98740",
"1321.2",
"2-order Polynomial",
"0.99906",
"26.4",
"3-order Polynomial",
"0.99913",
"101.2",
"4-order Polynomial",
"0.99914",
"94.1",
"Gray Prediction",
"0.00617",
"687",
]
correct_count = 0
for target_str in target_str_list:
if target_str in content_dict["table_body"]:
correct_count += 1
assert correct_count > 0.9 * len(target_str_list)
# Equation check: verify the text contains formula elements
case "equation":
type_set.add("equation")
target_str_list = ["$$", "lambda", "frac", "bar"]
for target_str in target_str_list:
assert target_str in content_dict["text"]
# Text check: text similarity must exceed 90
case "text":
type_set.add("text")
assert (
fuzz.ratio(
content_dict["text"],
"Trump graduated from the Wharton School of the University of Pennsylvania with a bachelor's degree in 1968. He became president of his father's real estate business in 1971 and renamed it The Trump Organization.",
)
> 90
)
assert len(type_set) >= 4
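For context, the content produced by union_make is what assert_content() consumes. A minimal sketch of regenerating both outputs from a dumped middle JSON; the import path for the pipeline union_make is an assumption here (it mirrors the vlm import shown above), and the file paths are hypothetical:

import json
from mineru.utils.enum_class import MakeMode
# assumed import path, by analogy with mineru.backend.vlm.vlm_middle_json_mkcontent
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make

with open("output/test/txt/test_middle.json", encoding="utf-8") as f:  # hypothetical path
    pdf_info = json.load(f)["pdf_info"]

# The same two modes the test exercises: markdown and the content list
# that assert_content() validates.
md_content_str = pipeline_union_make(pdf_info, MakeMode.MM_MD, "images")
content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, "images")
print(len(md_content_str), len(content_list))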