Unverified Commit 0bdbdffc authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #3076 from opendatalab/release-2.1.1

Release 2.1.1
parents cad4c585 4f88955d
......@@ -206,37 +206,49 @@ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0
def remove_overlaps_min_blocks(res_list):
    """Merge overlapping blocks instead of simply dropping the smaller one.

    For every pair of blocks whose overlap ratio reaches 0.8, the smaller
    block is absorbed into the larger one: the larger block's bbox is
    expanded to the union of both, and the smaller block is removed.

    Args:
        res_list: list of dicts, each carrying a 'bbox' [x1, y1, x2, y2].

    Returns:
        tuple: (res_list with absorbed blocks removed and large blocks
        expanded in place, list of the removed blocks).
    """
    need_remove = []
    for i in range(len(res_list)):
        # Skip entries already marked for removal.
        if res_list[i] in need_remove:
            continue
        for j in range(i + 1, len(res_list)):
            # Skip comparison targets already marked for removal.
            if res_list[j] in need_remove:
                continue
            overlap_box = get_minbox_if_overlap_by_ratio(
                res_list[i]['bbox'], res_list[j]['bbox'], 0.8
            )
            if overlap_box is not None:
                res_to_remove = None
                large_res = None
                # Identify the smaller block (the one to remove).
                if overlap_box == res_list[i]['bbox']:
                    res_to_remove = res_list[i]
                    large_res = res_list[j]
                elif overlap_box == res_list[j]['bbox']:
                    res_to_remove = res_list[j]
                    large_res = res_list[i]
                if res_to_remove is not None and res_to_remove not in need_remove:
                    # Expand the larger block to the union of both bboxes.
                    x1, y1, x2, y2 = large_res['bbox']
                    sx1, sy1, sx2, sy2 = res_to_remove['bbox']
                    x1 = min(x1, sx1)
                    y1 = min(y1, sy1)
                    x2 = max(x2, sx2)
                    y2 = max(y2, sy2)
                    large_res['bbox'] = [x1, y1, x2, y2]
                    need_remove.append(res_to_remove)
    # Drop the marked elements from the list.
    for res in need_remove:
        res_list.remove(res)
    return res_list, need_remove
......
site_name: MinerU
site_url: https://opendatalab.github.io/MinerU
repo_name: opendatalab/MinerU
repo_url: https://github.com/opendatalab/MinerU
theme:
icon:
repo: fontawesome/brands/github
name: material
palette:
# Palette toggle for automatic mode
- media: "(prefers-color-scheme)"
scheme: default
primary: black
toggle:
icon: material/brightness-auto
name: Switch to light mode
# Palette toggle for light mode
- media: "(prefers-color-scheme: light)"
scheme: default
primary: black
toggle:
icon: material/brightness-7
name: Switch to dark mode
# Palette toggle for dark mode
- media: "(prefers-color-scheme: dark)"
scheme: slate
primary: black
toggle:
icon: material/brightness-4
name: Switch to system preference
logo: images/logo.png
favicon: images/logo.png
features:
- content.tabs.link
- content.code.annotate
- content.code.copy
- navigation.expand
- navigation.footer
- navigation.tabs
- navigation.sections
- navigation.path
- navigation.indexes
- navigation.top
- navigation.tracking
- search.suggest
- toc.follow
- toc.integrate
extra:
social:
- icon: fontawesome/brands/github
link: https://github.com/opendatalab/MinerU
name: GitHub
- icon: fontawesome/brands/x-twitter
link: https://x.com/OpenDataLab_AI
name: X-Twitter
- icon: fontawesome/brands/discord
link: https://discord.gg/Tdedn9GTXq
name: Discord
- icon: fontawesome/brands/weixin
link: http://mineru.space/s/V85Yl
name: WeChat
- icon: material/email
link: mailto:OpenDataLab@pjlab.org.cn
name: E-mail
copyright: © 2024 - 2025 MinerU. All Rights Reserved.
nav:
- Home:
- "MinerU": index.md
- Quick Start:
- Quick Start: quick_start/index.md
- Extension Modules: quick_start/extension_modules.md
- Docker Deployment: quick_start/docker_deployment.md
- Usage:
- Usage: usage/index.md
- Quick Usage: usage/quick_usage.md
- Model Source: usage/model_source.md
- CLI Tools: usage/cli_tools.md
- Advanced CLI Parameters: usage/advanced_cli_parameters.md
- Reference:
- Output File Format: reference/output_files.md
- FAQ:
- FAQ: faq/index.md
- Demo:
- Demo: demo/index.md
plugins:
- search
- i18n:
docs_structure: folder
fallback_to_default: true
reconfigure_material: true
reconfigure_search: true
languages:
- locale: en
default: true
name: English
build: true
- locale: zh
name: 中文
build: true
nav_translations:
Home: 主页
Quick Start: 快速开始
Extension Modules: 扩展模块安装
Docker Deployment: Docker部署
Usage: 使用方法
Quick Usage: 快速使用
CLI Tools: 命令行工具
Model Source: 模型源
Advanced CLI Parameters: 命令行进阶参数
FAQ: 常见问题解答
Reference: 参考资料
Output File Format: 输出文件格式
Demo: 在线演示
- mkdocs-video
markdown_extensions:
- admonition
- pymdownx.details
- attr_list
- gfm_admonition
- pymdownx.highlight:
use_pygments: true
- pymdownx.superfences
- pymdownx.tasklist:
custom_checkbox: true
\ No newline at end of file
......@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "mineru"
dynamic = ["version"]
license = {text = "AGPL-3.0"}
license = { text = "AGPL-3.0" }
description = "A practical tool for converting PDF to Markdown"
readme = "README.md"
requires-python = ">=3.10,<3.14"
......@@ -33,9 +33,19 @@ dependencies = [
"modelscope>=1.26.0",
"huggingface-hub>=0.32.4",
"json-repair>=0.46.2",
"opencv-python>=4.11.0.86",
"fast-langdetect>=0.2.3,<0.3.0",
]
[project.optional-dependencies]
test = [
"mineru[core]",
"pytest",
"pytest-cov",
"coverage",
"beautifulsoup4",
"fuzzywuzzy"
]
vlm = [
"transformers>=4.51.1",
"torch>=2.6.0",
......@@ -60,7 +70,6 @@ pipeline = [
"torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
"torchvision",
"transformers>=4.49.0,!=4.51.0,<5.0.0",
"fast-langdetect>=0.2.3,<0.3.0",
]
api = [
"fastapi",
......@@ -97,7 +106,6 @@ pipeline_old_linux = [
"torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
"torchvision",
"transformers>=4.49.0,!=4.51.0,<5.0.0",
"fast-langdetect>=0.2.3,<0.3.0",
]
[project.urls]
......@@ -112,7 +120,7 @@ mineru-api = "mineru.cli.fast_api:main"
mineru-gradio = "mineru.cli.gradio_app:main"
[tool.setuptools.dynamic]
version = {attr = "mineru.version.__version__"}
version = { attr = "mineru.version.__version__" }
[tool.setuptools.packages.find]
include = ["mineru*"]
......@@ -125,3 +133,38 @@ namespaces = false
[tool.setuptools]
include-package-data = true
zip-safe = false
[tool.pytest.ini_options]
addopts = "-s --cov=mineru --cov-report html"
[tool.coverage.run]
command_line = "-m pytest tests/unittest/test_e2e.py"
source = ["mineru/"]
omit = [
"*/vlm_sglang_model/*",
"*/gradio_app.py",
"*/models_download.py",
"*/fast_api.py",
"*/cli/client.py",
"*/sglang_engine_predictor.py",
"*/vlm_sglang_server.py",
"*/cli_parser.py",
"*/run_async.py"
]
[tool.coverage.html]
directory = "htmlcov"
[tool.coverage.report]
exclude_also = [
'def __repr__',
'if self.debug:',
'if settings.DEBUG',
'raise AssertionError',
'raise NotImplementedError',
'if 0:',
'if __name__ == .__main__.:',
'if TYPE_CHECKING:',
'class .*\bProtocol\):',
'@(abc\.)?abstractmethod',
]
\ No newline at end of file
pytest
Levenshtein
nltk
rapidfuzz
statistics
openxlab #安装opendatalab
pandas
numpy
matplotlib
seaborn
scipy
scikit-learn
tqdm
htmltabletomd
pypandoc
pyopenssl==24.0.0
struct-eqtable==0.1.0
pytest-cov
beautifulsoup4
coverage
\ No newline at end of file
#!/bin/bash
# Install dependencies and download models, retrying on failure up to
# max_retries times before giving up.
max_retries=5
retry_count=0
while true; do
    # prepare env
    #python -m pip install -r requirements-qa.txt
    #python -m pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
    pip install -e .
    python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
    pip install modelscope
    # Fetch the model-download helper from the master branch.
    wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
    python download_models.py
    # NOTE(review): $? reflects only the last command (download_models.py);
    # failures of the earlier pip/wget steps do not trigger a retry by
    # themselves — confirm this is intended.
    exit_code=$?
    if [ $exit_code -eq 0 ]; then
        echo "test.sh 成功执行!"
        break
    else
        let retry_count+=1
        if [ $retry_count -ge $max_retries ]; then
            echo "达到最大重试次数 ($max_retries),放弃重试。"
            exit 1
        fi
        echo "test.sh 执行失败 (退出码: $exit_code)。尝试第 $retry_count 次重试..."
        sleep 5
    fi
done
import os

# Test-suite configuration.
# NOTE(review): the "+" below raises TypeError when GITHUB_WORKSPACE is not
# set (environ.get returns None) — assumes CI always defines it; confirm.
conf = {
    "code_path": os.environ.get('GITHUB_WORKSPACE'),
    "pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
    #"code_path": "/home/quyuan/ci/actions-runner/MinerU",
    #"pdf_dev_path": "/home/quyuan/ci/actions-runner/MinerU/tests/test_cli/pdf_dev",
    "pdf_res_path": "/tmp/magic-pdf",
    "jsonl_path": "s3://llm-qatest-pnorm/mineru/test/line1.jsonl",
    "s3_pdf_path": "s3://llm-qatest-pnorm/mineru/test/test_rearch_report.pdf"
}
import pytest
import torch
def clear_gpu_memory():
    """Release memory held by PyTorch's CUDA caching allocator."""
    torch.cuda.empty_cache()
    message = "GPU memory cleared."
    print(message)
"""
calculate_score
"""
import os
import re
import json
from Levenshtein import distance
from lib import scoring
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
class Scoring:
    """Accumulate similarity metrics between annotated and extracted markdown.

    Tracks normalized Levenshtein edit distance, smoothed BLEU, and a
    marker-style similarity score per file and overall, writing per-class
    results and averages to a report file.
    """

    def __init__(self, result_path):
        """Reset all accumulators and open *result_path* for the report.

        Args:
            result_path: path of the report file (opened in "w+" mode).
        """
        self.edit_distances = []
        self.bleu_scores = []
        self.sim_scores = []
        self.filenames = []
        self.score_dict = {}
        self.anntion_cnt = 0
        # NOTE(review): this handle stays open for the object's lifetime and
        # is never closed explicitly; results rely on interpreter-exit flush.
        self.fw = open(result_path, "w+", encoding='utf-8')

    def simple_bleu_score(self, candidate, reference):
        """Return a smoothed sentence-level BLEU of candidate vs reference."""
        candidate_tokens = word_tokenize(candidate)
        reference_tokens = word_tokenize(reference)
        return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1)

    def preprocess_string(self, s):
        """Collapse runs of newlines in *s* before scoring."""
        sub_enter = re.sub(r'\n+', '\n', s)
        # NOTE(review): this substitution replaces a space with a space and is
        # a no-op — the original pattern may have been a non-breaking space
        # lost during re-encoding; verify against upstream.
        return re.sub(r' ', ' ', sub_enter)

    def calculate_similarity(self, annotion, actual, tool_type):
        """Score every annotated .md file against its counterpart in *actual*.

        Appends per-file metrics to the instance accumulators, writes a JSON
        line plus per-class averages to the report, prints a summary, and
        returns the cumulative per-file score dict.
        """
        class_dict = {}
        edit_distances = []
        bleu_scores = []
        sim_scores = list()
        total_file = 0
        for filename in os.listdir(annotion):
            if filename.endswith('.md') and not filename.startswith('.'):
                total_file = total_file + 1
                with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a:
                    content_a = file_a.read()
                self.anntion_cnt = self.anntion_cnt + 1
                filepath_b = os.path.join(actual, filename)
                if os.path.exists(filepath_b):
                    with open(filepath_b, 'r', encoding='utf-8') as file_b:
                        content_b = file_b.read()
                    self.filenames.append(filename)
                    # Edit distance normalized by the longer document length.
                    edit_dist = distance(self.preprocess_string(content_b), self.preprocess_string(content_a)) / max(len(content_a), len(content_b))
                    self.edit_distances.append(edit_dist)
                    edit_distances.append(edit_dist)
                    bleu_score = self.simple_bleu_score(content_b, content_a)
                    bleu_scores.append(bleu_score)
                    self.bleu_scores.append(bleu_score)
                    score = scoring.score_text(content_b, content_a)
                    sim_scores.append(score)
                    self.sim_scores.append(score)
                    class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
                    self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
                else:
                    # Bug fix: the original f-string contained no placeholder,
                    # so the missing filename was never reported.
                    print(f"File {filename} not found in actual directory.")
        class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
        class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
        class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
        self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
        # Guard against an empty annotation directory (division by zero).
        ratio = len(class_dict) / total_file if total_file else 0
        self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
        self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
        self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
        self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
        print(f"{tool_type} extract ratio: {ratio}")
        print(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
        print(f"{tool_type} Average BLEU Score: {class_average_bleu_score}")
        print(f"{tool_type} Average Sim Score: {class_average_sim_score}")
        return self.score_dict

    def summary_scores(self):
        """Write and return overall averages across every scored file."""
        over_all_dict = dict()
        average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0
        average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0
        average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0
        over_all_dict["average_edit_distance"] = average_edit_distance
        over_all_dict["average_bleu_score"] = average_bleu_score
        over_all_dict["average_sim_score"] = average_sim_score
        self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
        return over_all_dict

    def calculate_similarity_total(self, tool_type, download_dir):
        """Score the tool's cleaned output against the cleaned annotations."""
        annotion = os.path.join(download_dir, "annotations", "cleaned")
        actual = os.path.join(download_dir, tool_type, "cleaned")
        score = self.calculate_similarity(annotion, actual, tool_type)
        return score
"""common definitions."""
import os
import shutil
import re
import json
import torch
def clear_gpu_memory():
    """Return cached CUDA memory to the driver and log the action."""
    torch.cuda.empty_cache()
    note = "GPU memory cleared."
    print(note)
def check_shell(cmd):
    """Run *cmd* through the system shell and assert it exited with 0."""
    status = os.system(cmd)
    assert status == 0
def update_config_file(file_path, key, value):
    """Set *key* to *value* in the JSON config at *file_path* (read-modify-write)."""
    with open(file_path, 'r', encoding="utf-8") as handle:
        data = json.load(handle)
    data[key] = value
    # Persist the modified configuration with readable formatting.
    with open(file_path, 'w', encoding='utf-8') as handle:
        json.dump(data, handle, ensure_ascii=False, indent=4)
def cli_count_folders_and_check_contents(file_path):
    """Assert every entry of the CLI output dir is non-empty and that the
    dir holds more than 5 entries."""
    if os.path.exists(file_path):
        for files in os.listdir(file_path):
            # Despite the name, this is the size in bytes of one entry.
            folder_count = os.path.getsize(os.path.join(file_path, files))
            assert folder_count > 0
        # NOTE(review): indentation reconstructed — assumed to run only when
        # the path exists (os.listdir would raise otherwise); confirm.
        assert len(os.listdir(file_path)) > 5
def sdk_count_folders_and_check_contents(file_path):
    """Assert *file_path* exists and is non-empty; exit(1) when missing."""
    if not os.path.exists(file_path):
        # Missing SDK output is a hard failure for the test run.
        exit(1)
    assert os.path.getsize(file_path) > 0
def delete_file(path):
    """Delete *path* — a single file or a whole directory tree — if it exists.

    Deletion errors are printed, not raised, so cleanup is best-effort.
    """
    # Bug fix: the original guard was inverted ("if not os.path.exists"),
    # so the function never deleted anything.
    if not os.path.exists(path):
        return
    if os.path.isfile(path):
        try:
            os.remove(path)
            print(f"File '{path}' deleted.")
        # Bug fix: os.remove raises OSError (e.g. PermissionError), never
        # TypeError as the original caught.
        except OSError as e:
            print(f"Error deleting file '{path}': {e}")
    elif os.path.isdir(path):
        try:
            shutil.rmtree(path)
            print(f"Directory '{path}' and its contents deleted.")
        except OSError as e:
            print(f"Error deleting directory '{path}': {e}")
def check_latex_table_exists(file_path):
    """Return True when the file holds at least one LaTeX tabular block."""
    table_re = re.compile(r'\\begin\{tabular\}.*?\\end\{tabular\}', re.DOTALL)
    with open(file_path, 'r', encoding='utf-8') as fh:
        text = fh.read()
    return table_re.search(text) is not None
def check_html_table_exists(file_path):
    """Return True when the file holds at least one HTML <table> block."""
    table_re = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
    with open(file_path, 'r', encoding='utf-8') as fh:
        text = fh.read()
    return table_re.search(text) is not None
def check_close_tables(file_path):
    """Return True when the file contains neither LaTeX nor HTML tables."""
    latex_pattern = r'\\begin\{tabular\}.*?\\end\{tabular\}'
    html_pattern = r'<table.*?>.*?</table>'
    with open(file_path, 'r', encoding='utf-8') as fh:
        content = fh.read()
    found_latex = re.search(latex_pattern, content, re.DOTALL)
    found_html = re.search(html_pattern, content, re.DOTALL)
    return found_latex is None and found_html is None
"""
clean data
"""
import argparse
import os
import re
import htmltabletomd # type: ignore
import pypandoc
import argparse
# CLI: the tool name selects which extractor's output to clean; the
# download dir is where raw and cleaned files live.
parser = argparse.ArgumentParser(description="get tool type")
parser.add_argument(
    "--tool_name",
    type=str,
    required=True,
    help="input tool name",
)
parser.add_argument(
    "--download_dir",
    type=str,
    required=True,
    help="input download dir",
)
# NOTE(review): arguments are parsed at import time, so importing this
# module without the flags exits the process.
args = parser.parse_args()
def clean_markdown_images(content):
    """Strip every markdown image tag ``![alt](url)`` from *content*."""
    image_re = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)
    return image_re.sub('', content)
def clean_ocrmath_photo(content):
    """Remove LaTeX ``\\includegraphics[...]{...}`` commands from *content*."""
    graphics_re = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)
    return graphics_re.sub('', content)
def convert_html_table_to_md(html_table):
    """Convert a simple line-per-cell HTML table into a markdown table.

    Header cells (<th>) become markdown rows immediately; data cells (<td>)
    are emitted only once the header section has ended. When the table
    starts with a <tr> line, rows before the first <th> are treated as
    being inside the header section.
    """
    lines = html_table.strip().split('\n')
    md_table = ''
    # Bug fix: in_thead was assigned only when the first line contained
    # <tr>, so any other input with a <td> line raised NameError.
    in_thead = False
    if lines and '<tr>' in lines[0]:
        in_thead = True
    for line in lines:
        if '<th>' in line:
            cells = re.findall(r'<th>(.*?)</th>', line)
            md_table += '| ' + ' | '.join(cells) + ' |\n'
            in_thead = False
        elif '<td>' in line and not in_thead:
            cells = re.findall(r'<td>(.*?)</td>', line)
            md_table += '| ' + ' | '.join(cells) + ' |\n'
    md_table = md_table.rstrip() + '\n'
    return md_table
def convert_latext_to_md(content):
    """Convert LaTeX tabular environments inside *content* to markdown.

    Each tabular block is swapped for a placeholder, converted via pandoc
    through the intermediate file ``output.md``, then substituted back.
    On any conversion failure the raw LaTeX is kept.
    """
    tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)
    placeholders = []
    for table in tables:
        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
        # NOTE(review): 'cl' is inserted before \end{tabular}; since that
        # string differs from the original text, the replace() below looks
        # like a no-op unless the source already contains 'cl' there —
        # verify intent against upstream.
        replace_str = f"\\begin{{tabular}}{table}cl\\end{{tabular}}"
        content = content.replace(replace_str, placeholder)
        try:
            pypandoc.convert_text(replace_str, format="latex", to="md", outputfile="output.md", encoding="utf-8")
        except:
            # Best-effort: keep the raw LaTeX on any pandoc error.
            markdown_string = replace_str
        else:
            markdown_string = open('output.md', 'r', encoding='utf-8').read()
        placeholders.append((placeholder, markdown_string))
    new_content = content
    for placeholder, md_table in placeholders:
        new_content = new_content.replace(placeholder, md_table)
    # Write to file (handled by the caller).
    return new_content
def convert_htmltale_to_md(content):
    """Convert ``<table>...</table>`` blocks in *content* to markdown tables.

    Uses htmltabletomd via a placeholder swap; on conversion failure the
    raw inner HTML is kept.
    """
    tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)
    placeholders = []
    for table in tables:
        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
        content = content.replace(f"<table>{table}</table>", placeholder)
        try:
            convert_table = htmltabletomd.convert_table(table)
        except:
            # Best-effort: keep the original table markup on error.
            convert_table = table
        placeholders.append((placeholder,convert_table))
    new_content = content
    for placeholder, md_table in placeholders:
        new_content = new_content.replace(placeholder, md_table)
    # Write to file (handled by the caller).
    return new_content
def clean_data(prod_type, download_dir):
    """Clean every markdown file of *prod_type* under *download_dir*.

    Image-stripped copies named ``cleaned_<name>`` are written into the
    ``cleaned`` subdirectory, which is created when missing.
    """
    source_dir = os.path.join(download_dir, prod_type)
    tgt_dir = os.path.join(source_dir, "cleaned")
    if not os.path.exists(tgt_dir):
        os.makedirs(tgt_dir)
    for filename in os.listdir(source_dir):
        if not filename.endswith('.md'):
            continue
        with open(os.path.join(source_dir, filename), 'r', encoding='utf-8') as reader:
            cleaned = clean_markdown_images(reader.read())
        with open(os.path.join(tgt_dir, "cleaned_" + filename), 'w', encoding='utf-8') as writer:
            writer.write(cleaned)
if __name__ == '__main__':
    # Entry point: clean the selected tool's markdown output using the
    # CLI arguments parsed at module import.
    tool_type = args.tool_name
    download_dir = args.download_dir
    clean_data(tool_type, download_dir)
"""
Calculate simscore, refer to (https://github.com/VikParuchuri/marker?tab=readme-ov-file)
"""
import math
from rapidfuzz import fuzz
import re
import regex
from statistics import mean
# Chunks shorter than this (or blank after stripping) carry too little
# signal to score.
CHUNK_MIN_CHARS = 25

def chunk_text(text, chunk_len=500):
    """Split *text* into fixed-size chunks, keeping only substantive ones."""
    pieces = [text[start:start + chunk_len] for start in range(0, len(text), chunk_len)]
    return [piece for piece in pieces if piece.strip() and len(piece) > CHUNK_MIN_CHARS]
def overlap_score(hypothesis_chunks, reference_chunks):
    """Best fuzzy-match score (0-1) for each hypothesis chunk.

    Each hypothesis chunk is compared against a window of reference chunks
    centered on its position-scaled index, so moderate drift between the
    two documents is tolerated.
    """
    if reference_chunks:
        length_modifier = len(hypothesis_chunks) / len(reference_chunks)
    else:
        length_modifier = 0
    search_distance = max(len(reference_chunks) // 5, 10)
    chunk_scores = []
    for idx, hyp_chunk in enumerate(hypothesis_chunks):
        best = 0
        total_len = 0
        center = int(idx * length_modifier)
        window = range(max(0, center - search_distance), min(len(reference_chunks), center + search_distance))
        for ref_idx in window:
            ref_chunk = reference_chunks[ref_idx]
            candidate = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
            if candidate > best:
                best = candidate
                total_len = len(ref_chunk)
        chunk_scores.append(best)
    return chunk_scores
def score_text(hypothesis, reference):
    """Return a 0-1 alignment score between *hypothesis* and *reference*."""
    hyp_chunks = chunk_text(hypothesis)
    ref_chunks = chunk_text(reference)
    scores = overlap_score(hyp_chunks, ref_chunks)
    if not scores:
        return 0
    return mean(scores)
{
"bucket_info":{
"bucket-name-1":["ak", "sk", "endpoint"],
"bucket-name-2":["ak", "sk", "endpoint"]
},
"temp-output-dir":"/tmp",
"models-dir":"/tmp/models",
"device-mode":"cpu"
}
\ No newline at end of file
This diff is collapsed.
{"average_sim_score":0.6505598645664856, "average_edit_distance":0.2514908429188901, "average_bleu_score": 0.5808819533975296}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment