Commit 29d26261 authored by myhloli's avatar myhloli
Browse files

refactor: add method option for PDF parsing and improve resource management

parent a2a8b459
# Hook configuration in pre-commit's `repos` / `hooks` layout.
# NOTE(review): leading indentation appears to have been lost in this copy;
# the nesting (repo -> rev/hooks -> id/args) must be restored before the file
# can be parsed as YAML — confirm against the repository version.
repos:
# flake8: lines up to 150 columns; the listed E/W codes are ignored.
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
args: ["--max-line-length=150", "--ignore=E131,E125,W503,W504,E203"]
# isort: run with no extra arguments.
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
- id: isort
# yapf: Google-based style, 150-column limit, 4-space indent
# (the column limit matches the flake8 max line length above).
- repo: https://github.com/pre-commit/mirrors-yapf
rev: v0.32.0
hooks:
- id: yapf
args: ["--style={based_on_style: google, column_limit: 150, indent_width: 4}"]
# codespell: *.json files are skipped.
- repo: https://github.com/codespell-project/codespell
rev: v2.2.1
hooks:
- id: codespell
args: ['--skip', '*.json']
# Generic hooks from the pre-commit-hooks collection.
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: trailing-whitespace
- id: check-yaml
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: double-quote-string-fixer
- id: check-merge-conflict
# Invoked with --remove, i.e. encoding pragmas are stripped rather than added.
- id: fix-encoding-pragma
args: ["--remove"]
# Line endings are rewritten to LF.
- id: mixed-line-ending
args: ["--fix=lf"]
# mdformat: run with --number and a table width of 200, with three extra
# plugins/dependencies installed into the hook environment.
- repo: https://github.com/executablebooks/mdformat
rev: 0.7.9
hooks:
- id: mdformat
args: ["--number", "--table-width", "200"]
additional_dependencies:
- mdformat-openmmlab
- mdformat_frontmatter
- linkify-it-py
# docformatter: edits files in place and wraps descriptions at 119 columns.
- repo: https://github.com/myint/docformatter
rev: v1.3.1
hooks:
- id: docformatter
args: ["--in-place", "--wrap-descriptions", "119"]
......@@ -230,6 +230,8 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
llm_aided_title(middle_json["pdf_info"], title_aided_config)
logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}')
"""清理内存"""
pdf_doc.close()
clean_memory(get_device())
return middle_json
......
......@@ -30,6 +30,18 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
required=True,
help='output local directory',
)
@click.option(
'-m',
'--method',
'method',
type=click.Choice(['auto', 'txt', 'ocr']),
help="""the method for parsing pdf:
auto: Automatically determine the method based on the file type.
txt: Use text extraction method.
ocr: Use OCR method for image-based PDFs.
Without method specified, 'auto' will be used by default.""",
default='auto',
)
@click.option(
'-b',
'--backend',
......@@ -125,7 +137,7 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
)
def main(input_path, output_dir, backend, lang, server_url, start_page_id, end_page_id, formula_enable, table_enable, device_mode, virtual_vram, model_source):
def main(input_path, output_dir, method, backend, lang, server_url, start_page_id, end_page_id, formula_enable, table_enable, device_mode, virtual_vram, model_source):
if os.getenv('MINERU_FORMULA_ENABLE', None) is None:
os.environ['MINERU_FORMULA_ENABLE'] = str(formula_enable).lower()
......@@ -167,8 +179,17 @@ def main(input_path, output_dir, backend, lang, server_url, start_page_id, end_p
file_name_list.append(file_name)
pdf_bytes_list.append(pdf_bytes)
lang_list.append(lang)
do_parse(output_dir, file_name_list, pdf_bytes_list, lang_list, backend, server_url,
start_page_id=start_page_id, end_page_id=end_page_id)
do_parse(
output_dir=output_dir,
pdf_file_names=file_name_list,
pdf_bytes_list=pdf_bytes_list,
p_lang_list=lang_list,
backend=backend,
parse_method=method,
server_url=server_url,
start_page_id=start_page_id,
end_page_id=end_page_id
)
except Exception as e:
logger.exception(e)
......
......@@ -69,6 +69,9 @@ def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page
# 获取字节数据
output_bytes = output_buffer.getvalue()
pdf.close() # 关闭原PDF文档以释放资源
output_pdf.close() # 关闭新PDF文档以释放资源
return output_bytes
......
......@@ -59,9 +59,9 @@ def configure_model(model_dir, model_type):
@click.command()
def download_models():
"""下载MinerU模型文件。
"""Download MinerU model files.
支持从ModelScopeHuggingFace下载pipeline或VLM模型。
Supports downloading pipeline or VLM models from ModelScope or HuggingFace.
"""
# 交互式输入下载来源
source = click.prompt(
......
boto3>=1.28.43
Brotli>=1.1.0
click>=8.1.7
fast-langdetect>=0.2.3,<0.3.0
loguru>=0.6.0
numpy>=1.21.6
pydantic>=2.7.2,<2.11
PyMuPDF>=1.24.9,<1.25.0
scikit-learn>=1.0.2
torch>=2.2.2,!=2.5.0,!=2.5.1,<3
torchvision
transformers>=4.49.0,!=4.51.0,<5.0.0
pdfminer.six==20250506
tqdm>=4.67.1
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
from pathlib import Path
from setuptools import setup, find_packages
from mineru.version import __version__
if __name__ == '__main__':
    # Packaging entry point: read the README that sits next to this file for
    # the long description, then register the "mineru" distribution.
    readme_path = Path(__file__).parent / 'README.md'
    long_description = readme_path.read_text(encoding='utf-8')

    setup(
        name="mineru",  # project name
        version=__version__,  # imported from mineru.version
        license="AGPL-3.0",
        # Discovered packages plus the two resource-only packages listed explicitly.
        packages=find_packages() + ["mineru.resources"] + ["mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils.resources"],
        package_data={
            "mineru.resources": ["**"],  # every file under mineru.resources
            "mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils.resources": ["**"],  # every file under the pytorchocr resources package
        },
        # Core third-party dependencies (each name listed once).
        install_requires=[
            "boto3>=1.28.43",
            "click>=8.1.7",
            "loguru>=0.6.0",
            "numpy>=1.21.6",
            "pdfminer.six==20250506",
            "tqdm>=4.67.1",
            "requests",
            "httpx",
            "pillow",
            "pypdfium2",
            "pypdf",
            "reportlab",
        ],
        extras_require={
            "vlm": [
                "transformers>=4.51.1",
                "torch>=2.6.0",
                # The trailing comma matters: without it Python concatenates this
                # literal with the next one into a single invalid requirement.
                "accelerate>=1.5.1",
                "pydantic>=2.7.2,<2.11",
            ],
            "sglang": [
                "sglang[all]==0.4.6.post5",
            ],
            "pipeline": [
                "matplotlib>=3.10,<4",
                "ultralytics>=8.3.48,<9",  # yolov8, formula detection
                "doclayout_yolo==0.0.4",  # doclayout_yolo
                "dill>=0.3.8,<1",  # doclayout_yolo
                "rapid_table>=1.0.5,<2.0.0",  # rapid_table
                "PyYAML>=6.0.2,<7",  # yaml
                "ftfy>=6.3.1,<7",  # unimernet_hf
                "openai>=1.70.0,<2",  # openai SDK
                "shapely>=2.0.7,<3",  # imgaug-paddleocr2pytorch
                "pyclipper>=1.3.0,<2",  # paddleocr2pytorch
                "omegaconf>=2.3.0,<3",  # paddleocr2pytorch
                "torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
                "torchvision",
                "transformers>=4.49.0,!=4.51.0,<5.0.0",
                "fast-langdetect>=0.2.3,<0.3.0",
            ],
            "pipeline_old_linux": [
                "matplotlib>=3.10,<=3.10.1",
                "ultralytics>=8.3.48,<=8.3.104",  # yolov8, formula detection
                "doclayout_yolo==0.0.4",  # doclayout_yolo
                "dill==0.3.8",  # doclayout_yolo
                "PyYAML==6.0.2",  # yaml
                "ftfy==6.3.1",  # unimernet_hf
                "openai==1.71.0",  # openai SDK
                "shapely==2.1.0",  # imgaug-paddleocr2pytorch
                "pyclipper==1.3.0.post6",  # paddleocr2pytorch
                "omegaconf==2.3.0",  # paddleocr2pytorch
                "albumentations==1.4.20",  # simsimd, introduced in 1.4.21, does not support Linux systems from 2019 or earlier
                "rapid_table==1.0.3",  # newer rapid_table depends on an onnxruntime that does not support Linux systems from 2019 or earlier
                "torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
                "torchvision",
                "transformers>=4.49.0,!=4.51.0,<5.0.0",
                "fast-langdetect>=0.2.3,<0.3.0",
            ],
        },
        description="A practical tool for converting PDF to Markdown",  # short description
        long_description=long_description,  # full description, taken from README.md
        long_description_content_type="text/markdown",  # the README is Markdown
        project_urls={
            "Home": "https://mineru.net/",
            "Repository": "https://github.com/opendatalab/MinerU",
        },
        # One list entry per keyword instead of a single comma-joined string.
        keywords=["magic-pdf", "mineru", "MinerU", "convert", "pdf", "markdown"],
        classifiers=[
            "Programming Language :: Python :: 3.10",
            "Programming Language :: Python :: 3.11",
            "Programming Language :: Python :: 3.12",
            "Programming Language :: Python :: 3.13",
        ],
        python_requires=">=3.10,<3.14",  # supported Python versions
        # Executable commands provided by the project.
        entry_points={
            "console_scripts": [
                # "package.module:function" form, consistent with the entries below;
                # resolves main in mineru.cli.client without relying on the
                # mineru.cli package exposing `client` as an attribute.
                "mineru = mineru.cli.client:main",  # `mineru` command
                "mineru-sglang-server = mineru.cli.vlm_sglang_server:main",  # sglang server entry point
                "mineru-models-download = mineru.cli.models_download:download_models",  # model download entry point
            ],
        },
        include_package_data=True,  # ship non-code files (data, configuration, ...)
        zip_safe=False,  # do not install as a zipped package
    )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment