refactor: add method option for PDF parsing and improve resource management

29d26261 · myhloli · a2a8b459 · a2a8b459 · 29d26261 · 29d26261
Commit 29d26261 authored Jun 10, 2025 by myhloli
7 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
-repos:
-  - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
-    hooks:
-      - id: flake8
-        args: ["--max-line-length=150", "--ignore=E131,E125,W503,W504,E203"]
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.11.5
-    hooks:
-      - id: isort
-  - repo: https://github.com/pre-commit/mirrors-yapf
-    rev: v0.32.0
-    hooks:
-      - id: yapf
-        args: ["--style={based_on_style: google, column_limit: 150, indent_width: 4}"]
-  - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.1
-    hooks:
-      - id: codespell
-        args: ['--skip', '*.json']
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
-    hooks:
-      - id: trailing-whitespace
-      - id: check-yaml
-      - id: end-of-file-fixer
-      - id: requirements-txt-fixer
-      - id: double-quote-string-fixer
-      - id: check-merge-conflict
-      - id: fix-encoding-pragma
-        args: ["--remove"]
-      - id: mixed-line-ending
-        args: ["--fix=lf"]
-  - repo: https://github.com/executablebooks/mdformat
-    rev: 0.7.9
-    hooks:
-      - id: mdformat
-        args: ["--number", "--table-width", "200"]
-        additional_dependencies:
-          - mdformat-openmmlab
-          - mdformat_frontmatter
-          - linkify-it-py
-  - repo: https://github.com/myint/docformatter
-    rev: v1.3.1
-    hooks:
-      - id: docformatter
-        args: ["--in-place", "--wrap-descriptions", "119"]
--- a/mineru/backend/pipeline/model_json_to_middle_json.py
+++ b/mineru/backend/pipeline/model_json_to_middle_json.py
@@ -230,6 +230,8 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
                llm_aided_title(middle_json["pdf_info"], title_aided_config)
                logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}')

+    """清理内存"""
+    pdf_doc.close()
    clean_memory(get_device())

    return middle_json

--- a/mineru/cli/client.py
+++ b/mineru/cli/client.py
@@ -30,6 +30,18 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
    required=True,
    help='output local directory',
 )
+@click.option(
+    '-m',
+    '--method',
+    'method',
+    type=click.Choice(['auto', 'txt', 'ocr']),
+    help="""the method for parsing pdf:
+    auto: Automatically determine the method based on the file type.
+    txt: Use text extraction method.
+    ocr: Use OCR method for image-based PDFs.
+    Without method specified, 'auto' will be used by default.""",
+    default='auto',
+)
 @click.option(
    '-b',
    '--backend',
@@ -125,7 +137,7 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
 )


-def main(input_path, output_dir, backend, lang, server_url, start_page_id, end_page_id, formula_enable, table_enable, device_mode, virtual_vram, model_source):
+def main(input_path, output_dir, method, backend, lang, server_url, start_page_id, end_page_id, formula_enable, table_enable, device_mode, virtual_vram, model_source):

    if os.getenv('MINERU_FORMULA_ENABLE', None) is None:
        os.environ['MINERU_FORMULA_ENABLE'] = str(formula_enable).lower()
@@ -167,8 +179,17 @@ def main(input_path, output_dir, backend, lang, server_url, start_page_id, end_p
                file_name_list.append(file_name)
                pdf_bytes_list.append(pdf_bytes)
                lang_list.append(lang)
-            do_parse(output_dir, file_name_list, pdf_bytes_list, lang_list, backend, server_url,
-                         start_page_id=start_page_id, end_page_id=end_page_id)
+            do_parse(
+                output_dir=output_dir,
+                pdf_file_names=file_name_list,
+                pdf_bytes_list=pdf_bytes_list,
+                p_lang_list=lang_list,
+                backend=backend,
+                parse_method=method,
+                server_url=server_url,
+                start_page_id=start_page_id,
+                end_page_id=end_page_id
+            )
        except Exception as e:
            logger.exception(e)


--- a/mineru/cli/common.py
+++ b/mineru/cli/common.py
@@ -69,6 +69,9 @@ def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page
    # 获取字节数据
    output_bytes = output_buffer.getvalue()

+    pdf.close()  # Close the source PDF document to release its resources
+    output_pdf.close()  # Close the newly built PDF document to release its resources
+
    return output_bytes



--- a/mineru/cli/models_download.py
+++ b/mineru/cli/models_download.py
@@ -59,9 +59,9 @@ def configure_model(model_dir, model_type):

 @click.command()
 def download_models():
-    """下载MinerU模型文件。
+    """Download MinerU model files.

-    支持从ModelScope或HuggingFace下载pipeline或VLM模型。
+    Supports downloading pipeline or VLM models from ModelScope or HuggingFace.
    """
    # 交互式输入下载来源
    source = click.prompt(

--- a/requirements.txt
+++ b/requirements.txt
-boto3>=1.28.43
-Brotli>=1.1.0
-click>=8.1.7
-fast-langdetect>=0.2.3,<0.3.0
-loguru>=0.6.0
-numpy>=1.21.6
-pydantic>=2.7.2,<2.11
-PyMuPDF>=1.24.9,<1.25.0
-scikit-learn>=1.0.2
-torch>=2.2.2,!=2.5.0,!=2.5.1,<3
-torchvision
-transformers>=4.49.0,!=4.51.0,<5.0.0
-pdfminer.six==20250506
-tqdm>=4.67.1
-# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
--- a/setup.py_back
+++ b/setup.py_back
-from pathlib import Path
-from setuptools import setup, find_packages
-from mineru.version import __version__
-
-
-if __name__ == '__main__':
-    with Path(Path(__file__).parent,
-              'README.md').open(encoding='utf-8') as file:
-        long_description = file.read()
-    setup(
-        name="mineru",  # 项目名
-        version=__version__,  # 自动从tag中获取版本号
-        license="AGPL-3.0",
-        packages=find_packages() + ["mineru.resources"] + ["mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils.resources"],  # 包含所有的包
-        package_data={
-            "mineru.resources": ["**"],  # 包含magic_pdf.resources目录下的所有文件
-            "mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils.resources": ["**"],  # pytorchocr.resources目录下的所有文件
-        },
-        install_requires=[
-                    "boto3>=1.28.43",
-                    "click>=8.1.7",
-                    "loguru>=0.6.0",
-                    "numpy>=1.21.6",
-                    "pdfminer.six==20250506",
-                    "tqdm>=4.67.1",
-                    "requests",
-                    "httpx",
-                    "pillow",
-                    "pypdfium2",
-                    "loguru",
-                    "pypdf",
-                    "reportlab",
-        ],  # 项目依赖的第三方库
-        extras_require={
-            "vlm":[
-                "transformers>=4.51.1",
-                "torch>=2.6.0",
-                "accelerate>=1.5.1"
-                "pydantic>=2.7.2,<2.11",
-            ],
-            "sglang": [
-                "sglang[all]==0.4.6.post5",
-            ],
-            "pipeline": [
-                     "matplotlib>=3.10,<4",
-                     "ultralytics>=8.3.48,<9",  # yolov8,公式检测
-                     "doclayout_yolo==0.0.4",  # doclayout_yolo
-                     "dill>=0.3.8,<1",  # doclayout_yolo
-                     "rapid_table>=1.0.5,<2.0.0",  # rapid_table
-                     "PyYAML>=6.0.2,<7",  # yaml
-                     "ftfy>=6.3.1,<7",  # unimernet_hf
-                     "openai>=1.70.0,<2",  # openai SDK
-                     "shapely>=2.0.7,<3",  # imgaug-paddleocr2pytorch
-                     "pyclipper>=1.3.0,<2",  # paddleocr2pytorch
-                     "omegaconf>=2.3.0,<3",  # paddleocr2pytorch
-                    "torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
-                    "torchvision",
-                    "transformers>=4.49.0,!=4.51.0,<5.0.0",
-                    "fast-langdetect>=0.2.3,<0.3.0",
-            ],
-            "pipeline_old_linux": [
-                    "matplotlib>=3.10,<=3.10.1",
-                    "ultralytics>=8.3.48,<=8.3.104",  # yolov8,公式检测
-                    "doclayout_yolo==0.0.4",  # doclayout_yolo
-                    "dill==0.3.8",  # doclayout_yolo
-                    "PyYAML==6.0.2",  # yaml
-                    "ftfy==6.3.1",  # unimernet_hf
-                    "openai==1.71.0",  # openai SDK
-                    "shapely==2.1.0",  # imgaug-paddleocr2pytorch
-                    "pyclipper==1.3.0.post6",  # paddleocr2pytorch
-                    "omegaconf==2.3.0",  # paddleocr2pytorch
-                    "albumentations==1.4.20", # 1.4.21引入的simsimd不支持2019年及更早的linux系统
-                    "rapid_table==1.0.3",  # rapid_table新版本依赖的onnxruntime不支持2019年及更早的linux系统
-                    "torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
-                    "torchvision",
-                    "transformers>=4.49.0,!=4.51.0,<5.0.0",
-                    "fast-langdetect>=0.2.3,<0.3.0",
-            ],
-        },
-        description="A practical tool for converting PDF to Markdown",  # 简短描述
-        long_description=long_description,  # 详细描述
-        long_description_content_type="text/markdown",  # 如果README是Markdown格式
-        project_urls={
-            "Home": "https://mineru.net/",
-            "Repository": "https://github.com/opendatalab/MinerU",
-        },
-        keywords=["magic-pdf, mineru, MinerU, convert, pdf, markdown"],
-        classifiers=[
-            "Programming Language :: Python :: 3.10",
-            "Programming Language :: Python :: 3.11",
-            "Programming Language :: Python :: 3.12",
-            "Programming Language :: Python :: 3.13",
-        ],
-        python_requires=">=3.10,<3.14",  # 项目依赖的 Python 版本
-        entry_points={
-            "console_scripts": [
-                "mineru = mineru.cli:client.main",  # 命令行入口点，mineru命令将调用mineru.cli.client.main函数
-                "mineru-sglang-server = mineru.cli.vlm_sglang_server:main",  # sglang服务器入口点
-                "mineru-models-download = mineru.cli.models_download:download_models",  # 模型下载入口点
-            ],
-        },  # 项目提供的可执行命令
-        include_package_data=True,  # 是否包含非代码文件，如数据文件、配置文件等
-        zip_safe=False,  # 是否使用 zip 文件格式打包，一般设为 False
-    )