"git@developer.sourcefind.cn:OpenDAS/torchaudio.git" did not exist on "622c46397c1aeae92d8bf970567910b9f2acdb58"
Unverified commit bcbbee8c authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #2622 from myhloli/dev

Dev
parents 3cc3f754 ced5a7b4
```diff
@@ -32,14 +32,14 @@ jobs:
       - name: Verify version.py
         run: |
-          ls -l magic_pdf/libs/version.py
-          cat magic_pdf/libs/version.py
+          ls -l mineru/version.py
+          cat mineru/version.py
       - name: Commit changes
         run: |
           git config --local user.email "moe@myhloli.com"
           git config --local user.name "myhloli"
-          git add magic_pdf/libs/version.py
+          git add mineru/version.py
           if git diff-index --quiet HEAD; then
             echo "No changes to commit"
           else
@@ -71,18 +71,18 @@ jobs:
       - name: Verify version.py
         run: |
-          ls -l magic_pdf/libs/version.py
-          cat magic_pdf/libs/version.py
+          ls -l mineru/version.py
+          cat mineru/version.py
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
-      - name: Install magic-pdf
+      - name: Install mineru
        run: |
          python -m pip install --upgrade pip
-          pip install -e .[full]
+          pip install -e .[all]
   build:
     needs: [ check-install ]
@@ -103,10 +103,11 @@ jobs:
       - name: Install wheel
         run: |
           python -m pip install wheel
+          pip install build
       - name: Build wheel
         run: |
-          python setup.py bdist_wheel
+          python -m build --wheel
       - name: Upload artifact
         uses: actions/upload-artifact@v4
```
```yaml
version: 2

build:
  os: ubuntu-22.04
  tools:
    python: "3.10"

formats:
  - epub

python:
  install:
    - requirements: next_docs/zh_cn/requirements.txt

sphinx:
  configuration: next_docs/zh_cn/conf.py
```
```python
import os
from pathlib import Path

from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.tools.common import batch_do_parse


def batch(pdf_dir, output_dir, method, lang):
    os.makedirs(output_dir, exist_ok=True)
    doc_paths = []
    for doc_path in Path(pdf_dir).glob('*'):
        if doc_path.suffix == '.pdf':
            doc_paths.append(doc_path)

    # build dataset with 4 workers
    datasets = batch_build_dataset(doc_paths, 4, lang)
    # os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "200"  # every 200 pages will be parsed in one batch
    batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method)


if __name__ == '__main__':
    batch("pdfs", "output", "auto", "")
```
```python
# Copyright (c) Opendatalab. All rights reserved.
import copy
import json
import os
from pathlib import Path

from loguru import logger

from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.enum_class import MakeMode
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path


def do_parse(
    output_dir,  # Output directory for storing parsing results
    pdf_file_names: list[str],  # List of PDF file names to be parsed
    pdf_bytes_list: list[bytes],  # List of PDF bytes to be parsed
    p_lang_list: list[str],  # List of languages for each PDF, default is 'ch' (Chinese)
    backend="pipeline",  # The backend for parsing PDF, default is 'pipeline'
    parse_method="auto",  # The method for parsing PDF, default is 'auto'
    p_formula_enable=True,  # Enable formula parsing
    p_table_enable=True,  # Enable table parsing
    server_url=None,  # Server URL for vlm-sglang-client backend
    f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
    f_draw_span_bbox=True,  # Whether to draw span bounding boxes
    f_dump_md=True,  # Whether to dump markdown files
    f_dump_middle_json=True,  # Whether to dump middle JSON files
    f_dump_model_output=True,  # Whether to dump model output files
    f_dump_orig_pdf=True,  # Whether to dump original PDF files
    f_dump_content_list=True,  # Whether to dump content list files
    f_make_md_mode=MakeMode.MM_MD,  # The mode for making markdown content, default is MM_MD
    start_page_id=0,  # Start page ID for parsing, default is 0
    end_page_id=None,  # End page ID for parsing, default is None (parse all pages until the end of the document)
):
    if backend == "pipeline":
        for idx, pdf_bytes in enumerate(pdf_bytes_list):
            new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
            pdf_bytes_list[idx] = new_pdf_bytes

        infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(
            pdf_bytes_list, p_lang_list, parse_method=parse_method,
            formula_enable=p_formula_enable, table_enable=p_table_enable)

        for idx, model_list in enumerate(infer_results):
            model_json = copy.deepcopy(model_list)
            pdf_file_name = pdf_file_names[idx]
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

            images_list = all_image_lists[idx]
            pdf_doc = all_pdf_docs[idx]
            _lang = lang_list[idx]
            _ocr_enable = ocr_enabled_list[idx]
            middle_json = pipeline_result_to_middle_json(
                model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, p_formula_enable)

            pdf_info = middle_json["pdf_info"]

            pdf_bytes = pdf_bytes_list[idx]
            if f_draw_layout_bbox:
                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")

            if f_draw_span_bbox:
                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")

            if f_dump_orig_pdf:
                md_writer.write(
                    f"{pdf_file_name}_origin.pdf",
                    pdf_bytes,
                )

            if f_dump_md:
                image_dir = str(os.path.basename(local_image_dir))
                md_content_str = pipeline_union_make(pdf_info, f_make_md_mode, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}.md",
                    md_content_str,
                )

            if f_dump_content_list:
                image_dir = str(os.path.basename(local_image_dir))
                content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}_content_list.json",
                    json.dumps(content_list, ensure_ascii=False, indent=4),
                )

            if f_dump_middle_json:
                md_writer.write_string(
                    f"{pdf_file_name}_middle.json",
                    json.dumps(middle_json, ensure_ascii=False, indent=4),
                )

            if f_dump_model_output:
                md_writer.write_string(
                    f"{pdf_file_name}_model.json",
                    json.dumps(model_json, ensure_ascii=False, indent=4),
                )

            logger.info(f"local output dir is {local_md_dir}")
    else:
        if backend.startswith("vlm-"):
            backend = backend[4:]

        f_draw_span_bbox = False
        parse_method = "vlm"
        for idx, pdf_bytes in enumerate(pdf_bytes_list):
            pdf_file_name = pdf_file_names[idx]
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
            middle_json, infer_result = vlm_doc_analyze(pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url)

            pdf_info = middle_json["pdf_info"]

            if f_draw_layout_bbox:
                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")

            if f_draw_span_bbox:
                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")

            if f_dump_orig_pdf:
                md_writer.write(
                    f"{pdf_file_name}_origin.pdf",
                    pdf_bytes,
                )

            if f_dump_md:
                image_dir = str(os.path.basename(local_image_dir))
                md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}.md",
                    md_content_str,
                )

            if f_dump_content_list:
                image_dir = str(os.path.basename(local_image_dir))
                content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}_content_list.json",
                    json.dumps(content_list, ensure_ascii=False, indent=4),
                )

            if f_dump_middle_json:
                md_writer.write_string(
                    f"{pdf_file_name}_middle.json",
                    json.dumps(middle_json, ensure_ascii=False, indent=4),
                )

            if f_dump_model_output:
                model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
                md_writer.write_string(
                    f"{pdf_file_name}_model_output.txt",
                    model_output,
                )

            logger.info(f"local output dir is {local_md_dir}")


def parse_doc(
    path_list: list[Path],
    output_dir,
    lang="ch",
    backend="pipeline",
    method="auto",
    server_url=None,
    start_page_id=0,  # Start page ID for parsing, default is 0
    end_page_id=None  # End page ID for parsing, default is None (parse all pages until the end of the document)
):
    """
    Parameter description:
    path_list: List of document paths to be parsed; these can be PDF or image files.
    output_dir: Output directory for storing parsing results.
    lang: Language option, default is 'ch'; optional values include ['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka'].
        Set this to the language of the PDF (if known) to improve OCR accuracy. Optional.
        Applies only when the backend is set to "pipeline".
    backend: The backend for parsing PDF:
        pipeline: More general.
        vlm-transformers: More general.
        vlm-sglang-engine: Faster (engine).
        vlm-sglang-client: Faster (client).
        Without a backend specified, pipeline is used by default.
    method: The method for parsing PDF:
        auto: Automatically determine the method based on the file type.
        txt: Use text extraction method.
        ocr: Use OCR method for image-based PDFs.
        Without a method specified, 'auto' is used by default.
        Applies only when the backend is set to "pipeline".
    server_url: When the backend is `sglang-client`, you need to specify the server_url, for example: `http://127.0.0.1:30000`
    """
    try:
        file_name_list = []
        pdf_bytes_list = []
        lang_list = []
        for path in path_list:
            file_name = str(Path(path).stem)
            pdf_bytes = read_fn(path)
            file_name_list.append(file_name)
            pdf_bytes_list.append(pdf_bytes)
            lang_list.append(lang)
        do_parse(
            output_dir=output_dir,
            pdf_file_names=file_name_list,
            pdf_bytes_list=pdf_bytes_list,
            p_lang_list=lang_list,
            backend=backend,
            parse_method=method,
            server_url=server_url,
            start_page_id=start_page_id,
            end_page_id=end_page_id
        )
    except Exception as e:
        logger.exception(e)


if __name__ == '__main__':
    # args
    __dir__ = os.path.dirname(os.path.abspath(__file__))
    pdf_files_dir = os.path.join(__dir__, "pdfs")
    output_dir = os.path.join(__dir__, "output")
    pdf_suffixes = [".pdf"]
    image_suffixes = [".png", ".jpeg", ".jpg"]

    doc_path_list = []
    for doc_path in Path(pdf_files_dir).glob('*'):
        if doc_path.suffix in pdf_suffixes + image_suffixes:
            doc_path_list.append(doc_path)

    """If you cannot download the models because of network issues, set the environment variable MINERU_MODEL_SOURCE to modelscope to download them from a proxy-free mirror."""
    # os.environ['MINERU_MODEL_SOURCE'] = "modelscope"

    """Use pipeline mode if your environment does not support VLM"""
    parse_doc(doc_path_list, output_dir, backend="pipeline")

    """To enable VLM mode, change the backend to 'vlm-xxx'"""
    # parse_doc(doc_path_list, output_dir, backend="vlm-transformers")  # more general
    # parse_doc(doc_path_list, output_dir, backend="vlm-sglang-engine")  # faster (engine)
    # parse_doc(doc_path_list, output_dir, backend="vlm-sglang-client", server_url="http://127.0.0.1:30000")  # faster (client)
```
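The `__main__` block above always parses whole documents. `parse_doc` also exposes `start_page_id` / `end_page_id`, so a page-limited run is possible; the call below is a sketch reusing the names defined in this script, not an additional example shipped with the repo.

```python
# Hypothetical: parse only the first five pages of each collected document
# with the pipeline backend, reusing doc_path_list / output_dir from __main__.
parse_doc(
    doc_path_list,
    output_dir,
    backend="pipeline",
    start_page_id=0,
    end_page_id=4,  # interpreted by do_parse; None means "to the end of the document"
)
```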
```dockerfile
# Use the official Ubuntu base image
FROM swr.cn-central-221.ovaijisuan.com/mindformers/mindformers1.2_mindspore2.3:20240722

USER root

# Set environment variables to non-interactive to avoid prompts during installation
ENV DEBIAN_FRONTEND=noninteractive

# Update the package list and install necessary packages
RUN apt-get update && \
    apt-get install -y \
        software-properties-common && \
    add-apt-repository -y ppa:deadsnakes/ppa && \
    apt-get update && \
    apt-get install -y \
        python3.10 \
        python3.10-venv \
        python3.10-distutils \
        python3.10-dev \
        python3-pip \
        wget \
        git \
        libgl1 \
        libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Set Python 3.10 as the default python3
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

# Create a virtual environment for MinerU
RUN python3 -m venv /opt/mineru_venv

# Copy the configuration file template and install magic-pdf latest
RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json && \
    cp magic-pdf.template.json /root/magic-pdf.json && \
    source /opt/mineru_venv/bin/activate && \
    pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
    pip3 install torch==2.3.1 torchvision==0.18.1 -i https://mirrors.aliyun.com/pypi/simple && \
    pip3 install -U magic-pdf[full] 'numpy<2' decorator attrs absl-py cloudpickle ml-dtypes tornado einops -i https://mirrors.aliyun.com/pypi/simple && \
    wget https://gitee.com/ascend/pytorch/releases/download/v6.0.rc2-pytorch2.3.1/torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl && \
    pip3 install torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl"

# Download models and update the configuration file
RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
    pip3 install modelscope -i https://mirrors.aliyun.com/pypi/simple && \
    wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py && \
    python3 download_models.py && \
    sed -i 's|cpu|npu|g' /root/magic-pdf.json"

# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
```
```diff
@@ -18,37 +18,19 @@ RUN apt-get update && \
         wget \
         git \
         libgl1 \
-        libreoffice \
-        fonts-noto-cjk \
-        fonts-wqy-zenhei \
-        fonts-wqy-microhei \
-        ttf-mscorefonts-installer \
-        fontconfig \
         libglib2.0-0 \
-        libxrender1 \
-        libsm6 \
-        libxext6 \
-        poppler-utils \
     && rm -rf /var/lib/apt/lists/*
 
 # Set Python 3.10 as the default python3
 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
 
-# Create a virtual environment for MinerU
-RUN python3 -m venv /opt/mineru_venv
-
-# Copy the configuration file template and install magic-pdf latest
-RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json && \
-    cp magic-pdf.template.json /root/magic-pdf.json && \
-    source /opt/mineru_venv/bin/activate && \
-    pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
-    pip3 install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple"
+# install mineru latest
+RUN /bin/bash -c "pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
+    pip3 install uv -i https://mirrors.aliyun.com/pypi/simple && \
+    uv pip install 'mineru[all]>=2.0.0' -i https://mirrors.aliyun.com/pypi/simple"
 
 # Download models and update the configuration file
-RUN /bin/bash -c "pip3 install modelscope -i https://mirrors.aliyun.com/pypi/simple && \
-    wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py && \
-    python3 download_models.py && \
-    sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
+RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
 
 # Set the entry point to activate the virtual environment and run the command line tool
-ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
+ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]
```
```diff
@@ -18,37 +18,19 @@ RUN apt-get update && \
         wget \
         git \
         libgl1 \
-        libreoffice \
-        fonts-noto-cjk \
-        fonts-wqy-zenhei \
-        fonts-wqy-microhei \
-        ttf-mscorefonts-installer \
-        fontconfig \
         libglib2.0-0 \
-        libxrender1 \
-        libsm6 \
-        libxext6 \
-        poppler-utils \
     && rm -rf /var/lib/apt/lists/*
 
 # Set Python 3.10 as the default python3
 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
 
-# Create a virtual environment for MinerU
-RUN python3 -m venv /opt/mineru_venv
-
-# Copy the configuration file template and install magic-pdf latest
-RUN /bin/bash -c "wget https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json && \
-    cp magic-pdf.template.json /root/magic-pdf.json && \
-    source /opt/mineru_venv/bin/activate && \
-    pip3 install --upgrade pip && \
-    pip3 install -U magic-pdf[full]"
+# install mineru latest
+RUN /bin/bash -c "pip3 install --upgrade pip && \
+    pip3 install uv && \
+    uv pip install 'mineru[all]>=2.0.0'"
 
 # Download models and update the configuration file
-RUN /bin/bash -c "pip3 install huggingface_hub && \
-    wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models.py && \
-    python3 download_models.py && \
-    sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
+RUN /bin/bash -c "mineru-models-download -s huggingface -m all"
 
 # Set the entry point to activate the virtual environment and run the command line tool
-ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
+ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]
```
# Ascend NPU Acceleration

## Introduction

This document describes how to use MinerU on Ascend NPUs. The content has been verified on a `Huawei Atlas 800T A2` server.

```
CPU: Kunpeng 920 aarch64 2.6GHz
NPU: Ascend 910B 64GB
OS: openEuler 22.03 (LTS-SP3) / Ubuntu 22.04.5 LTS
CANN: 8.0.RC2
Driver version: 24.1.rc2.1
```

Because setting up an environment adapted to Ascend NPUs is fairly involved, we recommend running MinerU in a Docker container.

Before running MinerU via Docker, make sure the host machine has a driver and firmware installed that support CANN 8.0.RC2.

## Build the Image

Make sure you have a stable network connection, then run the following commands to build the image.

```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/ascend_npu/Dockerfile -O Dockerfile
docker build -t mineru_npu:latest .
```

If no errors are reported during the build, the image was built successfully.

## Run the Container

```bash
docker run -it -u root --name mineru-npu --privileged=true \
    --ipc=host \
    --network=host \
    --device=/dev/davinci0 \
    --device=/dev/davinci1 \
    --device=/dev/davinci2 \
    --device=/dev/davinci3 \
    --device=/dev/davinci4 \
    --device=/dev/davinci5 \
    --device=/dev/davinci6 \
    --device=/dev/davinci7 \
    --device=/dev/davinci_manager \
    --device=/dev/devmm_svm \
    --device=/dev/hisi_hdc \
    -v /var/log/npu/:/usr/slog \
    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
    -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
    mineru_npu:latest \
    /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"

magic-pdf --help
```
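Once inside the container, you can verify that parsing works on the NPU with the same CLI used in the other guides; the sample PDF below is the demo file referenced elsewhere in these docs.

```bash
# Inside the container: fetch the sample PDF and parse it with magic-pdf.
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/pdfs/small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```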
# Ubuntu 22.04 LTS
### 1. Check if NVIDIA Drivers Are Installed
```sh
nvidia-smi
```
If you see information similar to the following, it means that the NVIDIA drivers are already installed, and you can skip Step 2.
> [!NOTE]
> Notice: `CUDA Version` should be >= 12.4. If the displayed version number is lower than 12.4, please upgrade the driver.
```plaintext
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.07 Driver Version: 572.83 CUDA Version: 12.8 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3060 Ti WDDM | 00000000:01:00.0 On | N/A |
| 0% 51C P8 12W / 200W | 1489MiB / 8192MiB | 5% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
```
### 2. Install the Driver
If no driver is installed, use the following command:
```sh
sudo apt-get update
sudo apt-get install nvidia-driver-570-server
```
Install the proprietary driver and restart your computer after installation.
```sh
reboot
```
### 3. Install Anaconda
If Anaconda is already installed, skip this step.
```sh
wget https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
bash Anaconda3-2024.06-1-Linux-x86_64.sh
```
In the final step, enter `yes`, close the terminal, and reopen it.
### 4. Create an Environment Using Conda
```bash
conda create -n mineru 'python=3.12' -y
conda activate mineru
```
### 5. Install Applications
```sh
pip install -U magic-pdf[full]
```
> [!TIP]
> After installation, you can check the version of `magic-pdf` using the following command:
>
> ```sh
> magic-pdf --version
> ```
### 6. Download Models
Refer to detailed instructions on [how to download model files](how_to_download_models_en.md).
### 7. Understand the Location of the Configuration File
After completing the [6. Download Models](#6-download-models) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
You can find the `magic-pdf.json` file in your user directory.
> [!TIP]
> The user directory for Linux is "/home/username".
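For reference, a minimal `magic-pdf.json` looks roughly like the sketch below; the exact set of keys depends on your version, and the model path shown here is a placeholder, not the value the script will actually write.

```json
{
    "models-dir": "/home/username/path/to/downloaded/models",
    "device-mode": "cpu"
}
```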
### 8. First Run
Download a sample file from the repository and test it.
```sh
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```
### 9. Test CUDA Acceleration
If your graphics card has at least **6GB** of VRAM, follow these steps to test CUDA acceleration:
1. Modify the value of `"device-mode"` in the `magic-pdf.json` configuration file located in your home directory.
```json
{
"device-mode": "cuda"
}
```
2. Test CUDA acceleration with the following command:
```sh
magic-pdf -p small_ocr.pdf -o ./output
```
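If you want to confirm that PyTorch itself can see the GPU before looking at the MinerU configuration, a quick check (standard PyTorch API, independent of MinerU) is:

```python
import torch

# Prints True and the GPU name if the CUDA build of torch can reach the driver.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```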
# Ubuntu 22.04 LTS

## 1. Check Whether the NVIDIA Driver Is Installed

```bash
nvidia-smi
```

If you see output similar to the following, the NVIDIA driver is already installed and you can skip step 2.

> [!NOTE]
> The `CUDA Version` shown should be >= 12.4. If the displayed version is lower than 12.4, please upgrade the driver.

```plaintext
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.07 Driver Version: 572.83 CUDA Version: 12.8 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3060 Ti WDDM | 00000000:01:00.0 On | N/A |
| 0% 51C P8 12W / 200W | 1489MiB / 8192MiB | 5% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
```

## 2. Install the Driver

If no driver is installed, install one with the following commands:

```bash
sudo apt-get update
sudo apt-get install nvidia-driver-570-server
```

This installs the proprietary driver; reboot the machine after installation.

```bash
reboot
```

## 3. Install Anaconda

If conda is already installed, skip this step.

```bash
wget -U NoSuchBrowser/1.0 https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
bash Anaconda3-2024.06-1-Linux-x86_64.sh
```

In the final step, enter `yes`, then close the terminal and reopen it.

## 4. Create an Environment Using Conda

```bash
conda create -n mineru 'python=3.12' -y
conda activate mineru
```

## 5. Install the Application

```bash
pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
```

> [!TIP]
> After installation, you can check the version of `magic-pdf` with the following command:
>
> ```bash
> magic-pdf --version
> ```

## 6. Download Models

See [how to download model files](how_to_download_models_zh_cn.md) for details.

## 7. Understand the Location of the Configuration File

After completing step [6. Download Models](#6-download-models), the script automatically generates a `magic-pdf.json` file in the user directory and configures the default model path.

You can find the `magic-pdf.json` file in your user directory.

> [!TIP]
> The user directory on Linux is "/home/username".

## 8. First Run

Download a sample file from the repository and test it.

```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/pdfs/small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```

## 9. Test CUDA Acceleration

If your graphics card has at least **6GB** of VRAM, follow these steps to test CUDA-accelerated parsing:

**1. Modify the value of `"device-mode"` in the `magic-pdf.json` configuration file in your user directory.**

```json
{
  "device-mode": "cuda"
}
```

**2. Run the following command to test CUDA acceleration:**

```bash
magic-pdf -p small_ocr.pdf -o ./output
```

> [!TIP]
> You can roughly judge whether CUDA acceleration is in effect from the per-stage cost times printed in the log; with CUDA enabled, parsing is normally faster than on CPU.
# Windows 10/11
### 1. Install CUDA and cuDNN
You need to install a CUDA version that is compatible with torch's requirements. For details, please refer to the [official PyTorch website](https://pytorch.org/get-started/locally/).
- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
- CUDA 12.8 https://developer.nvidia.com/cuda-12-8-0-download-archive
### 2. Install Anaconda
If Anaconda is already installed, you can skip this step.
Download link: https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Windows-x86_64.exe
### 3. Create an Environment Using Conda
```bash
conda create -n mineru 'python=3.12' -y
conda activate mineru
```
### 4. Install Applications
```
pip install -U magic-pdf[full]
```
> [!IMPORTANT]
> After installation, you can check the version of `magic-pdf` using the following command:
>
> ```bash
> magic-pdf --version
> ```
### 5. Download Models
Refer to detailed instructions on [how to download model files](how_to_download_models_en.md).
### 6. Understand the Location of the Configuration File
After completing the [5. Download Models](#5-download-models) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
You can find the `magic-pdf.json` file in your user directory.
> [!TIP]
> The user directory for Windows is "C:/Users/username".
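To confirm the file exists and inspect it on Windows, you can print it from PowerShell (this is just a convenience check, not a required step):

```powershell
# Show the generated configuration file in the current user's profile directory.
Get-Content "$env:USERPROFILE\magic-pdf.json"
```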
### 7. First Run
Download a sample file from the repository and test it.
```powershell
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```
### 8. Test CUDA Acceleration
If your graphics card has at least 6GB of VRAM, follow these steps to test CUDA-accelerated parsing performance.
1. **Overwrite the installation of torch and torchvision** with builds that support CUDA. (Please select the appropriate index-url based on your CUDA version; for more details, refer to the [PyTorch official website](https://pytorch.org/get-started/locally/).)
```
pip install --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu124
```
2. **Modify the value of `"device-mode"`** in the `magic-pdf.json` configuration file located in your user directory.
```json
{
"device-mode": "cuda"
}
```
3. **Run the following command to test CUDA acceleration**:
```
magic-pdf -p small_ocr.pdf -o ./output
```
# Windows 10/11

## 1. Install a CUDA Environment

Install a CUDA version that satisfies torch's requirements; see the [official PyTorch website](https://pytorch.org/get-started/locally/) for details.

- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
- CUDA 12.8 https://developer.nvidia.com/cuda-12-8-0-download-archive

## 2. Install Anaconda

If conda is already installed, skip this step.

Download link:
https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2024.06-1-Windows-x86_64.exe

## 3. Create an Environment Using Conda

```bash
conda create -n mineru 'python=3.12' -y
conda activate mineru
```

## 4. Install the Application

```bash
pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
```

> [!IMPORTANT]
> After installation, you can check the version of `magic-pdf` with the following command:
>
> ```bash
> magic-pdf --version
> ```

## 5. Download Models

See [how to download model files](how_to_download_models_zh_cn.md) for details.

## 6. Understand the Location of the Configuration File

After completing step [5. Download Models](#5-download-models), the script automatically generates a `magic-pdf.json` file in the user directory and configures the default model path.

You can find the `magic-pdf.json` file in your user directory.

> [!TIP]
> The user directory on Windows is "C:/Users/username".

## 7. First Run

Download a sample file from the repository and test it.

```powershell
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```

## 8. Test CUDA Acceleration

If your graphics card has at least **6GB** of VRAM, follow these steps to test CUDA-accelerated parsing:

**1. Reinstall torch and torchvision with CUDA support** (choose the index-url that matches your CUDA version; see the [official PyTorch website](https://pytorch.org/get-started/locally/) for details).

```bash
pip install --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu124
```

**2. Modify the value of `"device-mode"` in the `magic-pdf.json` configuration file in your user directory.**

```json
{
  "device-mode": "cuda"
}
```

**3. Run the following command to test CUDA acceleration:**

```bash
magic-pdf -p small_ocr.pdf -o ./output
```

> [!TIP]
> You can roughly judge whether CUDA acceleration is in effect from the per-stage times printed in the log; with CUDA enabled, parsing normally runs faster than on CPU.
Model downloads are divided into an initial download and updates to an existing model directory. Please refer to the corresponding section below.
# Initial download of model files
## Download the Model from Hugging Face
Use a Python Script to Download Model Files from Hugging Face
```bash
pip install huggingface_hub
wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
python download_models_hf.py
```
The Python script will automatically download the model files and configure the model directory in the configuration file.
The configuration file can be found in the user directory, with the filename `magic-pdf.json`.
# How to update models previously downloaded
## 1. Models downloaded via Hugging Face or ModelScope
If you previously downloaded models via Hugging Face or ModelScope, you can rerun the Python script used for the initial download. This will automatically update the model directory to the latest version.
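After the download script finishes, you can sanity-check which model directory was written into the configuration. The snippet below is a small helper sketch; the `models-dir` key name is an assumption about the generated file, so adjust it if your `magic-pdf.json` differs.

```python
import json
from pathlib import Path

# Read the generated configuration from the user directory and print the
# configured model directory ("models-dir" is assumed; adjust if your
# version of magic-pdf.json uses a different key).
config = json.loads((Path.home() / "magic-pdf.json").read_text(encoding="utf-8"))
print(config.get("models-dir"))
```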
The model download process is split into an initial download and updating an existing model directory; follow the corresponding section below.

# Initial Download of Model Files

The model files can be downloaded from Hugging Face or ModelScope. Because of network restrictions, users in mainland China may fail to reach Hugging Face; in that case, please use ModelScope.

<details>
<summary>Method 1: Download the models from Hugging Face</summary>
<p>Use a Python script to download the model files from Hugging Face</p>
<pre><code>pip install huggingface_hub
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models_hf.py -O download_models_hf.py
python download_models_hf.py</code></pre>
<p>The Python script automatically downloads the model files and configures the model directory in the configuration file.</p>
</details>

## Method 2: Download the models from ModelScope

### Use a Python script to download the model files from ModelScope

```bash
pip install modelscope
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
python download_models.py
```

The Python script automatically downloads the model files and configures the model directory in the configuration file.

The configuration file can be found in the user directory, with the filename `magic-pdf.json`.

> [!TIP]
> The user directory is "C:\\Users\\username" on Windows, "/home/username" on Linux, and "/Users/username" on macOS.

# How to Update Models Previously Downloaded

## 1. Models downloaded via Hugging Face or ModelScope

If you previously downloaded models via Hugging Face or ModelScope, you can rerun the download script you used before; the model directory will be updated to the latest version automatically.
```diff
@@ -5,8 +5,8 @@ from collections import defaultdict
 import numpy as np
 
 from .model_init import AtomModelSingleton
-from ...utils.model_utils import crop_img, get_res_list_from_layout_res, get_coords_and_area
-from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list
+from ...utils.model_utils import crop_img, get_res_list_from_layout_res
+from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list, OcrConfidence
 
 YOLO_LAYOUT_BASE_BATCH_SIZE = 1
 MFD_BASE_BATCH_SIZE = 1
@@ -315,7 +315,7 @@ class BatchAnalyze:
                     ocr_text, ocr_score = ocr_res_list[index]
                     layout_res_item['text'] = ocr_text
                     layout_res_item['score'] = float(f"{ocr_score:.3f}")
-                    if ocr_score < 0.6:
+                    if ocr_score < OcrConfidence.min_confidence:
                         layout_res_item['category_id'] = 16
 
         total_processed += len(img_crop_list)
```
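The hunk above replaces the hard-coded `0.6` threshold with a constant imported from `mineru.utils.ocr_utils`. The real definition is not shown in this diff; a minimal sketch of what such a constants holder looks like is:

```python
# Hypothetical sketch only: the actual OcrConfidence lives in
# mineru/utils/ocr_utils.py and may carry additional fields.
class OcrConfidence:
    min_confidence = 0.6  # spans scoring below this are treated as unreliable
```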
```diff
@@ -2,6 +2,7 @@
 import time
 
 from loguru import logger
+from tqdm import tqdm
 
 from mineru.utils.config_reader import get_device, get_llm_aided_config
 from mineru.backend.pipeline.model_init import AtomModelSingleton
@@ -14,6 +15,7 @@ from mineru.utils.enum_class import ContentType
 from mineru.utils.llm_aided import llm_aided_title
 from mineru.utils.model_utils import clean_memory
 from mineru.backend.pipeline.pipeline_magic_model import MagicModel
+from mineru.utils.ocr_utils import OcrConfidence
 from mineru.utils.span_block_fix import fill_spans_in_blocks, fix_discarded_block, fix_block_spans
 from mineru.utils.span_pre_proc import remove_outside_spans, remove_overlaps_low_confidence_spans, \
     remove_overlaps_min_spans, txt_spans_extract
@@ -163,7 +165,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
 
 def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr_enable=False, formula_enabled=True):
     middle_json = {"pdf_info": [], "_backend":"pipeline", "_version_name": __version__}
-    for page_index, page_model_info in enumerate(model_list):
+    for page_index, page_model_info in tqdm(enumerate(model_list), total=len(model_list), desc="Processing pages"):
         page = pdf_doc[page_index]
         image_dict = images_list[page_index]
         page_info = page_model_info_to_page_info(
@@ -208,7 +210,7 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
             need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
         for index, span in enumerate(need_ocr_list):
             ocr_text, ocr_score = ocr_res_list[index]
-            if ocr_score > 0.6:
+            if ocr_score > OcrConfidence.min_confidence:
                 span['content'] = ocr_text
                 span['score'] = float(f"{ocr_score:.3f}")
             else:
```
```diff
@@ -7,8 +7,8 @@ DEFAULT_SYSTEM_PROMPT = (
 )
 
 DEFAULT_USER_PROMPT = "Document Parsing:"
 DEFAULT_TEMPERATURE = 0.0
-DEFAULT_TOP_P = 0.01
-DEFAULT_TOP_K = 1
+DEFAULT_TOP_P = 0.8
+DEFAULT_TOP_K = 20
 DEFAULT_REPETITION_PENALTY = 1.0
 DEFAULT_PRESENCE_PENALTY = 0.0
 DEFAULT_NO_REPEAT_NGRAM_SIZE = 100
```
```diff
@@ -22,7 +22,7 @@ try:
     hf_loaded = True
 except ImportError as e:
-    logger.warning("hf is not installed. If you are not using huggingface, you can ignore this warning.")
+    logger.warning("hf is not installed. If you are not using transformers, you can ignore this warning.")
 
 engine_loaded = False
 try:
@@ -51,9 +51,9 @@ def get_predictor(
 ) -> BasePredictor:
     start_time = time.time()
 
-    if backend == "huggingface":
+    if backend == "transformers":
         if not model_path:
-            raise ValueError("model_path must be provided for huggingface backend.")
+            raise ValueError("model_path must be provided for transformers backend.")
         if not hf_loaded:
             raise ImportError(
                 "transformers is not installed, so huggingface backend cannot be used. "
@@ -77,7 +77,7 @@ def get_predictor(
             raise ImportError(
                 "sglang is not installed, so sglang-engine backend cannot be used. "
                 "If you need to use sglang-engine backend for inference, "
-                "please install sglang[all]==0.4.6.post4 or a newer version."
+                "please install sglang[all]==0.4.7 or a newer version."
             )
         predictor = SglangEnginePredictor(
             server_args=ServerArgs(model_path, **kwargs),
@@ -104,7 +104,7 @@ def get_predictor(
             http_timeout=http_timeout,
         )
     else:
-        raise ValueError(f"Unsupported backend: {backend}. Supports: huggingface, sglang-engine, sglang-client.")
+        raise ValueError(f"Unsupported backend: {backend}. Supports: transformers, sglang-engine, sglang-client.")
 
     elapsed = round(time.time() - start_time, 2)
     logger.info(f"get_predictor cost: {elapsed}s")
```
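Based only on the parameters visible in this hunk (`backend`, `model_path`), selecting the renamed backend would look roughly like the sketch below; the import path and the model path are assumptions, not taken from this diff.

```python
# Hypothetical usage sketch: "transformers" replaces the old "huggingface"
# backend name; model_path is a placeholder local checkpoint directory.
# from mineru.backend.vlm.predictor import get_predictor  # import path assumed

predictor = get_predictor(
    backend="transformers",
    model_path="/path/to/vlm/model",
)
```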