"git@developer.sourcefind.cn:OpenDAS/torchaudio.git" did not exist on "622c46397c1aeae92d8bf970567910b9f2acdb58"
Unverified commit bcbbee8c authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #2622 from myhloli/dev

Dev
parents 3cc3f754 ced5a7b4
```diff
@@ -32,14 +32,14 @@ jobs:
       - name: Verify version.py
         run: |
-          ls -l magic_pdf/libs/version.py
-          cat magic_pdf/libs/version.py
+          ls -l mineru/version.py
+          cat mineru/version.py
       - name: Commit changes
         run: |
           git config --local user.email "moe@myhloli.com"
           git config --local user.name "myhloli"
-          git add magic_pdf/libs/version.py
+          git add mineru/version.py
           if git diff-index --quiet HEAD; then
             echo "No changes to commit"
           else
@@ -71,18 +71,18 @@ jobs:
       - name: Verify version.py
         run: |
-          ls -l magic_pdf/libs/version.py
-          cat magic_pdf/libs/version.py
+          ls -l mineru/version.py
+          cat mineru/version.py
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
-      - name: Install magic-pdf
+      - name: Install mineru
        run: |
          python -m pip install --upgrade pip
-          pip install -e .[full]
+          pip install -e .[all]
   build:
     needs: [ check-install ]
@@ -103,10 +103,11 @@ jobs:
       - name: Install wheel
         run: |
           python -m pip install wheel
+          pip install build
       - name: Build wheel
         run: |
-          python setup.py bdist_wheel
+          python -m build --wheel
       - name: Upload artifact
         uses: actions/upload-artifact@v4
```
```yaml
version: 2

build:
  os: ubuntu-22.04
  tools:
    python: "3.10"

formats:
  - epub

python:
  install:
    - requirements: next_docs/zh_cn/requirements.txt

sphinx:
  configuration: next_docs/zh_cn/conf.py
```
```python
import os
from pathlib import Path

from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.tools.common import batch_do_parse


def batch(pdf_dir, output_dir, method, lang):
    os.makedirs(output_dir, exist_ok=True)
    doc_paths = []
    for doc_path in Path(pdf_dir).glob('*'):
        if doc_path.suffix == '.pdf':
            doc_paths.append(doc_path)

    # build dataset with 4 workers
    datasets = batch_build_dataset(doc_paths, 4, lang)
    # os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "200"  # every 200 pages will be parsed in one batch
    batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method)


if __name__ == '__main__':
    batch("pdfs", "output", "auto", "")
```
```python
# Copyright (c) Opendatalab. All rights reserved.
import copy
import json
import os
from pathlib import Path

from loguru import logger

from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.enum_class import MakeMode
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path


def do_parse(
    output_dir,  # Output directory for storing parsing results
    pdf_file_names: list[str],  # List of PDF file names to be parsed
    pdf_bytes_list: list[bytes],  # List of PDF bytes to be parsed
    p_lang_list: list[str],  # List of languages for each PDF, default is 'ch' (Chinese)
    backend="pipeline",  # The backend for parsing PDF, default is 'pipeline'
    parse_method="auto",  # The method for parsing PDF, default is 'auto'
    p_formula_enable=True,  # Enable formula parsing
    p_table_enable=True,  # Enable table parsing
    server_url=None,  # Server URL for vlm-sglang-client backend
    f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
    f_draw_span_bbox=True,  # Whether to draw span bounding boxes
    f_dump_md=True,  # Whether to dump markdown files
    f_dump_middle_json=True,  # Whether to dump middle JSON files
    f_dump_model_output=True,  # Whether to dump model output files
    f_dump_orig_pdf=True,  # Whether to dump original PDF files
    f_dump_content_list=True,  # Whether to dump content list files
    f_make_md_mode=MakeMode.MM_MD,  # The mode for making markdown content, default is MM_MD
    start_page_id=0,  # Start page ID for parsing, default is 0
    end_page_id=None,  # End page ID for parsing, default is None (parse all pages until the end of the document)
):
    if backend == "pipeline":
        for idx, pdf_bytes in enumerate(pdf_bytes_list):
            new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
            pdf_bytes_list[idx] = new_pdf_bytes

        infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(
            pdf_bytes_list, p_lang_list, parse_method=parse_method,
            formula_enable=p_formula_enable, table_enable=p_table_enable)

        for idx, model_list in enumerate(infer_results):
            model_json = copy.deepcopy(model_list)
            pdf_file_name = pdf_file_names[idx]
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

            images_list = all_image_lists[idx]
            pdf_doc = all_pdf_docs[idx]
            _lang = lang_list[idx]
            _ocr_enable = ocr_enabled_list[idx]
            middle_json = pipeline_result_to_middle_json(
                model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, p_formula_enable)

            pdf_info = middle_json["pdf_info"]

            pdf_bytes = pdf_bytes_list[idx]
            if f_draw_layout_bbox:
                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")

            if f_draw_span_bbox:
                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")

            if f_dump_orig_pdf:
                md_writer.write(
                    f"{pdf_file_name}_origin.pdf",
                    pdf_bytes,
                )

            if f_dump_md:
                image_dir = str(os.path.basename(local_image_dir))
                md_content_str = pipeline_union_make(pdf_info, f_make_md_mode, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}.md",
                    md_content_str,
                )

            if f_dump_content_list:
                image_dir = str(os.path.basename(local_image_dir))
                content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}_content_list.json",
                    json.dumps(content_list, ensure_ascii=False, indent=4),
                )

            if f_dump_middle_json:
                md_writer.write_string(
                    f"{pdf_file_name}_middle.json",
                    json.dumps(middle_json, ensure_ascii=False, indent=4),
                )

            if f_dump_model_output:
                md_writer.write_string(
                    f"{pdf_file_name}_model.json",
                    json.dumps(model_json, ensure_ascii=False, indent=4),
                )

            logger.info(f"local output dir is {local_md_dir}")
    else:
        if backend.startswith("vlm-"):
            backend = backend[4:]

        f_draw_span_bbox = False
        parse_method = "vlm"
        for idx, pdf_bytes in enumerate(pdf_bytes_list):
            pdf_file_name = pdf_file_names[idx]
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
            middle_json, infer_result = vlm_doc_analyze(pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url)

            pdf_info = middle_json["pdf_info"]

            if f_draw_layout_bbox:
                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")

            if f_draw_span_bbox:
                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")

            if f_dump_orig_pdf:
                md_writer.write(
                    f"{pdf_file_name}_origin.pdf",
                    pdf_bytes,
                )

            if f_dump_md:
                image_dir = str(os.path.basename(local_image_dir))
                md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}.md",
                    md_content_str,
                )

            if f_dump_content_list:
                image_dir = str(os.path.basename(local_image_dir))
                content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}_content_list.json",
                    json.dumps(content_list, ensure_ascii=False, indent=4),
                )

            if f_dump_middle_json:
                md_writer.write_string(
                    f"{pdf_file_name}_middle.json",
                    json.dumps(middle_json, ensure_ascii=False, indent=4),
                )

            if f_dump_model_output:
                model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
                md_writer.write_string(
                    f"{pdf_file_name}_model_output.txt",
                    model_output,
                )

            logger.info(f"local output dir is {local_md_dir}")


def parse_doc(
    path_list: list[Path],
    output_dir,
    lang="ch",
    backend="pipeline",
    method="auto",
    server_url=None,
    start_page_id=0,  # Start page ID for parsing, default is 0
    end_page_id=None  # End page ID for parsing, default is None (parse all pages until the end of the document)
):
    """
    Parameter description:
    path_list: List of document paths to be parsed; these can be PDF or image files.
    output_dir: Output directory for storing parsing results.
    lang: Language option, default is 'ch'; optional values include ['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka'].
        Set this to the language of the PDF (if known) to improve OCR accuracy. Optional.
        Applies only when the backend is set to "pipeline".
    backend: The backend for parsing PDF:
        pipeline: More general.
        vlm-transformers: More general.
        vlm-sglang-engine: Faster (engine).
        vlm-sglang-client: Faster (client).
        Without a backend specified, pipeline is used by default.
    method: The method for parsing PDF:
        auto: Automatically determine the method based on the file type.
        txt: Use text extraction method.
        ocr: Use OCR method for image-based PDFs.
        Without a method specified, 'auto' is used by default.
        Applies only when the backend is set to "pipeline".
    server_url: When the backend is `sglang-client`, you need to specify the server_url, for example: `http://127.0.0.1:30000`
    """
    try:
        file_name_list = []
        pdf_bytes_list = []
        lang_list = []
        for path in path_list:
            file_name = str(Path(path).stem)
            pdf_bytes = read_fn(path)
            file_name_list.append(file_name)
            pdf_bytes_list.append(pdf_bytes)
            lang_list.append(lang)
        do_parse(
            output_dir=output_dir,
            pdf_file_names=file_name_list,
            pdf_bytes_list=pdf_bytes_list,
            p_lang_list=lang_list,
            backend=backend,
            parse_method=method,
            server_url=server_url,
            start_page_id=start_page_id,
            end_page_id=end_page_id
        )
    except Exception as e:
        logger.exception(e)


if __name__ == '__main__':
    # args
    __dir__ = os.path.dirname(os.path.abspath(__file__))
    pdf_files_dir = os.path.join(__dir__, "pdfs")
    output_dir = os.path.join(__dir__, "output")
    pdf_suffixes = [".pdf"]
    image_suffixes = [".png", ".jpeg", ".jpg"]

    doc_path_list = []
    for doc_path in Path(pdf_files_dir).glob('*'):
        if doc_path.suffix in pdf_suffixes + image_suffixes:
            doc_path_list.append(doc_path)

    """If you cannot download the models because of network issues, set the environment variable MINERU_MODEL_SOURCE to modelscope to download them from a proxy-free mirror."""
    # os.environ['MINERU_MODEL_SOURCE'] = "modelscope"

    """Use pipeline mode if your environment does not support VLM"""
    parse_doc(doc_path_list, output_dir, backend="pipeline")

    """To enable VLM mode, change the backend to 'vlm-xxx'"""
    # parse_doc(doc_path_list, output_dir, backend="vlm-transformers")  # more general
    # parse_doc(doc_path_list, output_dir, backend="vlm-sglang-engine")  # faster (engine)
    # parse_doc(doc_path_list, output_dir, backend="vlm-sglang-client", server_url="http://127.0.0.1:30000")  # faster (client)
```
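The `__main__` block above always parses whole documents. `parse_doc` also exposes `start_page_id` / `end_page_id`, so a page-limited run is possible; the call below is a sketch reusing the names defined in this script, not an additional example shipped with the repo.

```python
# Hypothetical: parse only the first five pages of each collected document
# with the pipeline backend, reusing doc_path_list / output_dir from __main__.
parse_doc(
    doc_path_list,
    output_dir,
    backend="pipeline",
    start_page_id=0,
    end_page_id=4,  # interpreted by do_parse; None means "to the end of the document"
)
```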
```dockerfile
# Use the official Ubuntu base image
FROM swr.cn-central-221.ovaijisuan.com/mindformers/mindformers1.2_mindspore2.3:20240722

USER root

# Set environment variables to non-interactive to avoid prompts during installation
ENV DEBIAN_FRONTEND=noninteractive

# Update the package list and install necessary packages
RUN apt-get update && \
    apt-get install -y \
        software-properties-common && \
    add-apt-repository -y ppa:deadsnakes/ppa && \
    apt-get update && \
    apt-get install -y \
        python3.10 \
        python3.10-venv \
        python3.10-distutils \
        python3.10-dev \
        python3-pip \
        wget \
        git \
        libgl1 \
        libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Set Python 3.10 as the default python3
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

# Create a virtual environment for MinerU
RUN python3 -m venv /opt/mineru_venv

# Copy the configuration file template and install magic-pdf latest
RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json && \
    cp magic-pdf.template.json /root/magic-pdf.json && \
    source /opt/mineru_venv/bin/activate && \
    pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
    pip3 install torch==2.3.1 torchvision==0.18.1 -i https://mirrors.aliyun.com/pypi/simple && \
    pip3 install -U magic-pdf[full] 'numpy<2' decorator attrs absl-py cloudpickle ml-dtypes tornado einops -i https://mirrors.aliyun.com/pypi/simple && \
    wget https://gitee.com/ascend/pytorch/releases/download/v6.0.rc2-pytorch2.3.1/torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl && \
    pip3 install torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl"

# Download models and update the configuration file
RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
    pip3 install modelscope -i https://mirrors.aliyun.com/pypi/simple && \
    wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py && \
    python3 download_models.py && \
    sed -i 's|cpu|npu|g' /root/magic-pdf.json"

# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
```
```diff
@@ -18,37 +18,19 @@ RUN apt-get update && \
         wget \
         git \
         libgl1 \
-        libreoffice \
-        fonts-noto-cjk \
-        fonts-wqy-zenhei \
-        fonts-wqy-microhei \
-        ttf-mscorefonts-installer \
-        fontconfig \
         libglib2.0-0 \
-        libxrender1 \
-        libsm6 \
-        libxext6 \
-        poppler-utils \
     && rm -rf /var/lib/apt/lists/*
 
 # Set Python 3.10 as the default python3
 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
 
-# Create a virtual environment for MinerU
-RUN python3 -m venv /opt/mineru_venv
-
-# Copy the configuration file template and install magic-pdf latest
-RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json && \
-    cp magic-pdf.template.json /root/magic-pdf.json && \
-    source /opt/mineru_venv/bin/activate && \
-    pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
-    pip3 install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple"
+# install mineru latest
+RUN /bin/bash -c "pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
+    pip3 install uv -i https://mirrors.aliyun.com/pypi/simple && \
+    uv pip install 'mineru[all]>=2.0.0' -i https://mirrors.aliyun.com/pypi/simple"
 
 # Download models and update the configuration file
-RUN /bin/bash -c "pip3 install modelscope -i https://mirrors.aliyun.com/pypi/simple && \
-    wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py && \
-    python3 download_models.py && \
-    sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
+RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
 
 # Set the entry point to activate the virtual environment and run the command line tool
-ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
+ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]
```
```diff
@@ -18,37 +18,19 @@ RUN apt-get update && \
         wget \
         git \
         libgl1 \
-        libreoffice \
-        fonts-noto-cjk \
-        fonts-wqy-zenhei \
-        fonts-wqy-microhei \
-        ttf-mscorefonts-installer \
-        fontconfig \
         libglib2.0-0 \
-        libxrender1 \
-        libsm6 \
-        libxext6 \
-        poppler-utils \
     && rm -rf /var/lib/apt/lists/*
 
 # Set Python 3.10 as the default python3
 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
 
-# Create a virtual environment for MinerU
-RUN python3 -m venv /opt/mineru_venv
-
-# Copy the configuration file template and install magic-pdf latest
-RUN /bin/bash -c "wget https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json && \
-    cp magic-pdf.template.json /root/magic-pdf.json && \
-    source /opt/mineru_venv/bin/activate && \
-    pip3 install --upgrade pip && \
-    pip3 install -U magic-pdf[full]"
+# install mineru latest
+RUN /bin/bash -c "pip3 install --upgrade pip && \
+    pip3 install uv && \
+    uv pip install 'mineru[all]>=2.0.0'"
 
 # Download models and update the configuration file
-RUN /bin/bash -c "pip3 install huggingface_hub && \
-    wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models.py && \
-    python3 download_models.py && \
-    sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
+RUN /bin/bash -c "mineru-models-download -s huggingface -m all"
 
 # Set the entry point to activate the virtual environment and run the command line tool
-ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
+ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]
```
# Ascend NPU Acceleration

## Introduction

This document describes how to use MinerU on Ascend NPUs. The content has been verified on a `Huawei Atlas 800T A2` server.

```
CPU: Kunpeng 920 aarch64 2.6GHz
NPU: Ascend 910B 64GB
OS: openEuler 22.03 (LTS-SP3) / Ubuntu 22.04.5 LTS
CANN: 8.0.RC2
Driver version: 24.1.rc2.1
```

Because setting up an environment adapted to Ascend NPUs is fairly involved, we recommend running MinerU in a Docker container.

Before running MinerU via Docker, make sure the host machine has a driver and firmware installed that support CANN 8.0.RC2.

## Build the Image

Make sure you have a stable network connection, then run the following commands to build the image.

```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/ascend_npu/Dockerfile -O Dockerfile
docker build -t mineru_npu:latest .
```

If no errors are reported during the build, the image was built successfully.

## Run the Container

```bash
docker run -it -u root --name mineru-npu --privileged=true \
    --ipc=host \
    --network=host \
    --device=/dev/davinci0 \
    --device=/dev/davinci1 \
    --device=/dev/davinci2 \
    --device=/dev/davinci3 \
    --device=/dev/davinci4 \
    --device=/dev/davinci5 \
    --device=/dev/davinci6 \
    --device=/dev/davinci7 \
    --device=/dev/davinci_manager \
    --device=/dev/devmm_svm \
    --device=/dev/hisi_hdc \
    -v /var/log/npu/:/usr/slog \
    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
    -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
    mineru_npu:latest \
    /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"

magic-pdf --help
```
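Once inside the container, you can verify that parsing works on the NPU with the same CLI used in the other guides; the sample PDF below is the demo file referenced elsewhere in these docs.

```bash
# Inside the container: fetch the sample PDF and parse it with magic-pdf.
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/pdfs/small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```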
# Ubuntu 22.04 LTS
### 1. Check if NVIDIA Drivers Are Installed
```sh
nvidia-smi
```
If you see information similar to the following, it means that the NVIDIA drivers are already installed, and you can skip Step 2.
> [!NOTE]
> Notice: `CUDA Version` should be >= 12.4. If the displayed version number is lower than 12.4, please upgrade the driver.
```plaintext
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.07 Driver Version: 572.83 CUDA Version: 12.8 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3060 Ti WDDM | 00000000:01:00.0 On | N/A |
| 0% 51C P8 12W / 200W | 1489MiB / 8192MiB | 5% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
```
### 2. Install the Driver
If no driver is installed, use the following command:
```sh
sudo apt-get update
sudo apt-get install nvidia-driver-570-server
```
Install the proprietary driver and restart your computer after installation.
```sh
reboot
```
### 3. Install Anaconda
If Anaconda is already installed, skip this step.
```sh
wget https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
bash Anaconda3-2024.06-1-Linux-x86_64.sh
```
In the final step, enter `yes`, close the terminal, and reopen it.
### 4. Create an Environment Using Conda
```bash
conda create -n mineru 'python=3.12' -y
conda activate mineru
```
### 5. Install Applications
```sh
pip install -U magic-pdf[full]
```
> [!TIP]
> After installation, you can check the version of `magic-pdf` using the following command:
>
> ```sh
> magic-pdf --version
> ```
### 6. Download Models
Refer to detailed instructions on [how to download model files](how_to_download_models_en.md).
### 7. Understand the Location of the Configuration File
After completing the [6. Download Models](#6-download-models) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
You can find the `magic-pdf.json` file in your user directory.
> [!TIP]
> The user directory for Linux is "/home/username".
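For reference, a minimal `magic-pdf.json` looks roughly like the sketch below; the exact set of keys depends on your version, and the model path shown here is a placeholder, not the value the script will actually write.

```json
{
    "models-dir": "/home/username/path/to/downloaded/models",
    "device-mode": "cpu"
}
```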
### 8. First Run
Download a sample file from the repository and test it.
```sh
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```
### 9. Test CUDA Acceleration
If your graphics card has at least **6GB** of VRAM, follow these steps to test CUDA acceleration:
1. Modify the value of `"device-mode"` in the `magic-pdf.json` configuration file located in your home directory.
```json
{
"device-mode": "cuda"
}
```
2. Test CUDA acceleration with the following command:
```sh
magic-pdf -p small_ocr.pdf -o ./output
```
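If you want to confirm that PyTorch itself can see the GPU before looking at the MinerU configuration, a quick check (standard PyTorch API, independent of MinerU) is:

```python
import torch

# Prints True and the GPU name if the CUDA build of torch can reach the driver.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```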
# Ubuntu 22.04 LTS

## 1. Check Whether the NVIDIA Driver Is Installed

```bash
nvidia-smi
```

If you see output similar to the following, the NVIDIA driver is already installed and you can skip step 2.

> [!NOTE]
> The `CUDA Version` shown should be >= 12.4. If the displayed version is lower than 12.4, please upgrade the driver.

```plaintext
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.07 Driver Version: 572.83 CUDA Version: 12.8 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3060 Ti WDDM | 00000000:01:00.0 On | N/A |
| 0% 51C P8 12W / 200W | 1489MiB / 8192MiB | 5% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
```

## 2. Install the Driver

If no driver is installed, install one with the following commands:

```bash
sudo apt-get update
sudo apt-get install nvidia-driver-570-server
```

This installs the proprietary driver; reboot the machine after installation.

```bash
reboot
```

## 3. Install Anaconda

If conda is already installed, skip this step.

```bash
wget -U NoSuchBrowser/1.0 https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
bash Anaconda3-2024.06-1-Linux-x86_64.sh
```

In the final step, enter `yes`, then close the terminal and reopen it.

## 4. Create an Environment Using Conda

```bash
conda create -n mineru 'python=3.12' -y
conda activate mineru
```

## 5. Install the Application

```bash
pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
```

> [!TIP]
> After installation, you can check the version of `magic-pdf` with the following command:
>
> ```bash
> magic-pdf --version
> ```

## 6. Download Models

See [how to download model files](how_to_download_models_zh_cn.md) for details.

## 7. Understand the Location of the Configuration File

After completing step [6. Download Models](#6-download-models), the script automatically generates a `magic-pdf.json` file in the user directory and configures the default model path.

You can find the `magic-pdf.json` file in your user directory.

> [!TIP]
> The user directory on Linux is "/home/username".

## 8. First Run

Download a sample file from the repository and test it.

```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/pdfs/small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```

## 9. Test CUDA Acceleration

If your graphics card has at least **6GB** of VRAM, follow these steps to test CUDA-accelerated parsing:

**1. Modify the value of `"device-mode"` in the `magic-pdf.json` configuration file in your user directory.**

```json
{
  "device-mode": "cuda"
}
```

**2. Run the following command to test CUDA acceleration:**

```bash
magic-pdf -p small_ocr.pdf -o ./output
```

> [!TIP]
> You can roughly judge whether CUDA acceleration is in effect from the per-stage cost times printed in the log; with CUDA enabled, parsing is normally faster than on CPU.
# Windows 10/11
### 1. Install CUDA and cuDNN
You need to install a CUDA version that is compatible with torch's requirements. For details, please refer to the [official PyTorch website](https://pytorch.org/get-started/locally/).
- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
- CUDA 12.8 https://developer.nvidia.com/cuda-12-8-0-download-archive
### 2. Install Anaconda
If Anaconda is already installed, you can skip this step.
Download link: https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Windows-x86_64.exe
### 3. Create an Environment Using Conda
```bash
conda create -n mineru 'python=3.12' -y
conda activate mineru
```
### 4. Install Applications
```
pip install -U magic-pdf[full]
```
> [!IMPORTANT]
> After installation, you can check the version of `magic-pdf` using the following command:
>
> ```bash
> magic-pdf --version
> ```
### 5. Download Models
Refer to detailed instructions on [how to download model files](how_to_download_models_en.md).
### 6. Understand the Location of the Configuration File
After completing the [5. Download Models](#5-download-models) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
You can find the `magic-pdf.json` file in your user directory.
> [!TIP]
> The user directory for Windows is "C:/Users/username".
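To confirm the file exists and inspect it on Windows, you can print it from PowerShell (this is just a convenience check, not a required step):

```powershell
# Show the generated configuration file in the current user's profile directory.
Get-Content "$env:USERPROFILE\magic-pdf.json"
```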
### 7. First Run
Download a sample file from the repository and test it.
```powershell
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```
### 8. Test CUDA Acceleration
If your graphics card has at least 6GB of VRAM, follow these steps to test CUDA-accelerated parsing performance.
1. **Overwrite the installation of torch and torchvision** with builds that support CUDA. (Please select the appropriate index-url based on your CUDA version; for more details, refer to the [PyTorch official website](https://pytorch.org/get-started/locally/).)
```
pip install --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu124
```
2. **Modify the value of `"device-mode"`** in the `magic-pdf.json` configuration file located in your user directory.
```json
{
"device-mode": "cuda"
}
```
3. **Run the following command to test CUDA acceleration**:
```
magic-pdf -p small_ocr.pdf -o ./output
```
# Windows 10/11

## 1. Install a CUDA Environment

Install a CUDA version that satisfies torch's requirements; see the [official PyTorch website](https://pytorch.org/get-started/locally/) for details.

- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
- CUDA 12.8 https://developer.nvidia.com/cuda-12-8-0-download-archive

## 2. Install Anaconda

If conda is already installed, skip this step.

Download link:
https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2024.06-1-Windows-x86_64.exe

## 3. Create an Environment Using Conda

```bash
conda create -n mineru 'python=3.12' -y
conda activate mineru
```

## 4. Install the Application

```bash
pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
```

> [!IMPORTANT]
> After installation, you can check the version of `magic-pdf` with the following command:
>
> ```bash
> magic-pdf --version
> ```

## 5. Download Models

See [how to download model files](how_to_download_models_zh_cn.md) for details.

## 6. Understand the Location of the Configuration File

After completing step [5. Download Models](#5-download-models), the script automatically generates a `magic-pdf.json` file in the user directory and configures the default model path.

You can find the `magic-pdf.json` file in your user directory.

> [!TIP]
> The user directory on Windows is "C:/Users/username".

## 7. First Run

Download a sample file from the repository and test it.

```powershell
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```

## 8. Test CUDA Acceleration

If your graphics card has at least **6GB** of VRAM, follow these steps to test CUDA-accelerated parsing:

**1. Reinstall torch and torchvision with CUDA support** (choose the index-url that matches your CUDA version; see the [official PyTorch website](https://pytorch.org/get-started/locally/) for details).

```bash
pip install --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu124
```

**2. Modify the value of `"device-mode"` in the `magic-pdf.json` configuration file in your user directory.**

```json
{
  "device-mode": "cuda"
}
```

**3. Run the following command to test CUDA acceleration:**

```bash
magic-pdf -p small_ocr.pdf -o ./output
```

> [!TIP]
> You can roughly judge whether CUDA acceleration is in effect from the per-stage times printed in the log; with CUDA enabled, parsing normally runs faster than on CPU.
Model downloads are divided into an initial download and updates to an existing model directory. Please refer to the corresponding section below.
# Initial download of model files
## Download the Model from Hugging Face
Use a Python Script to Download Model Files from Hugging Face
```bash
pip install huggingface_hub
wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
python download_models_hf.py
```
The Python script will automatically download the model files and configure the model directory in the configuration file.
The configuration file can be found in the user directory, with the filename `magic-pdf.json`.
# How to update models previously downloaded
## 1. Models downloaded via Hugging Face or ModelScope
If you previously downloaded models via Hugging Face or ModelScope, you can rerun the Python script used for the initial download. This will automatically update the model directory to the latest version.
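After the download script finishes, you can sanity-check which model directory was written into the configuration. The snippet below is a small helper sketch; the `models-dir` key name is an assumption about the generated file, so adjust it if your `magic-pdf.json` differs.

```python
import json
from pathlib import Path

# Read the generated configuration from the user directory and print the
# configured model directory ("models-dir" is assumed; adjust if your
# version of magic-pdf.json uses a different key).
config = json.loads((Path.home() / "magic-pdf.json").read_text(encoding="utf-8"))
print(config.get("models-dir"))
```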
The model download process is split into an initial download and updating an existing model directory; follow the corresponding section below.

# Initial Download of Model Files

The model files can be downloaded from Hugging Face or ModelScope. Because of network restrictions, users in mainland China may fail to reach Hugging Face; in that case, please use ModelScope.

<details>
<summary>Method 1: Download the models from Hugging Face</summary>
<p>Use a Python script to download the model files from Hugging Face</p>
<pre><code>pip install huggingface_hub
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models_hf.py -O download_models_hf.py
python download_models_hf.py</code></pre>
<p>The Python script automatically downloads the model files and configures the model directory in the configuration file.</p>
</details>

## Method 2: Download the models from ModelScope

### Use a Python script to download the model files from ModelScope

```bash
pip install modelscope
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
python download_models.py
```

The Python script automatically downloads the model files and configures the model directory in the configuration file.

The configuration file can be found in the user directory, with the filename `magic-pdf.json`.

> [!TIP]
> The user directory is "C:\\Users\\username" on Windows, "/home/username" on Linux, and "/Users/username" on macOS.

# How to Update Models Previously Downloaded

## 1. Models downloaded via Hugging Face or ModelScope

If you previously downloaded models via Hugging Face or ModelScope, you can rerun the download script you used before; the model directory will be updated to the latest version automatically.
```diff
@@ -5,8 +5,8 @@ from collections import defaultdict
 import numpy as np
 
 from .model_init import AtomModelSingleton
-from ...utils.model_utils import crop_img, get_res_list_from_layout_res, get_coords_and_area
-from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list
+from ...utils.model_utils import crop_img, get_res_list_from_layout_res
+from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list, OcrConfidence
 
 YOLO_LAYOUT_BASE_BATCH_SIZE = 1
 MFD_BASE_BATCH_SIZE = 1
@@ -315,7 +315,7 @@ class BatchAnalyze:
                     ocr_text, ocr_score = ocr_res_list[index]
                     layout_res_item['text'] = ocr_text
                     layout_res_item['score'] = float(f"{ocr_score:.3f}")
-                    if ocr_score < 0.6:
+                    if ocr_score < OcrConfidence.min_confidence:
                         layout_res_item['category_id'] = 16
 
         total_processed += len(img_crop_list)
```
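The hunk above replaces the hard-coded `0.6` threshold with a constant imported from `mineru.utils.ocr_utils`. The real definition is not shown in this diff; a minimal sketch of what such a constants holder looks like is:

```python
# Hypothetical sketch only: the actual OcrConfidence lives in
# mineru/utils/ocr_utils.py and may carry additional fields.
class OcrConfidence:
    min_confidence = 0.6  # spans scoring below this are treated as unreliable
```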
```diff
@@ -2,6 +2,7 @@
 import time
 
 from loguru import logger
+from tqdm import tqdm
 
 from mineru.utils.config_reader import get_device, get_llm_aided_config
 from mineru.backend.pipeline.model_init import AtomModelSingleton
@@ -14,6 +15,7 @@ from mineru.utils.enum_class import ContentType
 from mineru.utils.llm_aided import llm_aided_title
 from mineru.utils.model_utils import clean_memory
 from mineru.backend.pipeline.pipeline_magic_model import MagicModel
+from mineru.utils.ocr_utils import OcrConfidence
 from mineru.utils.span_block_fix import fill_spans_in_blocks, fix_discarded_block, fix_block_spans
 from mineru.utils.span_pre_proc import remove_outside_spans, remove_overlaps_low_confidence_spans, \
     remove_overlaps_min_spans, txt_spans_extract
@@ -163,7 +165,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
 
 def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr_enable=False, formula_enabled=True):
     middle_json = {"pdf_info": [], "_backend":"pipeline", "_version_name": __version__}
-    for page_index, page_model_info in enumerate(model_list):
+    for page_index, page_model_info in tqdm(enumerate(model_list), total=len(model_list), desc="Processing pages"):
         page = pdf_doc[page_index]
         image_dict = images_list[page_index]
         page_info = page_model_info_to_page_info(
@@ -208,7 +210,7 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
             need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
         for index, span in enumerate(need_ocr_list):
             ocr_text, ocr_score = ocr_res_list[index]
-            if ocr_score > 0.6:
+            if ocr_score > OcrConfidence.min_confidence:
                 span['content'] = ocr_text
                 span['score'] = float(f"{ocr_score:.3f}")
             else:
```
```diff
@@ -7,8 +7,8 @@ DEFAULT_SYSTEM_PROMPT = (
 )
 
 DEFAULT_USER_PROMPT = "Document Parsing:"
 DEFAULT_TEMPERATURE = 0.0
-DEFAULT_TOP_P = 0.01
-DEFAULT_TOP_K = 1
+DEFAULT_TOP_P = 0.8
+DEFAULT_TOP_K = 20
 DEFAULT_REPETITION_PENALTY = 1.0
 DEFAULT_PRESENCE_PENALTY = 0.0
 DEFAULT_NO_REPEAT_NGRAM_SIZE = 100
```
```diff
@@ -22,7 +22,7 @@ try:
     hf_loaded = True
 except ImportError as e:
-    logger.warning("hf is not installed. If you are not using huggingface, you can ignore this warning.")
+    logger.warning("hf is not installed. If you are not using transformers, you can ignore this warning.")
 
 engine_loaded = False
 try:
@@ -51,9 +51,9 @@ def get_predictor(
 ) -> BasePredictor:
     start_time = time.time()
 
-    if backend == "huggingface":
+    if backend == "transformers":
         if not model_path:
-            raise ValueError("model_path must be provided for huggingface backend.")
+            raise ValueError("model_path must be provided for transformers backend.")
         if not hf_loaded:
             raise ImportError(
                 "transformers is not installed, so huggingface backend cannot be used. "
@@ -77,7 +77,7 @@ def get_predictor(
             raise ImportError(
                 "sglang is not installed, so sglang-engine backend cannot be used. "
                 "If you need to use sglang-engine backend for inference, "
-                "please install sglang[all]==0.4.6.post4 or a newer version."
+                "please install sglang[all]==0.4.7 or a newer version."
             )
         predictor = SglangEnginePredictor(
             server_args=ServerArgs(model_path, **kwargs),
@@ -104,7 +104,7 @@ def get_predictor(
             http_timeout=http_timeout,
         )
     else:
-        raise ValueError(f"Unsupported backend: {backend}. Supports: huggingface, sglang-engine, sglang-client.")
+        raise ValueError(f"Unsupported backend: {backend}. Supports: transformers, sglang-engine, sglang-client.")
 
     elapsed = round(time.time() - start_time, 2)
     logger.info(f"get_predictor cost: {elapsed}s")
```
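Based only on the parameters visible in this hunk (`backend`, `model_path`), selecting the renamed backend would look roughly like the sketch below; the import path and the model path are assumptions, not taken from this diff.

```python
# Hypothetical usage sketch: "transformers" replaces the old "huggingface"
# backend name; model_path is a placeholder local checkpoint directory.
# from mineru.backend.vlm.predictor import get_predictor  # import path assumed

predictor = get_predictor(
    backend="transformers",
    model_path="/path/to/vlm/model",
)
```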