Unverified Commit bcbbee8c authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #2622 from myhloli/dev

Dev
parents 3cc3f754 ced5a7b4
......@@ -32,14 +32,14 @@ jobs:
- name: Verify version.py
run: |
ls -l magic_pdf/libs/version.py
cat magic_pdf/libs/version.py
ls -l mineru/version.py
cat mineru/version.py
- name: Commit changes
run: |
git config --local user.email "moe@myhloli.com"
git config --local user.name "myhloli"
git add magic_pdf/libs/version.py
git add mineru/version.py
if git diff-index --quiet HEAD; then
echo "No changes to commit"
else
......@@ -71,18 +71,18 @@ jobs:
- name: Verify version.py
run: |
ls -l magic_pdf/libs/version.py
cat magic_pdf/libs/version.py
ls -l mineru/version.py
cat mineru/version.py
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install magic-pdf
- name: Install mineru
run: |
python -m pip install --upgrade pip
pip install -e .[full]
pip install -e .[all]
build:
needs: [ check-install ]
......@@ -103,10 +103,11 @@ jobs:
- name: Install wheel
run: |
python -m pip install wheel
pip install build
- name: Build wheel
run: |
python setup.py bdist_wheel
python -m build --wheel
- name: Upload artifact
uses: actions/upload-artifact@v4
......
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.10"
formats:
- epub
python:
install:
- requirements: next_docs/zh_cn/requirements.txt
sphinx:
configuration: next_docs/zh_cn/conf.py
import os
from pathlib import Path
from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.tools.common import batch_do_parse
def batch(pdf_dir, output_dir, method, lang):
os.makedirs(output_dir, exist_ok=True)
doc_paths = []
for doc_path in Path(pdf_dir).glob('*'):
if doc_path.suffix == '.pdf':
doc_paths.append(doc_path)
    # build dataset with 4 workers
datasets = batch_build_dataset(doc_paths, 4, lang)
# os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "200" # every 200 pages will be parsed in one batch
batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method)
if __name__ == '__main__':
batch("pdfs", "output", "auto", "")
# Copyright (c) Opendatalab. All rights reserved.
import copy
import json
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# args
__dir__ = os.path.dirname(os.path.abspath(__file__))
pdf_file_name = os.path.join(__dir__, "pdfs", "demo1.pdf") # replace with the real pdf path
name_without_extension = os.path.basename(pdf_file_name).split('.')[0]
# prepare env
local_image_dir = os.path.join(__dir__, "output", name_without_extension, "images")
local_md_dir = os.path.join(__dir__, "output", name_without_extension)
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### get model inference result
model_inference_result = infer_result.get_infer_res()
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_extension}_layout.pdf"))
### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_extension}_spans.pdf"))
### get markdown content
md_content = pipe_result.get_markdown(image_dir)
### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_extension}.md", image_dir)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir)
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_extension}_content_list.json", image_dir)
### get middle json
middle_json_content = pipe_result.get_middle_json()
### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_extension}_middle.json')
from pathlib import Path
from loguru import logger
from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.enum_class import MakeMode
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
def do_parse(
output_dir, # Output directory for storing parsing results
pdf_file_names: list[str], # List of PDF file names to be parsed
pdf_bytes_list: list[bytes], # List of PDF bytes to be parsed
p_lang_list: list[str], # List of languages for each PDF, default is 'ch' (Chinese)
backend="pipeline", # The backend for parsing PDF, default is 'pipeline'
parse_method="auto", # The method for parsing PDF, default is 'auto'
p_formula_enable=True, # Enable formula parsing
p_table_enable=True, # Enable table parsing
server_url=None, # Server URL for vlm-sglang-client backend
f_draw_layout_bbox=True, # Whether to draw layout bounding boxes
f_draw_span_bbox=True, # Whether to draw span bounding boxes
f_dump_md=True, # Whether to dump markdown files
f_dump_middle_json=True, # Whether to dump middle JSON files
f_dump_model_output=True, # Whether to dump model output files
f_dump_orig_pdf=True, # Whether to dump original PDF files
f_dump_content_list=True, # Whether to dump content list files
f_make_md_mode=MakeMode.MM_MD, # The mode for making markdown content, default is MM_MD
start_page_id=0, # Start page ID for parsing, default is 0
end_page_id=None, # End page ID for parsing, default is None (parse all pages until the end of the document)
):
if backend == "pipeline":
for idx, pdf_bytes in enumerate(pdf_bytes_list):
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
pdf_bytes_list[idx] = new_pdf_bytes
infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(pdf_bytes_list, p_lang_list, parse_method=parse_method, formula_enable=p_formula_enable,table_enable=p_table_enable)
for idx, model_list in enumerate(infer_results):
model_json = copy.deepcopy(model_list)
pdf_file_name = pdf_file_names[idx]
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
images_list = all_image_lists[idx]
pdf_doc = all_pdf_docs[idx]
_lang = lang_list[idx]
_ocr_enable = ocr_enabled_list[idx]
middle_json = pipeline_result_to_middle_json(model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, p_formula_enable)
pdf_info = middle_json["pdf_info"]
pdf_bytes = pdf_bytes_list[idx]
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
if f_draw_span_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
if f_dump_orig_pdf:
md_writer.write(
f"{pdf_file_name}_origin.pdf",
pdf_bytes,
)
if f_dump_md:
image_dir = str(os.path.basename(local_image_dir))
md_content_str = pipeline_union_make(pdf_info, f_make_md_mode, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
md_content_str,
)
if f_dump_content_list:
image_dir = str(os.path.basename(local_image_dir))
content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
if f_dump_middle_json:
md_writer.write_string(
f"{pdf_file_name}_middle.json",
json.dumps(middle_json, ensure_ascii=False, indent=4),
)
if f_dump_model_output:
md_writer.write_string(
f"{pdf_file_name}_model.json",
json.dumps(model_json, ensure_ascii=False, indent=4),
)
logger.info(f"local output dir is {local_md_dir}")
else:
if backend.startswith("vlm-"):
backend = backend[4:]
f_draw_span_bbox = False
parse_method = "vlm"
for idx, pdf_bytes in enumerate(pdf_bytes_list):
pdf_file_name = pdf_file_names[idx]
pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
middle_json, infer_result = vlm_doc_analyze(pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url)
pdf_info = middle_json["pdf_info"]
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
if f_draw_span_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
if f_dump_orig_pdf:
md_writer.write(
f"{pdf_file_name}_origin.pdf",
pdf_bytes,
)
if f_dump_md:
image_dir = str(os.path.basename(local_image_dir))
md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
md_content_str,
)
if f_dump_content_list:
image_dir = str(os.path.basename(local_image_dir))
content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
if f_dump_middle_json:
md_writer.write_string(
f"{pdf_file_name}_middle.json",
json.dumps(middle_json, ensure_ascii=False, indent=4),
)
if f_dump_model_output:
model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
md_writer.write_string(
f"{pdf_file_name}_model_output.txt",
model_output,
)
logger.info(f"local output dir is {local_md_dir}")
def parse_doc(
path_list: list[Path],
output_dir,
lang="ch",
backend="pipeline",
method="auto",
server_url=None,
start_page_id=0, # Start page ID for parsing, default is 0
end_page_id=None # End page ID for parsing, default is None (parse all pages until the end of the document)
):
"""
Parameter description:
path_list: List of document paths to be parsed, can be PDF or image files.
output_dir: Output directory for storing parsing results.
        lang: Language option, default is 'ch'; optional values include ['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka'].
            Specify the language of the PDF (if known) to improve OCR accuracy. Optional.
            Applies only when the backend is set to "pipeline".
backend: the backend for parsing pdf:
pipeline: More general.
vlm-transformers: More general.
vlm-sglang-engine: Faster(engine).
vlm-sglang-client: Faster(client).
            Without a backend specified, 'pipeline' will be used by default.
method: the method for parsing pdf:
auto: Automatically determine the method based on the file type.
txt: Use text extraction method.
ocr: Use OCR method for image-based PDFs.
Without method specified, 'auto' will be used by default.
            Applies only when the backend is set to "pipeline".
        server_url: When the backend is `vlm-sglang-client`, you need to specify the server URL, for example: `http://127.0.0.1:30000`
"""
try:
file_name_list = []
pdf_bytes_list = []
lang_list = []
for path in path_list:
file_name = str(Path(path).stem)
pdf_bytes = read_fn(path)
file_name_list.append(file_name)
pdf_bytes_list.append(pdf_bytes)
lang_list.append(lang)
do_parse(
output_dir=output_dir,
pdf_file_names=file_name_list,
pdf_bytes_list=pdf_bytes_list,
p_lang_list=lang_list,
backend=backend,
parse_method=method,
server_url=server_url,
start_page_id=start_page_id,
end_page_id=end_page_id
)
except Exception as e:
logger.exception(e)
if __name__ == '__main__':
# args
__dir__ = os.path.dirname(os.path.abspath(__file__))
pdf_files_dir = os.path.join(__dir__, "pdfs")
output_dir = os.path.join(__dir__, "output")
pdf_suffixes = [".pdf"]
image_suffixes = [".png", ".jpeg", ".jpg"]
doc_path_list = []
for doc_path in Path(pdf_files_dir).glob('*'):
if doc_path.suffix in pdf_suffixes + image_suffixes:
doc_path_list.append(doc_path)
"""如果您由于网络问题无法下载模型,可以设置环境变量MINERU_MODEL_SOURCE为modelscope使用免代理仓库下载模型"""
# os.environ['MINERU_MODEL_SOURCE'] = "modelscope"
"""Use pipeline mode if your environment does not support VLM"""
parse_doc(doc_path_list, output_dir, backend="pipeline")
"""To enable VLM mode, change the backend to 'vlm-xxx'"""
# parse_doc(doc_path_list, output_dir, backend="vlm-transformers") # more general.
# parse_doc(doc_path_list, output_dir, backend="vlm-sglang-engine") # faster(engine).
# parse_doc(doc_path_list, output_dir, backend="vlm-sglang-client", server_url="http://127.0.0.1:30000") # faster(client).
\ No newline at end of file
# Use the official Ubuntu base image
FROM swr.cn-central-221.ovaijisuan.com/mindformers/mindformers1.2_mindspore2.3:20240722
USER root
# Set environment variables to non-interactive to avoid prompts during installation
ENV DEBIAN_FRONTEND=noninteractive
# Update the package list and install necessary packages
RUN apt-get update && \
apt-get install -y \
software-properties-common && \
add-apt-repository -y ppa:deadsnakes/ppa && \
apt-get update && \
apt-get install -y \
python3.10 \
python3.10-venv \
python3.10-distutils \
python3.10-dev \
python3-pip \
wget \
git \
libgl1 \
libglib2.0-0 \
&& rm -rf /var/lib/apt/lists/*
# Set Python 3.10 as the default python3
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
# Create a virtual environment for MinerU
RUN python3 -m venv /opt/mineru_venv
# Copy the configuration file template and install magic-pdf latest
RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json && \
cp magic-pdf.template.json /root/magic-pdf.json && \
source /opt/mineru_venv/bin/activate && \
pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
pip3 install torch==2.3.1 torchvision==0.18.1 -i https://mirrors.aliyun.com/pypi/simple && \
pip3 install -U magic-pdf[full] 'numpy<2' decorator attrs absl-py cloudpickle ml-dtypes tornado einops -i https://mirrors.aliyun.com/pypi/simple && \
wget https://gitee.com/ascend/pytorch/releases/download/v6.0.rc2-pytorch2.3.1/torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl && \
pip3 install torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl"
# Download models and update the configuration file
RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
pip3 install modelscope -i https://mirrors.aliyun.com/pypi/simple && \
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py && \
python3 download_models.py && \
sed -i 's|cpu|npu|g' /root/magic-pdf.json"
# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
......@@ -18,37 +18,19 @@ RUN apt-get update && \
wget \
git \
libgl1 \
libreoffice \
fonts-noto-cjk \
fonts-wqy-zenhei \
fonts-wqy-microhei \
ttf-mscorefonts-installer \
fontconfig \
libglib2.0-0 \
libxrender1 \
libsm6 \
libxext6 \
poppler-utils \
&& rm -rf /var/lib/apt/lists/*
# Set Python 3.10 as the default python3
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
# Create a virtual environment for MinerU
RUN python3 -m venv /opt/mineru_venv
# Copy the configuration file template and install magic-pdf latest
RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json && \
cp magic-pdf.template.json /root/magic-pdf.json && \
source /opt/mineru_venv/bin/activate && \
pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
pip3 install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple"
# install mineru latest
RUN /bin/bash -c "pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
pip3 install uv -i https://mirrors.aliyun.com/pypi/simple && \
uv pip install 'mineru[all]>=2.0.0' -i https://mirrors.aliyun.com/pypi/simple"
# Download models and update the configuration file
RUN /bin/bash -c "pip3 install modelscope -i https://mirrors.aliyun.com/pypi/simple && \
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py && \
python3 download_models.py && \
sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]
\ No newline at end of file
......@@ -18,37 +18,19 @@ RUN apt-get update && \
wget \
git \
libgl1 \
libreoffice \
fonts-noto-cjk \
fonts-wqy-zenhei \
fonts-wqy-microhei \
ttf-mscorefonts-installer \
fontconfig \
libglib2.0-0 \
libxrender1 \
libsm6 \
libxext6 \
poppler-utils \
&& rm -rf /var/lib/apt/lists/*
# Set Python 3.10 as the default python3
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
# Create a virtual environment for MinerU
RUN python3 -m venv /opt/mineru_venv
# Copy the configuration file template and install magic-pdf latest
RUN /bin/bash -c "wget https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json && \
cp magic-pdf.template.json /root/magic-pdf.json && \
source /opt/mineru_venv/bin/activate && \
pip3 install --upgrade pip && \
pip3 install -U magic-pdf[full]"
# install mineru latest
RUN /bin/bash -c "pip3 install --upgrade pip && \
pip3 install uv && \
uv pip install 'mineru[all]>=2.0.0'"
# Download models and update the configuration file
RUN /bin/bash -c "pip3 install huggingface_hub && \
wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models.py && \
python3 download_models.py && \
sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
RUN /bin/bash -c "mineru-models-download -s huggingface -m all"
# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]
\ No newline at end of file
# Ascend NPU Acceleration
## Introduction
This document describes how to run MinerU on Ascend NPUs. The instructions have been verified on a `Huawei Atlas 800T A2` server with the following configuration:
```
CPU: Kunpeng 920 aarch64 2.6GHz
NPU: Ascend 910B 64GB
OS: openEuler 22.03 (LTS-SP3) / Ubuntu 22.04.5 LTS
CANN: 8.0.RC2
Driver version: 24.1.rc2.1
```
Because adapting the environment for Ascend NPUs is fairly involved, we recommend running MinerU in a Docker container.
Before running MinerU through Docker, make sure the host machine already has drivers and firmware that support CANN 8.0.RC2 installed.
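As a quick sanity check before building the image, you can confirm that the host driver is installed and the NPUs are visible. This is a minimal sketch; it assumes the Ascend driver package (which provides the `npu-smi` tool) is already installed on the host.
```bash
# List the Ascend NPUs visible on the host and report driver/firmware status.
# If this command is missing or shows no devices, fix the host driver and
# firmware installation before building or running the MinerU container.
npu-smi info
```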
## Build the Image
Keep your network connection stable and run the following commands to build the image.
```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/ascend_npu/Dockerfile -O Dockerfile
docker build -t mineru_npu:latest .
```
If no errors are reported during the build, the image was built successfully.
## Run the Container
```bash
docker run -it -u root --name mineru-npu --privileged=true \
--ipc=host \
--network=host \
--device=/dev/davinci0 \
--device=/dev/davinci1 \
--device=/dev/davinci2 \
--device=/dev/davinci3 \
--device=/dev/davinci4 \
--device=/dev/davinci5 \
--device=/dev/davinci6 \
--device=/dev/davinci7 \
--device=/dev/davinci_manager \
--device=/dev/devmm_svm \
--device=/dev/hisi_hdc \
-v /var/log/npu/:/usr/slog \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
mineru_npu:latest \
/bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
magic-pdf --help
```
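Once the container is running and the virtual environment is active, you can do a quick end-to-end check by parsing a small PDF. The sample file below follows the other installation guides and is only an illustration:
```bash
# Download a small sample PDF and parse it; the markdown, JSON, and
# visualization outputs are written to ./output
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```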
# Ubuntu 22.04 LTS
### 1. Check if NVIDIA Drivers Are Installed
```sh
nvidia-smi
```
If you see information similar to the following, it means that the NVIDIA drivers are already installed, and you can skip Step 2.
> [!NOTE]
> Notice: `CUDA Version` should be >= 12.4. If the displayed version is lower than 12.4, please upgrade the driver.
```plaintext
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.07 Driver Version: 572.83 CUDA Version: 12.8 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3060 Ti WDDM | 00000000:01:00.0 On | N/A |
| 0% 51C P8 12W / 200W | 1489MiB / 8192MiB | 5% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
```
### 2. Install the Driver
If no driver is installed, use the following command:
```sh
sudo apt-get update
sudo apt-get install nvidia-driver-570-server
```
Install the proprietary driver and restart your computer after installation.
```sh
reboot
```
### 3. Install Anaconda
If Anaconda is already installed, skip this step.
```sh
wget https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
bash Anaconda3-2024.06-1-Linux-x86_64.sh
```
In the final step, enter `yes`, close the terminal, and reopen it.
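To confirm the installation took effect in the new shell, you can, for example, check that `conda` is now on the PATH:
```sh
# Should print the installed conda version
conda --version
```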
### 4. Create an Environment Using Conda
```bash
conda create -n mineru 'python=3.12' -y
conda activate mineru
```
### 5. Install Applications
```sh
pip install -U magic-pdf[full]
```
> [!TIP]
> After installation, you can check the version of `magic-pdf` using the following command:
>
> ```sh
> magic-pdf --version
> ```
### 6. Download Models
Refer to detailed instructions on [how to download model files](how_to_download_models_en.md).
### 7. Understand the Location of the Configuration File
After completing the [6. Download Models](#6-download-models) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
You can find the `magic-pdf.json` file in your user directory.
> [!TIP]
> The user directory for Linux is "/home/username".
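For example, assuming a standard home directory, you can verify that the file was generated and inspect the configured model paths:
```sh
# Print the auto-generated configuration, including the model directory
# written by the model download script.
cat ~/magic-pdf.json
```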
### 8. First Run
Download a sample file from the repository and test it.
```sh
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```
### 9. Test CUDA Acceleration
If your graphics card has at least **6GB** of VRAM, follow these steps to test CUDA acceleration:
1. Modify the value of `"device-mode"` in the `magic-pdf.json` configuration file located in your home directory.
```json
{
"device-mode": "cuda"
}
```
2. Test CUDA acceleration with the following command:
```sh
magic-pdf -p small_ocr.pdf -o ./output
```
\ No newline at end of file
# Ubuntu 22.04 LTS
## 1. Check Whether the NVIDIA Driver Is Installed
```bash
nvidia-smi
```
If you see output similar to the following, the NVIDIA driver is already installed and you can skip Step 2.
> [!NOTE]
> The displayed `CUDA Version` should be >= 12.4; if it is lower than 12.4, please upgrade the driver.
```plaintext
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.07 Driver Version: 572.83 CUDA Version: 12.8 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3060 Ti WDDM | 00000000:01:00.0 On | N/A |
| 0% 51C P8 12W / 200W | 1489MiB / 8192MiB | 5% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
```
## 2. Install the Driver
If no driver is installed, use the following commands:
```bash
sudo apt-get update
sudo apt-get install nvidia-driver-570-server
```
Install the proprietary driver and restart the computer after installation.
```bash
reboot
```
## 3. Install Anaconda
If conda is already installed, skip this step.
```bash
wget -U NoSuchBrowser/1.0 https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
bash Anaconda3-2024.06-1-Linux-x86_64.sh
```
In the final step, enter `yes`, then close and reopen the terminal.
## 4. Create an Environment Using Conda
```bash
conda create -n mineru 'python=3.12' -y
conda activate mineru
```
## 5. Install the Application
```bash
pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
```
> [!TIP]
> After installation, you can check the version of `magic-pdf` with the following command:
>
> ```bash
> magic-pdf --version
> ```
## 6. Download Models
See [how to download model files](how_to_download_models_zh_cn.md) for details.
## 7. Understand the Location of the Configuration File
After completing the [6. Download Models](#6-download-models) step, the script automatically generates a `magic-pdf.json` file in your user directory and configures the default model path.
You can find the `magic-pdf.json` file in your user directory.
> [!TIP]
> The user directory on Linux is "/home/username".
## 8. First Run
Download a sample file from the repository and test it.
```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/pdfs/small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```
## 9. Test CUDA Acceleration
If your graphics card has at least **6GB** of VRAM, follow these steps to test CUDA-accelerated parsing:
**1. Modify the value of `"device-mode"` in the `magic-pdf.json` configuration file in your user directory**
```json
{
"device-mode":"cuda"
}
```
**2. Run the following command to test CUDA acceleration**
```bash
magic-pdf -p small_ocr.pdf -o ./output
```
> [!TIP]
> You can roughly judge whether CUDA acceleration is working from the per-stage cost times printed in the log; with CUDA enabled, each stage normally runs faster than on the CPU.
# Windows 10/11
### 1. Install CUDA and cuDNN
You need to install a CUDA version that is compatible with torch's requirements. For details, please refer to the [official PyTorch website](https://pytorch.org/get-started/locally/).
- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
- CUDA 12.8 https://developer.nvidia.com/cuda-12-8-0-download-archive
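Before moving on, you can optionally confirm from a terminal that the NVIDIA driver is installed and reports a supported CUDA version, mirroring the check described in the Ubuntu guide:
```
nvidia-smi
```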
### 2. Install Anaconda
If Anaconda is already installed, you can skip this step.
Download link: https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Windows-x86_64.exe
### 3. Create an Environment Using Conda
```bash
conda create -n mineru 'python=3.12' -y
conda activate mineru
```
### 4. Install Applications
```
pip install -U magic-pdf[full]
```
> [!IMPORTANT]
> After installation, you can check the version of `magic-pdf` using the following command:
>
> ```bash
> magic-pdf --version
> ```
### 5. Download Models
Refer to detailed instructions on [how to download model files](how_to_download_models_en.md).
### 6. Understand the Location of the Configuration File
After completing the [5. Download Models](#5-download-models) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
You can find the `magic-pdf.json` file in your user directory.
> [!TIP]
> The user directory for Windows is "C:/Users/username".
### 7. First Run
Download a sample file from the repository and test it.
```powershell
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```
### 8. Test CUDA Acceleration
If your graphics card has at least 6GB of VRAM, follow these steps to test CUDA-accelerated parsing performance.
1. **Overwrite the installation of torch and torchvision** with CUDA-enabled builds. (Please select the appropriate index-url based on your CUDA version; for more details, refer to the [PyTorch official website](https://pytorch.org/get-started/locally/).)
```
pip install --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu124
```
2. **Modify the value of `"device-mode"`** in the `magic-pdf.json` configuration file located in your user directory.
```json
{
"device-mode": "cuda"
}
```
3. **Run the following command to test CUDA acceleration**:
```
magic-pdf -p small_ocr.pdf -o ./output
```
\ No newline at end of file
# Windows 10/11
## 1. Install CUDA
Install a CUDA version that meets torch's requirements; see the [official PyTorch website](https://pytorch.org/get-started/locally/) for details.
- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
- CUDA 12.8 https://developer.nvidia.com/cuda-12-8-0-download-archive
## 2. Install Anaconda
If conda is already installed, skip this step.
Download link:
https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2024.06-1-Windows-x86_64.exe
## 3. Create an Environment Using Conda
```bash
conda create -n mineru 'python=3.12' -y
conda activate mineru
```
## 4. Install the Application
```bash
pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
```
> [!IMPORTANT]
> After installation, you can check the version of `magic-pdf` with the following command:
>
> ```bash
> magic-pdf --version
> ```
## 5. Download Models
See [how to download model files](how_to_download_models_zh_cn.md) for details.
## 6. Understand the Location of the Configuration File
After completing the [5. Download Models](#5-download-models) step, the script automatically generates a `magic-pdf.json` file in your user directory and configures the default model path.
You can find the `magic-pdf.json` file in your user directory.
> [!TIP]
> The user directory on Windows is "C:/Users/username".
## 7. First Run
Download a sample file from the repository and test it.
```powershell
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
```
## 8. Test CUDA Acceleration
If your graphics card has at least **6GB** of VRAM, follow these steps to test CUDA-accelerated parsing:
**1. Overwrite the installation of torch and torchvision with CUDA-enabled builds** (choose the appropriate index-url for your CUDA version; see the [official PyTorch website](https://pytorch.org/get-started/locally/) for details)
```bash
pip install --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu124
```
**2. Modify the value of `"device-mode"` in the `magic-pdf.json` configuration file in your user directory**
```json
{
"device-mode":"cuda"
}
```
**3. Run the following command to test CUDA acceleration**
```bash
magic-pdf -p small_ocr.pdf -o ./output
```
> [!TIP]
> You can roughly judge whether CUDA acceleration is working from the per-stage times printed in the log; with CUDA enabled, parsing normally runs faster than on the CPU.
Model downloads are divided into initial downloads and updates to the model directory. Please refer to the corresponding documentation for instructions on how to proceed.
# Initial download of model files
### Download the Model from Hugging Face
Use a Python Script to Download Model Files from Hugging Face
```bash
pip install huggingface_hub
wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
python download_models_hf.py
```
The Python script will automatically download the model files and configure the model directory in the configuration file.
The configuration file can be found in the user directory, with the filename `magic-pdf.json`.
# How to update models previously downloaded
## 1. Models downloaded via Hugging Face or ModelScope
If you previously downloaded models via Hugging Face or ModelScope, you can rerun the Python script used for the initial download. This will automatically update the model directory to the latest version.
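For example, if the models were originally fetched with the Hugging Face script shown above, rerunning the same command performs the update:
```bash
# Rerunning the original download script refreshes the model directory and
# keeps the model path recorded in magic-pdf.json up to date.
python download_models_hf.py
```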
Model downloads are divided into initial downloads and updates to the model directory. Please refer to the corresponding documentation for instructions on how to proceed.
# Initial download of model files
Model files can be downloaded from Hugging Face or ModelScope. Due to network restrictions, users in mainland China may be unable to reach Hugging Face; in that case, please use ModelScope.
<details>
<summary>Method 1: Download the model from Hugging Face</summary>
<p>Use a Python script to download model files from Hugging Face</p>
<pre><code>pip install huggingface_hub
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models_hf.py -O download_models_hf.py
python download_models_hf.py</code></pre>
<p>The Python script automatically downloads the model files and configures the model directory in the configuration file</p>
</details>
## Method 2: Download the model from ModelScope
### Use a Python script to download model files from ModelScope
```bash
pip install modelscope
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
python download_models.py
```
The Python script will automatically download the model files and configure the model directory in the configuration file.
The configuration file can be found in the user directory, with the filename `magic-pdf.json`.
> [!TIP]
> The user directory is "C:\\Users\\username" on Windows, "/home/username" on Linux, and "/Users/username" on macOS.
# How to update models previously downloaded
## 1. Models downloaded via Hugging Face or ModelScope
If you previously downloaded models via Hugging Face or ModelScope, you can rerun the Python download script you used before; it will automatically update the model directory to the latest version.
......@@ -5,8 +5,8 @@ from collections import defaultdict
import numpy as np
from .model_init import AtomModelSingleton
from ...utils.model_utils import crop_img, get_res_list_from_layout_res, get_coords_and_area
from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list
from ...utils.model_utils import crop_img, get_res_list_from_layout_res
from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list, OcrConfidence
YOLO_LAYOUT_BASE_BATCH_SIZE = 1
MFD_BASE_BATCH_SIZE = 1
......@@ -315,7 +315,7 @@ class BatchAnalyze:
ocr_text, ocr_score = ocr_res_list[index]
layout_res_item['text'] = ocr_text
layout_res_item['score'] = float(f"{ocr_score:.3f}")
if ocr_score < 0.6:
if ocr_score < OcrConfidence.min_confidence:
layout_res_item['category_id'] = 16
total_processed += len(img_crop_list)
......
......@@ -2,6 +2,7 @@
import time
from loguru import logger
from tqdm import tqdm
from mineru.utils.config_reader import get_device, get_llm_aided_config
from mineru.backend.pipeline.model_init import AtomModelSingleton
......@@ -14,6 +15,7 @@ from mineru.utils.enum_class import ContentType
from mineru.utils.llm_aided import llm_aided_title
from mineru.utils.model_utils import clean_memory
from mineru.backend.pipeline.pipeline_magic_model import MagicModel
from mineru.utils.ocr_utils import OcrConfidence
from mineru.utils.span_block_fix import fill_spans_in_blocks, fix_discarded_block, fix_block_spans
from mineru.utils.span_pre_proc import remove_outside_spans, remove_overlaps_low_confidence_spans, \
remove_overlaps_min_spans, txt_spans_extract
......@@ -163,7 +165,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr_enable=False, formula_enabled=True):
middle_json = {"pdf_info": [], "_backend":"pipeline", "_version_name": __version__}
for page_index, page_model_info in enumerate(model_list):
for page_index, page_model_info in tqdm(enumerate(model_list), total=len(model_list), desc="Processing pages"):
page = pdf_doc[page_index]
image_dict = images_list[page_index]
page_info = page_model_info_to_page_info(
......@@ -208,7 +210,7 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
for index, span in enumerate(need_ocr_list):
ocr_text, ocr_score = ocr_res_list[index]
if ocr_score > 0.6:
if ocr_score > OcrConfidence.min_confidence:
span['content'] = ocr_text
span['score'] = float(f"{ocr_score:.3f}")
else:
......
......@@ -7,8 +7,8 @@ DEFAULT_SYSTEM_PROMPT = (
)
DEFAULT_USER_PROMPT = "Document Parsing:"
DEFAULT_TEMPERATURE = 0.0
DEFAULT_TOP_P = 0.01
DEFAULT_TOP_K = 1
DEFAULT_TOP_P = 0.8
DEFAULT_TOP_K = 20
DEFAULT_REPETITION_PENALTY = 1.0
DEFAULT_PRESENCE_PENALTY = 0.0
DEFAULT_NO_REPEAT_NGRAM_SIZE = 100
......
......@@ -22,7 +22,7 @@ try:
hf_loaded = True
except ImportError as e:
logger.warning("hf is not installed. If you are not using huggingface, you can ignore this warning.")
logger.warning("hf is not installed. If you are not using transformers, you can ignore this warning.")
engine_loaded = False
try:
......@@ -51,9 +51,9 @@ def get_predictor(
) -> BasePredictor:
start_time = time.time()
if backend == "huggingface":
if backend == "transformers":
if not model_path:
raise ValueError("model_path must be provided for huggingface backend.")
raise ValueError("model_path must be provided for transformers backend.")
if not hf_loaded:
raise ImportError(
"transformers is not installed, so huggingface backend cannot be used. "
......@@ -77,7 +77,7 @@ def get_predictor(
raise ImportError(
"sglang is not installed, so sglang-engine backend cannot be used. "
"If you need to use sglang-engine backend for inference, "
"please install sglang[all]==0.4.6.post4 or a newer version."
"please install sglang[all]==0.4.7 or a newer version."
)
predictor = SglangEnginePredictor(
server_args=ServerArgs(model_path, **kwargs),
......@@ -104,7 +104,7 @@ def get_predictor(
http_timeout=http_timeout,
)
else:
raise ValueError(f"Unsupported backend: {backend}. Supports: huggingface, sglang-engine, sglang-client.")
raise ValueError(f"Unsupported backend: {backend}. Supports: transformers, sglang-engine, sglang-client.")
elapsed = round(time.time() - start_time, 2)
logger.info(f"get_predictor cost: {elapsed}s")
......