Unverified Commit 0bdbdffc authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #3076 from opendatalab/release-2.1.1

Release 2.1.1
parents cad4c585 4f88955d
...@@ -14,33 +14,37 @@ on: ...@@ -14,33 +14,37 @@ on:
jobs: jobs:
cli-test: cli-test:
if: github.repository == 'opendatalab/MinerU' if: github.repository == 'opendatalab/MinerU'
runs-on: pdf runs-on: ubuntu-latest
timeout-minutes: 240 timeout-minutes: 240
strategy: strategy:
fail-fast: true fail-fast: true
steps: steps:
- name: PDF cli - name: PDF cli
uses: actions/checkout@v3 uses: actions/checkout@v4
with: with:
ref: dev
fetch-depth: 2 fetch-depth: 2
- name: install uv
uses: astral-sh/setup-uv@v5
- name: install&test - name: install&test
run: | run: |
source activate mineru uv --version
conda env list uv venv --python 3.12
pip show coverage source .venv/bin/activate
cd $GITHUB_WORKSPACE && sh tests/retry_env.sh uv pip install .[test]
# cd $GITHUB_WORKSPACE && python tests/clean_coverage.py cd $GITHUB_WORKSPACE && python tests/clean_coverage.py
# cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing cd $GITHUB_WORKSPACE && coverage run
# cd $GITHUB_WORKSPACE && python tests/get_coverage.py cd $GITHUB_WORKSPACE && python tests/get_coverage.py
cd $GITHUB_WORKSPACE && pytest -m P0 -s -v tests/test_cli/test_cli_sdk.py
notify_to_feishu: notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure')}} if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure')}}
needs: cli-test needs: cli-test
runs-on: pdf runs-on: ubuntu-latest
steps: steps:
- name: notify - name: notify
run: | run: |
curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'$USER_ID'"}]]}}}}' $WEBHOOK_URL curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }}
...@@ -13,49 +13,36 @@ on: ...@@ -13,49 +13,36 @@ on:
jobs: jobs:
cli-test: cli-test:
if: github.repository == 'opendatalab/MinerU' if: github.repository == 'opendatalab/MinerU'
runs-on: pdf runs-on: ubuntu-latest
timeout-minutes: 240 timeout-minutes: 240
strategy: strategy:
fail-fast: true fail-fast: true
steps: steps:
- name: PDF cli - name: PDF cli
uses: actions/checkout@v3 uses: actions/checkout@v4
with: with:
ref: dev
fetch-depth: 2 fetch-depth: 2
- name: install uv
uses: astral-sh/setup-uv@v5
- name: install&test - name: install&test
run: | run: |
source activate mineru uv --version
conda env list uv venv --python 3.12
pip show coverage source .venv/bin/activate
cd $GITHUB_WORKSPACE && sh tests/retry_env.sh uv pip install .[test]
# cd $GITHUB_WORKSPACE && python tests/clean_coverage.py cd $GITHUB_WORKSPACE && python tests/clean_coverage.py
# cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing cd $GITHUB_WORKSPACE && coverage run
# cd $GITHUB_WORKSPACE && python tests/get_coverage.py cd $GITHUB_WORKSPACE && python tests/get_coverage.py
cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli_sdk.py
notify_to_feishu: notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure')}} if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure')}}
needs: cli-test needs: cli-test
runs-on: pdf runs-on: ubuntu-latest
steps: steps:
- name: get_actor
run: |
metion_list="dt-yy"
echo $GITHUB_ACTOR
if [[ $GITHUB_ACTOR == "drunkpig" ]]; then
metion_list="xuchao"
elif [[ $GITHUB_ACTOR == "myhloli" ]]; then
metion_list="zhaoxiaomeng"
elif [[ $GITHUB_ACTOR == "icecraft" ]]; then
metion_list="xurui1"
fi
echo $metion_list
echo "METIONS=$metion_list" >> "$GITHUB_ENV"
echo ${{ env.METIONS }}
- name: notify - name: notify
run: | run: |
#echo ${{ secrets.USER_ID }} curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }}
curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'$USER_ID'"}]]}}}}' $WEBHOOK_URL
# Workflow that publishes the project documentation via GitHub Pages
# using the mkdocs-deploy-gh-pages action.
name: Publish docs via GitHub Pages
# Triggered by a push to either the "master" or the "dev" branch.
on:
push:
branches:
- "master"
- "dev"
jobs:
build:
name: Deploy docs
runs-on: ubuntu-latest
steps:
# NOTE(review): despite the step name, `ref: dev` below checks out the
# `dev` branch, independent of which branch triggered the run.
- name: Checkout master
uses: actions/checkout@v4
with:
ref: dev
# NOTE(review): the action is referenced by branch (`@master`), not by a
# release tag or commit SHA.
- name: Deploy docs
uses: mhausenblas/mkdocs-deploy-gh-pages@master
# Or use mhausenblas/mkdocs-deploy-gh-pages@nomaterial to build without the mkdocs-material theme
# GITHUB_TOKEN is taken from the RELEASE_TOKEN repository secret.
# REQUIREMENTS is presumably the pip requirements file the action installs
# before building - TODO confirm against the action's documentation.
env:
GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
REQUIREMENTS: /docs/requirements.txt
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mineru)](https://pypi.org/project/mineru/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mineru)](https://pypi.org/project/mineru/)
[![Downloads](https://static.pepy.tech/badge/mineru)](https://pepy.tech/project/mineru) [![Downloads](https://static.pepy.tech/badge/mineru)](https://pepy.tech/project/mineru)
[![Downloads](https://static.pepy.tech/badge/mineru/month)](https://pepy.tech/project/mineru) [![Downloads](https://static.pepy.tech/badge/mineru/month)](https://pepy.tech/project/mineru)
[![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github) [![OpenDataLab](https://img.shields.io/badge/webapp_on_mineru.net-blue?logo=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github)
[![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU) [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
[![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU) [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb)
...@@ -31,9 +31,6 @@ ...@@ -31,9 +31,6 @@
<!-- hot link --> <!-- hot link -->
<p align="center"> <p align="center">
<a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: High-Quality PDF Extraction Toolkit</a>🔥🔥🔥
<br>
<br>
🚀<a href="https://mineru.net/?source=github">Access MinerU Now→✅ Zero-Install Web Version ✅ Full-Featured Desktop Client ✅ Instant API Access; Skip deployment headaches – get all product formats in one click. Developers, dive in!</a> 🚀<a href="https://mineru.net/?source=github">Access MinerU Now→✅ Zero-Install Web Version ✅ Full-Featured Desktop Client ✅ Instant API Access; Skip deployment headaches – get all product formats in one click. Developers, dive in!</a>
</p> </p>
...@@ -47,6 +44,14 @@ ...@@ -47,6 +44,14 @@
# Changelog # Changelog
- 2025/07/16 Version 2.1.1 Released
- Bug fixes
- Fixed text block content loss issue that could occur in certain `pipeline` scenarios #3005
- Fixed issue where `sglang-client` required unnecessary packages like `torch` #2968
- Updated `dockerfile` to fix incomplete text content parsing due to missing fonts in Linux #2915
- Usability improvements
- Updated `compose.yaml` to facilitate direct startup of `sglang-server`, `mineru-api`, and `mineru-gradio` services
- Launched a brand-new [online documentation site](https://opendatalab.github.io/MinerU/) and simplified the README, providing a better documentation experience
- 2025/07/05 Version 2.1.0 Released - 2025/07/05 Version 2.1.0 Released
- This is the first major update of MinerU 2, which includes a large number of new features and improvements, covering significant performance optimizations, user experience enhancements, and bug fixes. The detailed update contents are as follows: - This is the first major update of MinerU 2, which includes a large number of new features and improvements, covering significant performance optimizations, user experience enhancements, and bug fixes. The detailed update contents are as follows:
- **Performance Optimizations:** - **Performance Optimizations:**
...@@ -398,36 +403,6 @@ ...@@ -398,36 +403,6 @@
</details> </details>
</details> </details>
<!-- TABLE OF CONTENT -->
<details open="open">
<summary><h2 style="display: inline-block">Table of Contents</h2></summary>
<ol>
<li>
<a href="#mineru">MinerU</a>
<ul>
<li><a href="#project-introduction">Project Introduction</a></li>
<li><a href="#key-features">Key Features</a></li>
<li><a href="#quick-start">Quick Start</a>
<ul>
<li><a href="#online-demo">Online Demo</a></li>
<li><a href="#local-deployment">Local Deployment</a></li>
</ul>
</li>
</ul>
</li>
<li><a href="#todo">TODO</a></li>
<li><a href="#known-issues">Known Issues</a></li>
<li><a href="#faq">FAQ</a></li>
<li><a href="#all-thanks-to-our-contributors">All Thanks To Our Contributors</a></li>
<li><a href="#license-information">License Information</a></li>
<li><a href="#acknowledgments">Acknowledgments</a></li>
<li><a href="#citation">Citation</a></li>
<li><a href="#star-history">Star History</a></li>
<li><a href="#links">Links</a></li>
</ol>
</details>
# MinerU # MinerU
## Project Introduction ## Project Introduction
...@@ -453,14 +428,25 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c ...@@ -453,14 +428,25 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
- Supports running in a pure CPU environment, and also supports GPU(CUDA)/NPU(CANN)/MPS acceleration - Supports running in a pure CPU environment, and also supports GPU(CUDA)/NPU(CANN)/MPS acceleration
- Compatible with Windows, Linux, and Mac platforms. - Compatible with Windows, Linux, and Mac platforms.
## Quick Start # Quick Start
If you encounter any installation issues, please first consult the <a href="#faq">FAQ</a>. <br> If you encounter any installation issues, please first consult the <a href="#faq">FAQ</a>. <br>
If the parsing results are not as expected, refer to the <a href="#known-issues">Known Issues</a>. <br> If the parsing results are not as expected, refer to the <a href="#known-issues">Known Issues</a>. <br>
There are three different ways to experience MinerU:
- [Online Demo](#online-demo) ## Online Experience
- [Local Deployment](#local-deployment)
### Official online web application
The official online version has the same functionality as the client, with a beautiful interface and rich features. Login is required to use it.
- [![OpenDataLab](https://img.shields.io/badge/webapp_on_mineru.net-blue?logo=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github)
### Gradio-based online demo
A WebUI built with Gradio, offering a simple interface with only the core parsing functionality. No login is required.
- [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
- [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
## Local Deployment
> [!WARNING] > [!WARNING]
...@@ -481,9 +467,9 @@ There are three different ways to experience MinerU: ...@@ -481,9 +467,9 @@ There are three different ways to experience MinerU:
</tr> </tr>
<tr> <tr>
<td>Operating System</td> <td>Operating System</td>
<td>windows/linux/mac</td> <td>Linux / Windows / macOS</td>
<td>windows/linux</td> <td>Linux / Windows</td>
<td>windows(wsl2)/linux</td> <td>Linux / Windows (via WSL2)</td>
</tr> </tr>
<tr> <tr>
<td>CPU Inference Support</td> <td>CPU Inference Support</td>
...@@ -492,12 +478,12 @@ There are three different ways to experience MinerU: ...@@ -492,12 +478,12 @@ There are three different ways to experience MinerU:
</tr> </tr>
<tr> <tr>
<td>GPU Requirements</td> <td>GPU Requirements</td>
<td>Turing architecture or later, 6GB+ VRAM or Apple Silicon</td> <td>Turing architecture and later, 6GB+ VRAM or Apple Silicon</td>
<td colspan="2">Turing architecture or later, 8GB+ VRAM</td> <td colspan="2">Turing architecture and later, 8GB+ VRAM</td>
</tr> </tr>
<tr> <tr>
<td>Memory Requirements</td> <td>Memory Requirements</td>
<td colspan="3">Minimum 16GB+, 32GB+ recommended</td> <td colspan="3">Minimum 16GB+, recommended 32GB+</td>
</tr> </tr>
<tr> <tr>
<td>Disk Space Requirements</td> <td>Disk Space Requirements</td>
...@@ -509,280 +495,42 @@ There are three different ways to experience MinerU: ...@@ -509,280 +495,42 @@ There are three different ways to experience MinerU:
</tr> </tr>
</table> </table>
## Online Demo ### Install MinerU
[![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github)
[![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
[![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
## Local Deployment
### 1. Install MinerU
#### 1.1 Install via pip or uv
#### Install MinerU using pip or uv
```bash ```bash
pip install --upgrade pip pip install --upgrade pip
pip install uv pip install uv
uv pip install -U "mineru[core]" uv pip install -U "mineru[core]"
``` ```
#### 1.2 Install from source #### Install MinerU from source code
```bash ```bash
git clone https://github.com/opendatalab/MinerU.git git clone https://github.com/opendatalab/MinerU.git
cd MinerU cd MinerU
uv pip install -e .[core] uv pip install -e .[core]
``` ```
> [!NOTE]
> Linux and macOS systems automatically support CUDA/MPS acceleration after installation. For Windows users who want to use CUDA acceleration,
> please visit the [PyTorch official website](https://pytorch.org/get-started/locally/) to install PyTorch with the appropriate CUDA version.
#### 1.3 Install Full Version (supports sglang acceleration) (requires device with Turing or newer architecture and at least 8GB GPU memory)
If you need to use **sglang to accelerate VLM model inference**, you can choose any of the following methods to install the full version:
- Install using uv or pip:
```bash
uv pip install -U "mineru[all]"
```
- Install from source:
```bash
uv pip install -e .[all]
```
> [!TIP] > [!TIP]
> If any exceptions occur during the installation of `sglang`, please refer to the [official sglang documentation](https://docs.sglang.ai/start/install.html) for troubleshooting and solutions, or directly use Docker-based installation. > `mineru[core]` includes all core features except `sglang` acceleration, compatible with Windows / Linux / macOS systems, suitable for most users.
> If you need to use `sglang` acceleration for VLM model inference or install a lightweight client on edge devices, please refer to the documentation [Extension Modules Installation Guide](https://opendatalab.github.io/MinerU/quick_start/extension_modules/).
- Build image using Dockerfile:
```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/global/Dockerfile
docker build -t mineru-sglang:latest -f Dockerfile .
```
Start Docker container:
```bash
docker run --gpus all \
--shm-size 32g \
-p 30000:30000 \
--ipc=host \
mineru-sglang:latest \
mineru-sglang-server --host 0.0.0.0 --port 30000
```
Or start using Docker Compose:
```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/compose.yaml
docker compose -f compose.yaml up -d
```
> [!TIP]
> The Dockerfile uses `lmsysorg/sglang:v0.4.8.post1-cu126` as the default base image, which supports the Turing/Ampere/Ada Lovelace/Hopper platforms.
> If you are using the newer Blackwell platform, please change the base image to `lmsysorg/sglang:v0.4.8.post1-cu128-b200`.
#### 1.4 Install client (for connecting to sglang-server on edge devices that require only CPU and network connectivity)
```bash
uv pip install -U mineru
mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://<host_ip>:<port>
```
--- ---
### 2. Using MinerU #### Deploy MinerU using Docker
MinerU provides a convenient Docker deployment method, which helps quickly set up the environment and solve some tricky environment compatibility issues.
#### 2.1 Command Line Usage You can get the [Docker Deployment Instructions](https://opendatalab.github.io/MinerU/quick_start/docker_deployment/) in the documentation.
##### Basic Usage
The simplest command line invocation is:
```bash
mineru -p <input_path> -o <output_path>
```
- `<input_path>`: Local PDF/Image file or directory (supports pdf/png/jpg/jpeg/webp/gif)
- `<output_path>`: Output directory
##### View Help Information
Get all available parameter descriptions:
```bash
mineru --help
```
##### Parameter Details
```text
Usage: mineru [OPTIONS]
Options:
-v, --version Show version and exit
-p, --path PATH Input file path or directory (required)
-o, --output PATH Output directory (required)
-m, --method [auto|txt|ocr] Parsing method: auto (default), txt, ocr (pipeline backend only)
-b, --backend [pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client]
Parsing backend (default: pipeline)
-l, --lang [ch|ch_server|ch_lite|en|korean|japan|chinese_cht|ta|te|ka|latin|arabic|east_slavic|cyrillic|devanagari]
Specify document language (improves OCR accuracy, pipeline backend only)
-u, --url TEXT Service address when using sglang-client
-s, --start INTEGER Starting page number (0-based)
-e, --end INTEGER Ending page number (0-based)
-f, --formula BOOLEAN Enable formula parsing (default: on)
-t, --table BOOLEAN Enable table parsing (default: on)
-d, --device TEXT Inference device (e.g., cpu/cuda/cuda:0/npu/mps, pipeline backend only)
--vram INTEGER Maximum GPU VRAM usage per process (GB)(pipeline backend only)
--source [huggingface|modelscope|local]
Model source, default: huggingface
--help Show help information
```
--- ---
#### 2.2 Model Source Configuration ### Using MinerU
MinerU automatically downloads required models from HuggingFace on first run. If HuggingFace is inaccessible, you can switch model sources:
##### Switch to ModelScope Source
```bash
mineru -p <input_path> -o <output_path> --source modelscope
```
Or set environment variable:
```bash
export MINERU_MODEL_SOURCE=modelscope
mineru -p <input_path> -o <output_path>
```
##### Using Local Models
###### 1. Download Models Locally
```bash
mineru-models-download --help
```
Or use interactive command-line tool to select models:
```bash
mineru-models-download
```
After download, model paths will be displayed in current terminal and automatically written to `mineru.json` in user directory.
###### 2. Parse Using Local Models
```bash
mineru -p <input_path> -o <output_path> --source local
```
Or enable via environment variable:
The simplest command line invocation is:
```bash ```bash
export MINERU_MODEL_SOURCE=local
mineru -p <input_path> -o <output_path> mineru -p <input_path> -o <output_path>
``` ```
--- You can use MinerU for PDF parsing through various methods such as command line, API, and WebUI. For detailed instructions, please refer to the [Usage Guide](https://opendatalab.github.io/MinerU/usage/).
#### 2.3 Using sglang to Accelerate VLM Model Inference
##### Through the sglang-engine Mode
```bash
mineru -p <input_path> -o <output_path> -b vlm-sglang-engine
```
##### Through the sglang-server/client Mode
1. Start Server:
```bash
mineru-sglang-server --port 30000
```
2. Use Client in another terminal:
```bash
mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1:30000
```
> [!TIP]
> For more information about output files, please refer to [Output File Documentation](docs/output_file_en_us.md)
---
### 3. API Calls or Visual Invocation
1. Directly invoke using Python API: [Python Invocation Example](demo/demo.py)
2. Invoke using FastAPI:
```bash
mineru-api --host 127.0.0.1 --port 8000
```
Visit http://127.0.0.1:8000/docs in your browser to view the API documentation.
3. Use Gradio WebUI or Gradio API:
```bash
# Using pipeline/vlm-transformers/vlm-sglang-client backend
mineru-gradio --server-name 127.0.0.1 --server-port 7860
# Or using vlm-sglang-engine/pipeline backend
mineru-gradio --server-name 127.0.0.1 --server-port 7860 --enable-sglang-engine true
```
Access http://127.0.0.1:7860 in your browser to use the Gradio WebUI, or visit http://127.0.0.1:7860/?view=api to use the Gradio API.
> [!TIP]
> Below are some suggestions and notes for using the sglang acceleration mode:
> - The sglang acceleration mode currently supports operation on Turing architecture GPUs with a minimum of 8GB VRAM, but you may encounter VRAM shortages on GPUs with less than 24GB VRAM. You can optimize VRAM usage with the following parameters:
> - If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by setting `--mem-fraction-static 0.5`. If VRAM issues persist, try lowering it further to `0.4` or below.
> - If you have more than one GPU, you can expand available VRAM using tensor parallelism (TP) mode: `--tp-size 2`
> - If you are already successfully using sglang to accelerate VLM inference but wish to further improve inference speed, consider the following parameters:
> - If using multiple GPUs, increase throughput using sglang's multi-GPU parallel mode: `--dp-size 2`
> - You can also enable `torch.compile` to accelerate inference speed by about 15%: `--enable-torch-compile`
> - For more information on using sglang parameters, please refer to the [sglang official documentation](https://docs.sglang.ai/backend/server_arguments.html#common-launch-commands)
> - All sglang-supported parameters can be passed to MinerU via command-line arguments, including those used with the following commands: `mineru`, `mineru-sglang-server`, `mineru-gradio`, `mineru-api`
> [!TIP]
> - In any case, you can specify visible GPU devices at the start of a command line by adding the `CUDA_VISIBLE_DEVICES` environment variable. For example:
> ```bash
> CUDA_VISIBLE_DEVICES=1 mineru -p <input_path> -o <output_path>
> ```
> - This method works for all command-line calls, including `mineru`, `mineru-sglang-server`, `mineru-gradio`, and `mineru-api`, and applies to both `pipeline` and `vlm` backends.
> - Below are some common `CUDA_VISIBLE_DEVICES` settings:
> ```bash
> CUDA_VISIBLE_DEVICES=1 Only device 1 will be seen
> CUDA_VISIBLE_DEVICES=0,1 Devices 0 and 1 will be visible
> CUDA_VISIBLE_DEVICES="0,1" Same as above, quotation marks are optional
> CUDA_VISIBLE_DEVICES=0,2,3 Devices 0, 2, 3 will be visible; device 1 is masked
> CUDA_VISIBLE_DEVICES="" No GPU will be visible
> ```
> - Below are some possible use cases:
> - If you have multiple GPUs and need to specify GPU 0 and GPU 1 to launch 'sglang-server' in multi-GPU mode, you can use the following command:
> ```bash
> CUDA_VISIBLE_DEVICES=0,1 mineru-sglang-server --port 30000 --dp-size 2
> ```
> - If you have multiple GPUs and need to launch two `fastapi` services on GPU 0 and GPU 1 respectively, listening on different ports, you can use the following commands:
> ```bash
> # In terminal 1
> CUDA_VISIBLE_DEVICES=0 mineru-api --host 127.0.0.1 --port 8000
> # In terminal 2
> CUDA_VISIBLE_DEVICES=1 mineru-api --host 127.0.0.1 --port 8001
> ```
---
### 4. Extending MinerU Functionality Through Configuration Files
- MinerU is designed to work out-of-the-box, but also supports extending functionality through configuration files. You can create a `mineru.json` file in your home directory and add custom configurations.
- The `mineru.json` file will be automatically generated when you use the built-in model download command `mineru-models-download`. Alternatively, you can create it by copying the [configuration template file](./mineru.template.json) to your home directory and renaming it to `mineru.json`.
- Below are some available configuration options:
- `latex-delimiter-config`: Used to configure LaTeX formula delimiters, defaults to the `$` symbol, and can be modified to other symbols or strings as needed.
- `llm-aided-config`: Used to configure related parameters for LLM-assisted heading level detection, compatible with all LLM models supporting the `OpenAI protocol`. It defaults to Alibaba Cloud Qwen's `qwen2.5-32b-instruct` model. You need to configure an API key yourself and set `enable` to `true` to activate this feature.
- `models-dir`: Used to specify local model storage directories. Please specify separate model directories for the `pipeline` and `vlm` backends. After specifying these directories, you can use local models by setting the environment variable `export MINERU_MODEL_SOURCE=local`.
---
# TODO # TODO
...@@ -790,6 +538,9 @@ mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1 ...@@ -790,6 +538,9 @@ mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1
- [x] Recognition of `index` and `list` in the main text - [x] Recognition of `index` and `list` in the main text
- [x] Table recognition - [x] Table recognition
- [x] Heading Classification - [x] Heading Classification
- [x] Handwritten Text Recognition
- [x] Vertical Text Recognition
- [x] Latin Accent Mark Recognition
- [ ] Code block recognition in the main text - [ ] Code block recognition in the main text
- [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf) - [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
- [ ] Geometric shape recognition - [ ] Geometric shape recognition
...@@ -807,7 +558,7 @@ mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1 ...@@ -807,7 +558,7 @@ mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1
# FAQ # FAQ
- If you encounter any issues during usage, you can first check the [FAQ](docs/FAQ_en_us.md) for solutions. - If you encounter any issues during usage, you can first check the [FAQ](https://opendatalab.github.io/MinerU/faq/) for solutions.
- If your issue remains unresolved, you may also use [DeepWiki](https://deepwiki.com/opendatalab/MinerU) to interact with an AI assistant, which can address most common problems. - If your issue remains unresolved, you may also use [DeepWiki](https://deepwiki.com/opendatalab/MinerU) to interact with an AI assistant, which can address most common problems.
- If you still cannot resolve the issue, you are welcome to join our community via [Discord](https://discord.gg/Tdedn9GTXq) or [WeChat](http://mineru.space/s/V85Yl) to discuss with other users and developers. - If you still cannot resolve the issue, you are welcome to join our community via [Discord](https://discord.gg/Tdedn9GTXq) or [WeChat](http://mineru.space/s/V85Yl) to discuss with other users and developers.
...@@ -877,7 +628,6 @@ Currently, some models in this project are trained based on YOLO. However, since ...@@ -877,7 +628,6 @@ Currently, some models in this project are trained based on YOLO. However, since
- [LabelU (A Lightweight Multi-modal Data Annotation Tool)](https://github.com/opendatalab/labelU) - [LabelU (A Lightweight Multi-modal Data Annotation Tool)](https://github.com/opendatalab/labelU)
- [LabelLLM (An Open-source LLM Dialogue Annotation Platform)](https://github.com/opendatalab/LabelLLM) - [LabelLLM (An Open-source LLM Dialogue Annotation Platform)](https://github.com/opendatalab/LabelLLM)
- [PDF-Extract-Kit (A Comprehensive Toolkit for High-Quality PDF Content Extraction)](https://github.com/opendatalab/PDF-Extract-Kit) - [PDF-Extract-Kit (A Comprehensive Toolkit for High-Quality PDF Content Extraction)](https://github.com/opendatalab/PDF-Extract-Kit)
- [OmniDocBench (A Comprehensive Benchmark for Document Parsing and Evaluation)](https://github.com/opendatalab/OmniDocBench) - [OmniDocBench (A Comprehensive Benchmark for Document Parsing and Evaluation)](https://github.com/opendatalab/OmniDocBench)
- [Magic-HTML (Mixed web page extraction tool)](https://github.com/opendatalab/magic-html) - [Magic-HTML (Mixed web page extraction tool)](https://github.com/opendatalab/magic-html)
- [Magic-Doc (Fast speed ppt/pptx/doc/docx/pdf extraction tool)](https://github.com/InternLM/magic-doc) - [Magic-Doc (Fast speed ppt/pptx/doc/docx/pdf extraction tool)](https://github.com/InternLM/magic-doc)
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mineru)](https://pypi.org/project/mineru/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mineru)](https://pypi.org/project/mineru/)
[![Downloads](https://static.pepy.tech/badge/mineru)](https://pepy.tech/project/mineru) [![Downloads](https://static.pepy.tech/badge/mineru)](https://pepy.tech/project/mineru)
[![Downloads](https://static.pepy.tech/badge/mineru/month)](https://pepy.tech/project/mineru) [![Downloads](https://static.pepy.tech/badge/mineru/month)](https://pepy.tech/project/mineru)
[![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github) [![OpenDataLab](https://img.shields.io/badge/webapp_on_mineru.net-blue?logo=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github)
[![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU) [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
[![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU) [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb)
...@@ -31,9 +31,6 @@ ...@@ -31,9 +31,6 @@
<!-- hot link --> <!-- hot link -->
<p align="center"> <p align="center">
<a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: 高质量PDF解析工具箱</a>🔥🔥🔥
<br>
<br>
🚀<a href="https://mineru.net/?source=github">MinerU 官网入口→✅ 免装在线版 ✅ 全功能客户端 ✅ 开发者API在线调用,省去部署麻烦,多种产品形态一键get,速冲!</a> 🚀<a href="https://mineru.net/?source=github">MinerU 官网入口→✅ 免装在线版 ✅ 全功能客户端 ✅ 开发者API在线调用,省去部署麻烦,多种产品形态一键get,速冲!</a>
</p> </p>
...@@ -46,6 +43,14 @@ ...@@ -46,6 +43,14 @@
</div> </div>
# 更新记录 # 更新记录
- 2025/07/16 2.1.1发布
- bug修复
- 修复`pipeline`在某些情况可能发生的文本块内容丢失问题 #3005
- 修复`sglang-client`需要安装`torch`等不必要的包的问题 #2968
- 更新`dockerfile`以修复linux字体缺失导致的解析文本内容不完整问题 #2915
- 易用性更新
- 更新`compose.yaml`,便于用户直接启动`sglang-server`、`mineru-api`、`mineru-gradio`服务
- 启用全新的[在线文档站点](https://opendatalab.github.io/MinerU/zh/),简化readme,提供更好的文档体验
- 2025/07/05 2.1.0发布 - 2025/07/05 2.1.0发布
- 这是 MinerU 2 的第一个大版本更新,包含了大量新功能和改进,包含众多性能优化、体验优化和bug修复,具体更新内容如下: - 这是 MinerU 2 的第一个大版本更新,包含了大量新功能和改进,包含众多性能优化、体验优化和bug修复,具体更新内容如下:
- 性能优化: - 性能优化:
...@@ -386,37 +391,6 @@ ...@@ -386,37 +391,6 @@
</details> </details>
</details> </details>
<!-- TABLE OF CONTENT -->
<details open="open">
<summary><h2 style="display: inline-block">文档目录</h2></summary>
<ol>
<li>
<a href="#mineru">MinerU</a>
<ul>
<li><a href="#项目简介">项目简介</a></li>
<li><a href="#主要功能">主要功能</a></li>
<li><a href="#快速开始">快速开始</a>
<ul>
<li><a href="#在线体验">在线体验</a></li>
<li><a href="#本地部署">本地部署</a></li>
</ul>
</ul>
</li>
<li><a href="#todo">TODO</a></li>
<li><a href="#known-issues">Known Issues</a></li>
<li><a href="#faq">FAQ</a></li>
<li><a href="#all-thanks-to-our-contributors">Contributors</a></li>
<li><a href="#license-information">License Information</a></li>
<li><a href="#acknowledgments">Acknowledgements</a></li>
<li><a href="#citation">Citation</a></li>
<li><a href="#star-history">Star History</a></li>
<li><a href="#links">Links</a></li>
</ol>
</details>
# MinerU # MinerU
## 项目简介 ## 项目简介
...@@ -442,15 +416,25 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c ...@@ -442,15 +416,25 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
- 支持纯CPU环境运行,并支持 GPU(CUDA)/NPU(CANN)/MPS 加速 - 支持纯CPU环境运行,并支持 GPU(CUDA)/NPU(CANN)/MPS 加速
- 兼容Windows、Linux和Mac平台 - 兼容Windows、Linux和Mac平台
## 快速开始 # 快速开始
如果遇到任何安装问题,请先查询 <a href="#faq">FAQ</a> </br> 如果安装或使用中遇到任何问题,请先查询 <a href="#faq">FAQ</a> </br>
如果遇到解析效果不及预期,参考 <a href="#known-issues">Known Issues</a></br> 如果遇到解析效果不及预期,参考 <a href="#known-issues">Known Issues</a></br>
有2种不同方式可以体验MinerU的效果:
- [在线体验](#在线体验) ## 在线体验
- [本地部署](#本地部署)
### 官网在线应用
官网在线版功能与客户端一致,界面美观,功能丰富,需要登录使用
- [![OpenDataLab](https://img.shields.io/badge/webapp_on_mineru.net-blue?logo=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github)
### 基于Gradio的在线demo
基于gradio开发的webui,界面简洁,仅包含核心解析功能,免登录
- [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
- [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
## 本地部署
> [!WARNING] > [!WARNING]
> **安装前必看——软硬件环境支持说明** > **安装前必看——软硬件环境支持说明**
...@@ -470,9 +454,9 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c ...@@ -470,9 +454,9 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
</tr> </tr>
<tr> <tr>
<td>操作系统</td> <td>操作系统</td>
<td>windows/linux/mac</td> <td>Linux / Windows / macOS</td>
<td>windows/linux</td> <td>Linux / Windows</td>
<td>windows(wsl2)/linux</td> <td>Linux / Windows (via WSL2)</td>
</tr> </tr>
<tr> <tr>
<td>CPU推理支持</td> <td>CPU推理支持</td>
...@@ -498,280 +482,42 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c ...@@ -498,280 +482,42 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
</tr> </tr>
</table> </table>
## 在线体验 ### 安装 MinerU
[![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github)
[![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
[![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
## 本地部署
### 1. 安装 MinerU
#### 1.1 使用 pip 或 uv 安装
#### 使用pip或uv安装MinerU
```bash ```bash
pip install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple pip install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple
pip install uv -i https://mirrors.aliyun.com/pypi/simple pip install uv -i https://mirrors.aliyun.com/pypi/simple
uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple
``` ```
#### 1.2 源码安装 #### 通过源码安装MinerU
```bash ```bash
git clone https://github.com/opendatalab/MinerU.git git clone https://github.com/opendatalab/MinerU.git
cd MinerU cd MinerU
uv pip install -e .[core] -i https://mirrors.aliyun.com/pypi/simple uv pip install -e .[core] -i https://mirrors.aliyun.com/pypi/simple
``` ```
> [!NOTE]
> Linux和macOS系统安装后自动支持cuda/mps加速,Windows用户如需使用cuda加速,
> 请前往 [Pytorch官网](https://pytorch.org/get-started/locally/) 选择合适的cuda版本安装pytorch。
#### 1.3 安装完整版(支持 sglang 加速)(需确保设备有Turing及以后架构,8G显存及以上显卡)
如需使用 **sglang 加速 VLM 模型推理**,请选择合适的方式安装完整版本:
- 使用uv或pip安装
```bash
uv pip install -U "mineru[all]" -i https://mirrors.aliyun.com/pypi/simple
```
- 从源码安装:
```bash
uv pip install -e .[all] -i https://mirrors.aliyun.com/pypi/simple
```
> [!TIP]
> sglang安装过程中如发生异常,请参考[sglang官方文档](https://docs.sglang.ai/start/install.html)尝试解决或直接使用docker方式安装。
- 使用 Dockerfile 构建镜像:
```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/Dockerfile
docker build -t mineru-sglang:latest -f Dockerfile .
```
启动 Docker 容器:
```bash
docker run --gpus all \
--shm-size 32g \
-p 30000:30000 \
--ipc=host \
mineru-sglang:latest \
mineru-sglang-server --host 0.0.0.0 --port 30000
```
或使用 Docker Compose 启动:
```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/compose.yaml
docker compose -f compose.yaml up -d
```
> [!TIP] > [!TIP]
> Dockerfile默认使用`lmsysorg/sglang:v0.4.8.post1-cu126`作为基础镜像,支持Turing/Ampere/Ada Lovelace/Hopper平台, > `mineru[core]`包含除`sglang`加速外的所有核心功能,兼容Windows / Linux / macOS系统,适合绝大多数用户。
> 如您使用较新的`Blackwell`平台,请将基础镜像修改为`lmsysorg/sglang:v0.4.8.post1-cu128-b200`。 > 如果您有使用`sglang`加速VLM模型推理,或是在边缘设备安装轻量版client端等需求,可以参考文档[扩展模块安装指南](https://opendatalab.github.io/MinerU/zh/quick_start/extension_modules/)。
#### 1.4 安装client(用于在仅需 CPU 和网络连接的边缘设备上连接 sglang-server)
```bash
uv pip install -U mineru -i https://mirrors.aliyun.com/pypi/simple
mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://<host_ip>:<port>
```
--- ---
### 2. 使用 MinerU #### 使用docker部署MinerU
MinerU提供了便捷的docker部署方式,这有助于快速搭建环境并解决一些棘手的环境兼容问题。
#### 2.1 命令行使用方式 您可以在文档中获取[Docker部署说明](https://opendatalab.github.io/MinerU/zh/quick_start/docker_deployment/)
##### 基础用法
最简单的命令行调用方式如下:
```bash
mineru -p <input_path> -o <output_path>
```
- `<input_path>`:本地 PDF/图片 文件或目录(支持 pdf/png/jpg/jpeg/webp/gif)
- `<output_path>`:输出目录
##### 查看帮助信息
获取所有可用参数说明:
```bash
mineru --help
```
##### 参数详解
```text
Usage: mineru [OPTIONS]
Options:
-v, --version 显示版本并退出
-p, --path PATH 输入文件路径或目录(必填)
-o, --output PATH 输出目录(必填)
-m, --method [auto|txt|ocr] 解析方法:auto(默认)、txt、ocr(仅用于 pipeline 后端)
-b, --backend [pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client]
解析后端(默认为 pipeline)
-l, --lang [ch|ch_server|ch_lite|en|korean|japan|chinese_cht|ta|te|ka|latin|arabic|east_slavic|cyrillic|devanagari]
指定文档语言(可提升 OCR 准确率,仅用于 pipeline 后端)
-u, --url TEXT 当使用 sglang-client 时,需指定服务地址
-s, --start INTEGER 开始解析的页码(从 0 开始)
-e, --end INTEGER 结束解析的页码(从 0 开始)
-f, --formula BOOLEAN 是否启用公式解析(默认开启)
-t, --table BOOLEAN 是否启用表格解析(默认开启)
-d, --device TEXT 推理设备(如 cpu/cuda/cuda:0/npu/mps,仅 pipeline 后端)
--vram INTEGER 单进程最大 GPU 显存占用(GB)(仅 pipeline 后端)
--source [huggingface|modelscope|local]
模型来源,默认 huggingface
--help 显示帮助信息
```
--- ---
#### 2.2 模型源配置 ### 使用 MinerU
MinerU 默认在首次运行时自动从 HuggingFace 下载所需模型。若无法访问 HuggingFace,可通过以下方式切换模型源:
##### 切换至 ModelScope 源
```bash
mineru -p <input_path> -o <output_path> --source modelscope
```
或设置环境变量:
最简单的命令行调用方式:
```bash ```bash
export MINERU_MODEL_SOURCE=modelscope
mineru -p <input_path> -o <output_path> mineru -p <input_path> -o <output_path>
``` ```
##### 使用本地模型 您可以通过命令行、API、WebUI等多种方式使用MinerU进行PDF解析,具体使用方法请参考[使用指南](https://opendatalab.github.io/MinerU/zh/usage/)
###### 1. 下载模型到本地
```bash
mineru-models-download --help
```
或使用交互式命令行工具选择模型下载:
```bash
mineru-models-download
```
下载完成后,模型路径会在当前终端窗口输出,并自动写入用户目录下的 `mineru.json`
###### 2. 使用本地模型进行解析
```bash
mineru -p <input_path> -o <output_path> --source local
```
或通过环境变量启用:
```bash
export MINERU_MODEL_SOURCE=local
mineru -p <input_path> -o <output_path>
```
---
#### 2.3 使用 sglang 加速 VLM 模型推理
##### 通过 sglang-engine 模式
```bash
mineru -p <input_path> -o <output_path> -b vlm-sglang-engine
```
##### 通过 sglang-server/client 模式
1. 启动 Server:
```bash
mineru-sglang-server --port 30000
```
2. 在另一个终端中使用 Client 调用:
```bash
mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1:30000
```
> [!TIP]
> 更多关于输出文件的信息,请参考 [输出文件说明](docs/output_file_zh_cn.md)
---
### 3. API 调用 或 可视化调用
1. 使用python api直接调用:[Python 调用示例](demo/demo.py)
2. 使用fast api方式调用:
```bash
mineru-api --host 127.0.0.1 --port 8000
```
在浏览器中访问 http://127.0.0.1:8000/docs 查看API文档。
3. 使用gradio webui 或 gradio api调用
```bash
# 使用 pipeline/vlm-transformers/vlm-sglang-client 后端
mineru-gradio --server-name 127.0.0.1 --server-port 7860
# 或使用 vlm-sglang-engine/pipeline 后端
mineru-gradio --server-name 127.0.0.1 --server-port 7860 --enable-sglang-engine true
```
在浏览器中访问 http://127.0.0.1:7860 使用 Gradio WebUI 或访问 http://127.0.0.1:7860/?view=api 使用 Gradio API。
> [!TIP]
> 以下是一些使用sglang加速模式的建议和注意事项:
> - sglang加速模式目前支持在最低8G显存的Turing架构显卡上运行,但在显存<24G的显卡上可能会遇到显存不足的问题, 可以通过使用以下参数来优化显存使用:
> - 如果您使用单张显卡遇到显存不足的情况时,可能需要调低KV缓存大小,`--mem-fraction-static 0.5`,如仍出现显存不足问题,可尝试进一步降低到`0.4`或更低。
> - 如您有两张以上显卡,可尝试通过张量并行(TP)模式简单扩充可用显存:`--tp-size 2`
> - 如果您已经可以正常使用sglang对vlm模型进行加速推理,但仍然希望进一步提升推理速度,可以尝试以下参数:
> - 如果您有多张显卡,可以使用sglang的多卡并行模式来增加吞吐量:`--dp-size 2`
> - 同时您可以启用`torch.compile`来将推理速度加速约15%:`--enable-torch-compile`
> - 如果您想了解更多有关`sglang`的参数使用方法,请参考 [sglang官方文档](https://docs.sglang.ai/backend/server_arguments.html#common-launch-commands)
> - 所有sglang官方支持的参数都可以通过命令行参数传递给 MinerU,包括以下命令:`mineru`、`mineru-sglang-server`、`mineru-gradio`、`mineru-api`
> [!TIP]
> - 任何情况下,您都可以通过在命令行的开头添加`CUDA_VISIBLE_DEVICES` 环境变量来指定可见的 GPU 设备。例如:
> ```bash
> CUDA_VISIBLE_DEVICES=1 mineru -p <input_path> -o <output_path>
> ```
> - 这种指定方式对所有的命令行调用都有效,包括 `mineru`、`mineru-sglang-server`、`mineru-gradio` 和 `mineru-api`,且对`pipeline`、`vlm`后端均适用。
> - 以下是一些常见的 `CUDA_VISIBLE_DEVICES` 设置示例:
> ```bash
> CUDA_VISIBLE_DEVICES=1 Only device 1 will be seen
> CUDA_VISIBLE_DEVICES=0,1 Devices 0 and 1 will be visible
> CUDA_VISIBLE_DEVICES="0,1" Same as above, quotation marks are optional
> CUDA_VISIBLE_DEVICES=0,2,3 Devices 0, 2, 3 will be visible; device 1 is masked
> CUDA_VISIBLE_DEVICES="" No GPU will be visible
> ```
> - 以下是一些可能的使用场景:
> - 如果您有多张显卡,需要指定卡0和卡1,并使用多卡并行来启动'sglang-server',可以使用以下命令:
> ```bash
> CUDA_VISIBLE_DEVICES=0,1 mineru-sglang-server --port 30000 --dp-size 2
> ```
> - 如果您有多张显卡,需要在卡0和卡1上启动两个`fastapi`服务,并分别监听不同的端口,可以使用以下命令:
> ```bash
> # 在终端1中
> CUDA_VISIBLE_DEVICES=0 mineru-api --host 127.0.0.1 --port 8000
> # 在终端2中
> CUDA_VISIBLE_DEVICES=1 mineru-api --host 127.0.0.1 --port 8001
> ```
---
### 4. 基于配置文件扩展 MinerU 功能
- MinerU 现已实现开箱即用,但也支持通过配置文件扩展功能。您可以在用户目录下创建 `mineru.json` 文件,添加自定义配置。
- `mineru.json` 文件会在您使用内置模型下载命令 `mineru-models-download` 时自动生成,也可以通过将[配置模板文件](./mineru.template.json)复制到用户目录下并重命名为 `mineru.json` 来创建。
- 以下是一些可用的配置选项:
- `latex-delimiter-config`:用于配置 LaTeX 公式的分隔符,默认为`$`符号,可根据需要修改为其他符号或字符串。
- `llm-aided-config`:用于配置 LLM 辅助标题分级的相关参数,兼容所有支持`openai协议`的 LLM 模型,默认使用`阿里云百炼`的`qwen2.5-32b-instruct`模型,您需要自行配置 API 密钥并将`enable`设置为`true`来启用此功能。
- `models-dir`:用于指定本地模型存储目录,请为`pipeline`和`vlm`后端分别指定模型目录,指定目录后您可通过配置环境变量`export MINERU_MODEL_SOURCE=local`来使用本地模型。
---
# TODO # TODO
...@@ -779,9 +525,12 @@ mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1 ...@@ -779,9 +525,12 @@ mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1
- [x] 正文中目录、列表识别 - [x] 正文中目录、列表识别
- [x] 表格识别 - [x] 表格识别
- [x] 标题分级 - [x] 标题分级
- [x] 手写文本识别
- [x] 竖排文本识别
- [x] 拉丁字母重音符号识别
- [ ] 正文中代码块识别 - [ ] 正文中代码块识别
- [ ] [化学式识别](docs/chemical_knowledge_introduction/introduction.pdf) - [ ] [化学式识别](docs/chemical_knowledge_introduction/introduction.pdf)
- [ ] 几何图形识别 - [ ] 图表内容识别
# Known Issues # Known Issues
...@@ -791,12 +540,12 @@ mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1 ...@@ -791,12 +540,12 @@ mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1
- 代码块在layout模型里还没有支持 - 代码块在layout模型里还没有支持
- 漫画书、艺术图册、小学教材、习题尚不能很好解析 - 漫画书、艺术图册、小学教材、习题尚不能很好解析
- 表格识别在复杂表格上可能会出现行/列识别错误 - 表格识别在复杂表格上可能会出现行/列识别错误
- 在小语种PDF上,OCR识别可能会出现字符不准确的情况(如拉丁文的重音符号、阿拉伯文易混淆字符等) - 在小语种PDF上,OCR识别可能会出现字符不准确的情况(如阿拉伯文易混淆字符等)
- 部分公式可能会无法在markdown中渲染 - 部分公式可能会无法在markdown中渲染
# FAQ # FAQ
- 如果您在使用过程中遇到问题,可以先查看[常见问题](docs/FAQ_zh_cn.md)是否有解答。 - 如果您在使用过程中遇到问题,可以先查看[常见问题](https://opendatalab.github.io/MinerU/zh/faq/)是否有解答。
- 如果未能解决您的问题,您也可以使用[DeepWiki](https://deepwiki.com/opendatalab/MinerU)与AI助手交流,这可以解决大部分常见问题。 - 如果未能解决您的问题,您也可以使用[DeepWiki](https://deepwiki.com/opendatalab/MinerU)与AI助手交流,这可以解决大部分常见问题。
- 如果您仍然无法解决问题,您可通过[Discord](https://discord.gg/Tdedn9GTXq)或[WeChat](http://mineru.space/s/V85Yl)加入社区,与其他用户和开发者交流。 - 如果您仍然无法解决问题,您可通过[Discord](https://discord.gg/Tdedn9GTXq)或[WeChat](http://mineru.space/s/V85Yl)加入社区,与其他用户和开发者交流。
...@@ -861,11 +610,11 @@ mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1 ...@@ -861,11 +610,11 @@ mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1
# Links # Links
- [Easy Data Preparation with latest LLMs-based Operators and Pipelines](https://github.com/OpenDCAI/DataFlow)
- [Vis3 (OSS browser based on s3)](https://github.com/opendatalab/Vis3)
- [LabelU (A Lightweight Multi-modal Data Annotation Tool)](https://github.com/opendatalab/labelU) - [LabelU (A Lightweight Multi-modal Data Annotation Tool)](https://github.com/opendatalab/labelU)
- [LabelLLM (An Open-source LLM Dialogue Annotation Platform)](https://github.com/opendatalab/LabelLLM) - [LabelLLM (An Open-source LLM Dialogue Annotation Platform)](https://github.com/opendatalab/LabelLLM)
- [PDF-Extract-Kit (A Comprehensive Toolkit for High-Quality PDF Content Extraction)](https://github.com/opendatalab/PDF-Extract-Kit) - [PDF-Extract-Kit (A Comprehensive Toolkit for High-Quality PDF Content Extraction)](https://github.com/opendatalab/PDF-Extract-Kit)
- [Vis3 (OSS browser based on s3)](https://github.com/opendatalab/Vis3)
- [OmniDocBench (A Comprehensive Benchmark for Document Parsing and Evaluation)](https://github.com/opendatalab/OmniDocBench) - [OmniDocBench (A Comprehensive Benchmark for Document Parsing and Evaluation)](https://github.com/opendatalab/OmniDocBench)
- [Magic-HTML (Mixed web page extraction tool)](https://github.com/opendatalab/magic-html) - [Magic-HTML (Mixed web page extraction tool)](https://github.com/opendatalab/magic-html)
- [Magic-Doc (Fast speed ppt/pptx/doc/docx/pdf extraction tool)](https://github.com/InternLM/magic-doc) - [Magic-Doc (Fast speed ppt/pptx/doc/docx/pdf extraction tool)](https://github.com/InternLM/magic-doc)
\ No newline at end of file
# Use the official sglang image # Use the official sglang image
FROM lmsysorg/sglang:v0.4.8.post1-cu126 FROM lmsysorg/sglang:v0.4.8.post1-cu126
# Install libgl for opencv support # Install libgl for opencv support & Noto fonts for Chinese characters
RUN apt-get update && apt-get install -y libgl1 && apt-get clean && rm -rf /var/lib/apt/lists/* RUN apt-get update && \
apt-get install -y fonts-noto-core fonts-noto-cjk && \
apt-get install -y libgl1 && \
apt-get clean && \
fc-cache -fv && \
rm -rf /var/lib/apt/lists/*
# Install mineru latest # Install mineru latest
RUN python3 -m pip install -U 'mineru[core]' -i https://mirrors.aliyun.com/pypi/simple --break-system-packages RUN python3 -m pip install -U 'mineru[core]' -i https://mirrors.aliyun.com/pypi/simple --break-system-packages
......
# Documentation:
# https://docs.sglang.ai/backend/server_arguments.html#common-launch-commands
services: services:
mineru-sglang: mineru-sglang-server:
image: mineru-sglang:latest image: mineru-sglang:latest
container_name: mineru-sglang container_name: mineru-sglang-server
restart: always restart: always
profiles: ["sglang-server"]
ports: ports:
- 30000:30000 - 30000:30000
environment: environment:
...@@ -30,3 +29,66 @@ services: ...@@ -30,3 +29,66 @@ services:
- driver: nvidia - driver: nvidia
device_ids: ["0"] device_ids: ["0"]
capabilities: [gpu] capabilities: [gpu]
mineru-api:
image: mineru-sglang:latest
container_name: mineru-api
restart: always
profiles: ["api"]
ports:
- 8000:8000
environment:
MINERU_MODEL_SOURCE: local
entrypoint: mineru-api
command:
--host 0.0.0.0
--port 8000
# parameters for sglang-engine
# --enable-torch-compile # You can also enable torch.compile to accelerate inference speed by approximately 15%
# --dp-size 2 # If using multiple GPUs, increase throughput using sglang's multi-GPU parallel mode
# --tp-size 2 # If you have more than one GPU, you can expand available VRAM using tensor parallelism (TP) mode.
# --mem-fraction-static 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
ulimits:
memlock: -1
stack: 67108864
ipc: host
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: [ "0" ]
capabilities: [ gpu ]
mineru-gradio:
image: mineru-sglang:latest
container_name: mineru-gradio
restart: always
profiles: ["gradio"]
ports:
- 7860:7860
environment:
MINERU_MODEL_SOURCE: local
entrypoint: mineru-gradio
command:
--server-name 0.0.0.0
--server-port 7860
--enable-sglang-engine true # Enable the sglang engine for Gradio
# --enable-api false # If you want to disable the API, set this to false
# --max-convert-pages 20 # If you want to limit the number of pages for conversion, set this to a specific number
# parameters for sglang-engine
# --enable-torch-compile # You can also enable torch.compile to accelerate inference speed by approximately 15%
# --dp-size 2 # If using multiple GPUs, increase throughput using sglang's multi-GPU parallel mode
# --tp-size 2 # If you have more than one GPU, you can expand available VRAM using tensor parallelism (TP) mode.
# --mem-fraction-static 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
ulimits:
memlock: -1
stack: 67108864
ipc: host
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: [ "0" ]
capabilities: [ gpu ]
...@@ -2,7 +2,12 @@ ...@@ -2,7 +2,12 @@
FROM lmsysorg/sglang:v0.4.8.post1-cu126 FROM lmsysorg/sglang:v0.4.8.post1-cu126
# Install libgl for opencv support # Install libgl for opencv support
RUN apt-get update && apt-get install -y libgl1 && apt-get clean && rm -rf /var/lib/apt/lists/* RUN apt-get update && \
apt-get install -y fonts-noto-core fonts-noto-cjk && \
apt-get install -y libgl1 && \
apt-get clean && \
fc-cache -fv && \
rm -rf /var/lib/apt/lists/*
# Install mineru latest # Install mineru latest
RUN python3 -m pip install -U 'mineru[core]' --break-system-packages RUN python3 -m pip install -U 'mineru[core]' --break-system-packages
......
# Frequently Asked Questions
### 1. Encountered the error `ImportError: libGL.so.1: cannot open shared object file: No such file or directory` in Ubuntu 22.04 on WSL2
The `libgl` library is missing in Ubuntu 22.04 on WSL2. You can install the `libgl` library with the following command to resolve the issue:
```bash
sudo apt-get install libgl1-mesa-glx
```
Reference: https://github.com/opendatalab/MinerU/issues/388
### 2. Error when installing MinerU on CentOS 7 or Ubuntu 18: `ERROR: Failed building wheel for simsimd`
The new version of albumentations (1.4.21) introduces a dependency on simsimd. Since the pre-built package of simsimd for Linux requires a glibc version greater than or equal to 2.28, this causes installation issues on some Linux distributions released before 2019. You can resolve this issue by using the following command:
```
conda create -n mineru python=3.11 -y
conda activate mineru
pip install -U "mineru[pipeline_old_linux]"
```
Reference: https://github.com/opendatalab/MinerU/issues/1004
# 常见问题解答
### 1.在WSL2的Ubuntu22.04中遇到报错`ImportError: libGL.so.1: cannot open shared object file: No such file or directory`
WSL2的Ubuntu22.04中缺少`libgl`库,可通过以下命令安装`libgl`库解决:
```bash
sudo apt-get install libgl1-mesa-glx
```
参考:https://github.com/opendatalab/MinerU/issues/388
### 2.在 CentOS 7 或 Ubuntu 18 系统安装MinerU时报错`ERROR: Failed building wheel for simsimd`
新版本albumentations(1.4.21)引入了依赖simsimd,由于simsimd在linux的预编译包要求glibc的版本大于等于2.28,导致部分2019年之前发布的Linux发行版无法正常安装,可通过如下命令安装:
```
conda create -n mineru python=3.11 -y
conda activate mineru
pip install -U "mineru[pipeline_old_linux]"
```
参考:https://github.com/opendatalab/MinerU/issues/1004
<script type="module" src="https://gradio.s3-us-west-2.amazonaws.com/5.35.0/gradio.js"></script>
<gradio-app src="https://opendatalab-mineru.hf.space"></gradio-app>
# Frequently Asked Questions
If your question is not listed, try using [DeepWiki](https://deepwiki.com/opendatalab/MinerU)'s AI assistant for common issues.
For unresolved problems, join our [Discord](https://discord.gg/Tdedn9GTXq) or [WeChat](http://mineru.space/s/V85Yl) community for support.
??? question "Encountered the error `ImportError: libGL.so.1: cannot open shared object file: No such file or directory` in Ubuntu 22.04 on WSL2"
The `libgl` library is missing in Ubuntu 22.04 on WSL2. You can install the `libgl` library with the following command to resolve the issue:
```bash
sudo apt-get install libgl1-mesa-glx
```
Reference: [#388](https://github.com/opendatalab/MinerU/issues/388)
??? question "Error when installing MinerU on CentOS 7 or Ubuntu 18: `ERROR: Failed building wheel for simsimd`"
The new version of albumentations (1.4.21) introduces a dependency on simsimd. Since the pre-built package of simsimd for Linux requires a glibc version greater than or equal to 2.28, this causes installation issues on some Linux distributions released before 2019. You can resolve this issue by using the following command:
```
conda create -n mineru python=3.11 -y
conda activate mineru
pip install -U "mineru[pipeline_old_linux]"
```
Reference: [#1004](https://github.com/opendatalab/MinerU/issues/1004)
??? question "Missing text information in parsing results when installing and using on Linux systems."
MinerU uses `pypdfium2` instead of `pymupdf` as the PDF page rendering engine in versions >=2.0 to resolve AGPLv3 license issues. On some Linux distributions, due to missing CJK fonts, some text may be lost during the process of rendering PDFs to images.
To solve this problem, you can install the noto font package with the following commands, which are effective on Ubuntu/Debian systems:
```bash
sudo apt update
sudo apt install fonts-noto-core
sudo apt install fonts-noto-cjk
fc-cache -fv
```
You can also directly use our [Docker deployment](../quick_start/docker_deployment.md) method to build the image, which includes the above font packages by default.
Reference: [#2915](https://github.com/opendatalab/MinerU/issues/2915)
<div align="center" xmlns="http://www.w3.org/1999/html">
<!-- logo -->
<p align="center">
<img src="https://opendatalab.github.io/MinerU/images/MinerU-logo.png" width="300px" style="vertical-align:middle;">
</p>
</div>
<!-- icon -->
[![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
[![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
[![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
[![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
[![PyPI version](https://img.shields.io/pypi/v/mineru)](https://pypi.org/project/mineru/)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mineru)](https://pypi.org/project/mineru/)
[![Downloads](https://static.pepy.tech/badge/mineru)](https://pepy.tech/project/mineru)
[![Downloads](https://static.pepy.tech/badge/mineru/month)](https://pepy.tech/project/mineru)
[![OpenDataLab](https://img.shields.io/badge/webapp_on_mineru.net-blue?logo=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github)
[![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
[![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb)
[![arXiv](https://img.shields.io/badge/arXiv-2409.18839-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2409.18839)
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/opendatalab/MinerU)
<div align="center">
<a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
<!-- hot link -->
<p align="center">
🚀<a href="https://mineru.net/?source=github">MinerU Official Website→✅ Zero-Install Online Version ✅ Full-Featured Client ✅ Developer API Online Access, skip deployment hassles, get all product formats with one click, go fast!</a>
</p>
<!-- join us -->
<p align="center">
👋 join us on <a href="https://discord.gg/Tdedn9GTXq" target="_blank">Discord</a> and <a href="http://mineru.space/s/V85Yl" target="_blank">WeChat</a>
</p>
</div>
## Project Introduction
MinerU is a tool that converts PDFs into machine-readable formats (e.g., markdown, JSON), allowing for easy extraction into any format.
MinerU was born during the pre-training process of [InternLM](https://github.com/InternLM/InternLM). We focus on solving symbol conversion issues in scientific literature and hope to contribute to technological development in the era of large models.
Compared to well-known commercial products domestically and internationally, MinerU is still young. If you encounter any issues or if the results are not as expected, please submit an issue on [GitHub Issues](https://github.com/opendatalab/MinerU/issues) and **attach the relevant PDF**.
![type:video](https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c)
## Key Features
- Remove headers, footers, footnotes, page numbers and other elements to ensure semantic coherence
- Output text in human reading order, suitable for single-column, multi-column and complex layouts
- Retain the original document structure, including titles, paragraphs, lists, etc.
- Extract images, image descriptions, tables, table titles and footnotes
- Automatically identify and convert formulas in documents to LaTeX format
- Automatically identify and convert tables in documents to HTML format
- Automatically detect scanned PDFs and garbled PDFs, and enable OCR functionality
- OCR supports detection and recognition of 84 languages
- Support multiple output formats, such as multimodal and NLP Markdown, reading-order-sorted JSON, and information-rich intermediate formats
- Support multiple visualization results, including layout visualization, span visualization, etc., for efficient confirmation of output effects and quality inspection
- Support pure CPU environment operation, and support GPU(CUDA)/NPU(CANN)/MPS acceleration
- Compatible with Windows, Linux and Mac platforms
## User Guide
- [Quick Start Guide](./quick_start/index.md)
- [Detailed Usage Instructions](./usage/index.md)
# Deploying MinerU with Docker
MinerU provides a convenient Docker deployment method, which helps quickly set up the environment and solve some tricky environment compatibility issues.
## Build Docker Image using Dockerfile
```bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/global/Dockerfile
docker build -t mineru-sglang:latest -f Dockerfile .
```
> [!TIP]
> The [Dockerfile](https://github.com/opendatalab/MinerU/blob/master/docker/global/Dockerfile) uses `lmsysorg/sglang:v0.4.8.post1-cu126` as the base image by default, supporting Turing/Ampere/Ada Lovelace/Hopper platforms.
> If you are using the newer `Blackwell` platform, please modify the base image to `lmsysorg/sglang:v0.4.8.post1-cu128-b200` before executing the build operation.
## Docker Description
MinerU's Docker uses `lmsysorg/sglang` as the base image, so it includes the `sglang` inference acceleration framework and necessary dependencies by default. Therefore, on compatible devices, you can directly use `sglang` to accelerate VLM model inference.
> [!NOTE]
> Requirements for using `sglang` to accelerate VLM model inference:
>
> - Device must have Turing architecture or later graphics cards with 8GB+ available VRAM.
> - The host machine's graphics driver should support CUDA 12.6 or higher; `Blackwell` platform should support CUDA 12.8 or higher. You can check the driver version using the `nvidia-smi` command.
> - Docker container must have access to the host machine's graphics devices.
>
> If your device doesn't meet the above requirements, you can still use other features of MinerU, but cannot use `sglang` to accelerate VLM model inference, meaning you cannot use the `vlm-sglang-engine` backend or start the `sglang-server` service.
## Start Docker Container
```bash
docker run --gpus all \
--shm-size 32g \
-p 30000:30000 -p 7860:7860 -p 8000:8000 \
--ipc=host \
-it mineru-sglang:latest \
/bin/bash
```
After executing this command, you will enter the Docker container's interactive terminal with some ports mapped for potential services. You can directly run MinerU-related commands within the container to use MinerU's features.
You can also directly start MinerU services by replacing `/bin/bash` with service startup commands. For detailed instructions, please refer to the [Start the service via command](https://opendatalab.github.io/MinerU/usage/quick_usage/#advanced-usage-via-api-webui-sglang-clientserver).
## Start Services Directly with Docker Compose
We provide a [compose.yaml](https://github.com/opendatalab/MinerU/blob/master/docker/compose.yaml) file that you can use to quickly start MinerU services.
```bash
# Download compose.yaml file
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/compose.yaml
```
>[!NOTE]
>
>- The `compose.yaml` file contains configurations for multiple services of MinerU, you can choose to start specific services as needed.
>- Different services might have additional parameter configurations, which you can view and edit in the `compose.yaml` file.
>- Due to the pre-allocation of GPU memory by the `sglang` inference acceleration framework, you may not be able to run multiple `sglang` services simultaneously on the same machine. Therefore, ensure that other services that might use GPU memory have been stopped before starting the `sglang-server` service or using the `vlm-sglang-engine` backend.
---
### Start sglang-server service
Start the `sglang-server` service; clients then connect to it via the `vlm-sglang-client` backend.
```bash
docker compose -f compose.yaml --profile sglang-server up -d
```
>[!TIP]
>In another terminal, connect to sglang server via sglang client (only requires CPU and network, no sglang environment needed)
> ```bash
> mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://<server_ip>:30000
> ```
---
### Start Web API service
```bash
docker compose -f compose.yaml --profile api up -d
```
>[!TIP]
>Access `http://<server_ip>:8000/docs` in your browser to view the API documentation.
---
### Start Gradio WebUI service
```bash
docker compose -f compose.yaml --profile mineru-gradio up -d
```
>[!TIP]
>
>- Access `http://<server_ip>:7860` in your browser to use the Gradio WebUI.
>- Access `http://<server_ip>:7860/?view=api` to use the Gradio API.
# MinerU Extension Modules Installation Guide
MinerU supports installing extension modules on demand based on different needs to enhance functionality or support specific model backends.
## Common Scenarios
### Core Functionality Installation
The `core` module is the core dependency of MinerU, containing all functional modules except `sglang`. Installing this module ensures the basic functionality of MinerU works properly.
```bash
uv pip install mineru[core]
```
---
### Using `sglang` to Accelerate VLM Model Inference
The `sglang` module provides acceleration support for VLM model inference, suitable for graphics cards with Turing architecture and later (8GB+ VRAM). Installing this module can significantly improve model inference speed.
In the configuration, `all` includes both `core` and `sglang` modules, so `mineru[all]` and `mineru[core,sglang]` are equivalent.
```bash
uv pip install mineru[all]
```
> [!TIP]
> If exceptions occur during installation of the complete package including sglang, please refer to the [sglang official documentation](https://docs.sglang.ai/start/install.html) to try to resolve the issue, or directly use the [Docker](./docker_deployment.md) deployment method.
---
### Installing Lightweight Client to Connect to sglang-server
If you need to install a lightweight client on edge devices to connect to `sglang-server`, you can install the basic mineru package, which is very lightweight and suitable for devices with only CPU and network connectivity.
```bash
uv pip install mineru
```
---
### Using Pipeline Backend on Outdated Linux Systems
If your system is too outdated to meet the dependency requirements of `mineru[core]`, this option provides the minimum needed to run MinerU. It is suitable for old systems that cannot be upgraded and only need to use the pipeline backend.
```bash
uv pip install mineru[pipeline_old_linux]
```
# Quick Start
If you encounter any installation issues, please check the [FAQ](../faq/index.md) first.
## Online Experience
### Official online web application
The official online version has the same functionality as the client, with a beautiful interface and rich features; login is required.
- [![OpenDataLab](https://img.shields.io/badge/webapp_on_mineru.net-blue?logo=&labelColor=white)](https://mineru.net/OpenSourceTools/Extractor?source=github)
### Gradio-based online demo
A WebUI built with Gradio, offering a simple interface and only core parsing functionality; no login is required.
- [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
- [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
## Local Deployment
> [!WARNING]
> **Prerequisites - Hardware and Software Environment Support**
>
> To ensure the stability and reliability of the project, we have optimized and tested only specific hardware and software environments during development. This ensures that users can achieve optimal performance and encounter the fewest compatibility issues when deploying and running the project on recommended system configurations.
>
> By concentrating our resources and efforts on mainstream environments, our team can resolve potential bugs more efficiently and develop new features in a timely manner.
>
> In non-mainstream environments, due to the diversity of hardware and software configurations, as well as compatibility issues with third-party dependencies, we cannot guarantee 100% usability of the project. Therefore, for users who wish to use this project in non-recommended environments, we suggest carefully reading the documentation and FAQ first, as most issues have corresponding solutions in the FAQ. Additionally, we encourage community feedback on issues so that we can gradually expand our support range.
<table border="1">
<tr>
<td>Parsing Backend</td>
<td>pipeline</td>
<td>vlm-transformers</td>
<td>vlm-sglang</td>
</tr>
<tr>
<td>Operating System</td>
<td>Linux / Windows / macOS</td>
<td>Linux / Windows</td>
<td>Linux / Windows (via WSL2)</td>
</tr>
<tr>
<td>CPU Inference Support</td>
<td>✅</td>
<td colspan="2">❌</td>
</tr>
<tr>
<td>GPU Requirements</td>
<td>Turing architecture and later, 6GB+ VRAM or Apple Silicon</td>
<td colspan="2">Turing architecture and later, 8GB+ VRAM</td>
</tr>
<tr>
<td>Memory Requirements</td>
<td colspan="3">Minimum 16GB+, recommended 32GB+</td>
</tr>
<tr>
<td>Disk Space Requirements</td>
<td colspan="3">20GB+, SSD recommended</td>
</tr>
<tr>
<td>Python Version</td>
<td colspan="3">3.10-3.13</td>
</tr>
</table>
### Install MinerU
#### Install MinerU using pip or uv
```bash
pip install --upgrade pip
pip install uv
uv pip install -U "mineru[core]"
```
#### Install MinerU from source code
```bash
git clone https://github.com/opendatalab/MinerU.git
cd MinerU
uv pip install -e .[core]
```
> [!TIP]
> `mineru[core]` includes all core features except `sglang` acceleration, compatible with Windows / Linux / macOS systems, suitable for most users.
> If you need to use `sglang` acceleration for VLM model inference or install a lightweight client on edge devices, please refer to the documentation [Extension Modules Installation Guide](./extension_modules.md).
---
#### Deploy MinerU using Docker
MinerU provides a convenient Docker deployment method, which helps quickly set up the environment and solve some tricky environment compatibility issues.
You can get the [Docker Deployment Instructions](./docker_deployment.md) in the documentation.
---
### Using MinerU
The simplest command line invocation is:
```bash
mineru -p <input_path> -o <output_path>
```
You can use MinerU for PDF parsing through various methods such as command line, API, and WebUI. For detailed instructions, please refer to the [Usage Guide](../usage/index.md).
\ No newline at end of file
# MinerU Output Files Documentation
## Overview
After executing the `mineru` command, in addition to the main markdown file output, multiple auxiliary files are generated for debugging, quality inspection, and further processing. These files include:
- **Visual debugging files**: Help users intuitively understand the document parsing process and results
- **Structured data files**: Contain detailed parsing data for secondary development
The following sections provide detailed descriptions of each file's purpose and format.
## Visual Debugging Files
### Layout Analysis File (layout.pdf)
**File naming format**: `{original_filename}_layout.pdf`
**Functionality**:
- Visualizes layout analysis results for each page
- Numbers in the top-right corner of each detection box indicate reading order
- Different background colors distinguish different types of content blocks
**Use cases**:
- Check if layout analysis is correct
- Verify if reading order is reasonable
- Debug layout-related issues
![layout page example](../images/layout_example.png)
### Text Spans File (spans.pdf)
> [!NOTE]
> Only applicable to pipeline backend
**File naming format**: `{original_filename}_spans.pdf`
**Functionality**:
- Uses different colored line boxes to annotate page content based on span type
- Used for quality inspection and issue troubleshooting
**Use cases**:
- Quickly troubleshoot text loss issues
- Check inline formula recognition
- Verify text segmentation accuracy
![span page example](../images/spans_example.png)
## Structured Data Files
### Model Inference Results (model.json)
> [!NOTE]
> Only applicable to pipeline backend
**File naming format**: `{original_filename}_model.json`
#### Data Structure Definition
```python ```python
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from enum import IntEnum from enum import IntEnum
class CategoryType(IntEnum): class CategoryType(IntEnum):
"""Content category enumeration"""
title = 0 # Title title = 0 # Title
plain_text = 1 # Text plain_text = 1 # Text
abandon = 2 # Includes headers, footers, page numbers, and page annotations abandon = 2 # Including headers, footers, page numbers, and page annotations
figure = 3 # Image figure = 3 # Image
figure_caption = 4 # Image description figure_caption = 4 # Image caption
table = 5 # Table table = 5 # Table
table_caption = 6 # Table description table_caption = 6 # Table caption
table_footnote = 7 # Table footnote table_footnote = 7 # Table footnote
isolate_formula = 8 # Block formula isolate_formula = 8 # Interline formula
formula_caption = 9 # Formula label formula_caption = 9 # Interline formula number
embedding = 13 # Inline formula embedding = 13 # Inline formula
isolated = 14 # Block formula isolated = 14 # Interline formula
text = 15 # OCR recognition result text = 15 # OCR recognition result
class PageInfo(BaseModel): class PageInfo(BaseModel):
page_no: int = Field(description="Page number, the first page is 0", ge=0) """Page information"""
page_no: int = Field(description="Page number, first page is 0", ge=0)
height: int = Field(description="Page height", gt=0) height: int = Field(description="Page height", gt=0)
width: int = Field(description="Page width", ge=0) width: int = Field(description="Page width", ge=0)
class ObjectInferenceResult(BaseModel): class ObjectInferenceResult(BaseModel):
"""Object recognition result"""
category_id: CategoryType = Field(description="Category", ge=0) category_id: CategoryType = Field(description="Category", ge=0)
poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively") poly: list[float] = Field(description="Quadrilateral coordinates, format: [x0,y0,x1,y1,x2,y2,x3,y3]")
score: float = Field(description="Confidence of the inference result") score: float = Field(description="Confidence score of inference result")
latex: str | None = Field(description="LaTeX parsing result", default=None) latex: str | None = Field(description="LaTeX parsing result", default=None)
html: str | None = Field(description="HTML parsing result", default=None) html: str | None = Field(description="HTML parsing result", default=None)
class PageInferenceResults(BaseModel): class PageInferenceResults(BaseModel):
layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results", ge=0) """Page inference results"""
layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results")
page_info: PageInfo = Field(description="Page metadata") page_info: PageInfo = Field(description="Page metadata")
# Complete inference results
# The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
inference_result: list[PageInferenceResults] = [] inference_result: list[PageInferenceResults] = []
``` ```
#### Coordinate System Description
`poly` coordinate format: `[x0, y0, x1, y1, x2, y2, x3, y3]`
- Represents coordinates of top-left, top-right, bottom-right, bottom-left points respectively
- Coordinate origin is at the top-left corner of the page
![poly coordinate diagram](../images/poly.png)
#### Sample Data
```json ```json
[ [
...@@ -116,142 +165,127 @@ The format of the poly coordinates is \[x0, y0, x1, y1, x2, y2, x3, y3\], repres ...@@ -116,142 +165,127 @@ The format of the poly coordinates is \[x0, y0, x1, y1, x2, y2, x3, y3\], repres
] ]
``` ```
### some_pdf_model_output.txt (Applicable only to the VLM backend) ### VLM Output Results (model_output.txt)
This file contains the output of the VLM model, with each page's output separated by `----`. > [!NOTE]
Each page's output consists of text blocks starting with `<|box_start|>` and ending with `<|md_end|>`. > Only applicable to VLM backend
The meaning of each field is as follows:
- `<|box_start|>x0 y0 x1 y1<|box_end|>`
x0 y0 x1 y1 represent the coordinates of a quadrilateral, indicating the top-left and bottom-right points. The values are based on a normalized page size of 1000x1000.
- `<|ref_start|>type<|ref_end|>`
`type` indicates the block type. Possible values are:
```json
{
"text": "Text",
"title": "Title",
"image": "Image",
"image_caption": "Image Caption",
"image_footnote": "Image Footnote",
"table": "Table",
"table_caption": "Table Caption",
"table_footnote": "Table Footnote",
"equation": "Interline Equation"
}
```
- `<|md_start|>Markdown content<|md_end|>`
This field contains the Markdown content of the block. If `type` is `text`, the end of the text may contain the `<|txt_contd|>` tag, indicating that this block can be connected with the following `text` block(s).
If `type` is `table`, the content is in `otsl` format and needs to be converted into HTML for rendering in Markdown.
### some_pdf_middle.json **File naming format**: `{original_filename}_model_output.txt`
| Field Name | Description | #### File Format Description
|:---------------| :------------------------------------------------------------------------------------------------------------- |
| pdf_info | list, each element is a dict representing the parsing result of each PDF page, see the table below for details |
| \_backend | pipeline \| vlm, used to indicate the mode used in this intermediate parsing state |
| \_version_name | string, indicates the version of mineru used in this parsing |
<br> - Uses `----` to separate output results for each page
- Each page contains multiple text blocks starting with `<|box_start|>` and ending with `<|md_end|>`
**pdf_info** #### Field Meanings
Field structure description | Tag | Format | Description |
|-----|--------|-------------|
| Bounding box | `<\|box_start\|>x0 y0 x1 y1<\|box_end\|>` | Quadrilateral coordinates (top-left, bottom-right points), coordinate values after scaling page to 1000×1000 |
| Type tag | `<\|ref_start\|>type<\|ref_end\|>` | Content block type identifier |
| Content | `<\|md_start\|>markdown content<\|md_end\|>` | Markdown content of the block |
| Field Name | Description | #### Supported Content Types
| :------------------ | :----------------------------------------------------------------------------------------------------------------- |
| preproc_blocks | Intermediate result after PDF preprocessing, not yet segmented |
| layout_bboxes | Layout segmentation results, containing layout direction (vertical, horizontal), and bbox, sorted by reading order |
| page_idx | Page number, starting from 0 |
| page_size | Page width and height |
| \_layout_tree | Layout tree structure |
| images | list, each element is a dict representing an img_block |
| tables | list, each element is a dict representing a table_block |
| interline_equations | list, each element is a dict representing an interline_equation_block |
| discarded_blocks | List, block information returned by the model that needs to be dropped |
| para_blocks | Result after segmenting preproc_blocks |
In the above table, `para_blocks` is an array of dicts, each dict representing a block structure. A block can support up to one level of nesting. ```json
{
<br> "text": "Text",
"title": "Title",
**block** "image": "Image",
"image_caption": "Image caption",
The outer block is referred to as a first-level block, and the fields in the first-level block include: "image_footnote": "Image footnote",
"table": "Table",
| Field Name | Description | "table_caption": "Table caption",
| :--------- | :------------------------------------------------------------- | "table_footnote": "Table footnote",
| type | Block type (table\|image) | "equation": "Interline formula"
| bbox | Block bounding box coordinates | }
| blocks | list, each element is a dict representing a second-level block | ```
<br>
There are only two types of first-level blocks: "table" and "image". All other blocks are second-level blocks.
The fields in a second-level block include: #### Special Tags
| Field Name | Description | - `<|txt_contd|>`: Appears at the end of text, indicating that this text block can be connected with subsequent text blocks
| :--------- | :---------------------------------------------------------------------------------------------------------- | - Table content uses `otsl` format and needs to be converted to HTML for rendering in Markdown
| type | Block type |
| bbox | Block bounding box coordinates |
| lines | list, each element is a dict representing a line, used to describe the composition of a line of information |
Detailed explanation of second-level block types ### Intermediate Processing Results (middle.json)
| type | Description | **File naming format**: `{original_filename}_middle.json`
| :----------------- | :--------------------- |
| image_body | Main body of the image |
| image_caption | Image description text |
| image_footnote | Image footnote |
| table_body | Main body of the table |
| table_caption | Table description text |
| table_footnote | Table footnote |
| text | Text block |
| title | Title block |
| index | Index block |
| list | List block |
| interline_equation | Block formula |
<br> #### Top-level Structure
**line** | Field Name | Type | Description |
|------------|------|-------------|
| `pdf_info` | `list[dict]` | Array of parsing results for each page |
| `_backend` | `string` | Parsing mode: `pipeline` or `vlm` |
| `_version_name` | `string` | MinerU version number |
The field format of a line is as follows: #### Page Information Structure (pdf_info)
| Field Name | Description | | Field Name | Description |
| :--------- | :------------------------------------------------------------------------------------------------------ | |------------|-------------|
| bbox | Bounding box coordinates of the line | | `preproc_blocks` | Unsegmented intermediate results after PDF preprocessing |
| spans | list, each element is a dict representing a span, used to describe the composition of the smallest unit | | `layout_bboxes` | Layout segmentation results, including layout direction and bounding boxes, sorted by reading order |
| `page_idx` | Page number, starting from 0 |
| `page_size` | Page width and height `[width, height]` |
| `_layout_tree` | Layout tree structure |
| `images` | Image block information list |
| `tables` | Table block information list |
| `interline_equations` | Interline formula block information list |
| `discarded_blocks` | Block information to be discarded |
| `para_blocks` | Content block results after segmentation |
#### Block Structure Hierarchy
<br> ```
Level 1 blocks (table | image)
└── Level 2 blocks
└── Lines
└── Spans
```
**span** #### Level 1 Block Fields
| Field Name | Description | | Field Name | Description |
| :------------------ | :------------------------------------------------------------------------------------------------------- | |------------|-------------|
| bbox | Bounding box coordinates of the span | | `type` | Block type: `table` or `image` |
| type | Type of the span | | `bbox` | Rectangular box coordinates of the block `[x0, y0, x1, y1]` |
| content \| img_path | Text spans use content, chart spans use img_path to store the actual text or screenshot path information | | `blocks` | List of contained level 2 blocks |
The types of spans are as follows:
| type | Description |
| :----------------- | :------------- |
| image | Image |
| table | Table |
| text | Text |
| inline_equation | Inline formula |
| interline_equation | Block formula |
**Summary**
A span is the smallest storage unit for all elements. #### Level 2 Block Fields
The elements stored within para_blocks are block information. | Field Name | Description |
|------------|-------------|
The block structure is as follows: | `type` | Block type (see table below) |
| `bbox` | Rectangular box coordinates of the block |
First-level block (if any) -> Second-level block -> Line -> Span | `lines` | List of contained line information |
#### example #### Level 2 Block Types
| Type | Description |
|------|-------------|
| `image_body` | Image body |
| `image_caption` | Image caption text |
| `image_footnote` | Image footnote |
| `table_body` | Table body |
| `table_caption` | Table caption text |
| `table_footnote` | Table footnote |
| `text` | Text block |
| `title` | Title block |
| `index` | Index block |
| `list` | List block |
| `interline_equation` | Interline formula block |
#### Line and Span Structure
**Line fields**:
- `bbox`: Rectangular box coordinates of the line
- `spans`: List of contained spans
**Span fields**:
- `bbox`: Rectangular box coordinates of the span
- `type`: Span type (`image`, `table`, `text`, `inline_equation`, `interline_equation`)
- `content` | `img_path`: Text content or image path
#### Sample Data
```json ```json
{ {
...@@ -354,29 +388,37 @@ First-level block (if any) -> Second-level block -> Line -> Span ...@@ -354,29 +388,37 @@ First-level block (if any) -> Second-level block -> Line -> Span
} }
``` ```
### Content List (content_list.json)
### some_pdf_content_list.json **File naming format**: `{original_filename}_content_list.json`
This file is a JSON array where each element is a dict storing all readable content blocks in the document in reading order. #### Functionality
`content_list` can be viewed as a simplified version of `middle.json`. The content block types are mostly consistent with those in `middle.json`, but layout information is not included.
The content has the following types: This is a simplified version of `middle.json` that stores all readable content blocks in reading order as a flat structure, removing complex layout information for easier subsequent processing.
| type | desc | #### Content Types
|:---------|:--------------|
| image | Image |
| table | Table |
| text | Text / Title |
| equation | Block formula |
Please note that both `title` and text blocks in `content_list` are uniformly represented using the text type. The `text_level` field is used to distinguish the hierarchy of text blocks: | Type | Description |
- A block without the `text_level` field or with `text_level=0` represents body text. |------|-------------|
- A block with `text_level=1` represents a level-1 heading. | `image` | Image |
- A block with `text_level=2` represents a level-2 heading, and so on. | `table` | Table |
| `text` | Text/Title |
| `equation` | Interline formula |
Each content contains the `page_idx` field, indicating the page number (starting from 0) where the content block resides. #### Text Level Identification
#### example Text levels are distinguished through the `text_level` field:
- No `text_level` or `text_level: 0`: Body text
- `text_level: 1`: Level 1 heading
- `text_level: 2`: Level 2 heading
- And so on...
#### Common Fields
All content blocks include a `page_idx` field indicating the page number (starting from 0).
#### Sample Data
```json ```json
[ [
...@@ -438,3 +480,12 @@ Each content contains the `page_idx` field, indicating the page number (starting ...@@ -438,3 +480,12 @@ Each content contains the `page_idx` field, indicating the page number (starting
} }
] ]
``` ```
## Summary
The above files constitute MinerU's complete output results. Users can choose appropriate files for subsequent processing based on their needs:
- **Model outputs**: Use raw outputs (model.json, model_output.txt)
- **Debugging and verification**: Use visualization files (layout.pdf, spans.pdf)
- **Content extraction**: Use simplified files (*.md, content_list.json)
- **Secondary development**: Use structured files (middle.json)
# Advanced Command Line Parameters
## SGLang Acceleration Parameter Optimization
### Memory Optimization Parameters
> [!TIP]
> SGLang acceleration mode currently supports running on Turing architecture graphics cards with a minimum of 8GB VRAM, but graphics cards with <24GB VRAM may encounter insufficient memory issues. You can optimize memory usage with the following parameters:
>
> - If you encounter insufficient VRAM when using a single graphics card, you may need to reduce the KV cache size with `--mem-fraction-static 0.5`. If VRAM issues persist, try reducing it further to `0.4` or lower.
> - If you have two or more graphics cards, you can try using tensor parallelism (TP) mode to simply expand available VRAM: `--tp-size 2`
### Performance Optimization Parameters
> [!TIP]
> If you can already use SGLang normally for accelerated VLM model inference but still want to further improve inference speed, you can try the following parameters:
>
> - If you have multiple graphics cards, you can use SGLang's multi-card parallel mode to increase throughput: `--dp-size 2`
> - You can also enable `torch.compile` to accelerate inference speed by approximately 15%: `--enable-torch-compile`
### Parameter Passing Instructions
> [!TIP]
> - All officially supported SGLang parameters can be passed to MinerU through command line arguments, including the following commands: `mineru`, `mineru-sglang-server`, `mineru-gradio`, `mineru-api`
> - If you want to learn more about `sglang` parameter usage, please refer to the [SGLang official documentation](https://docs.sglang.ai/backend/server_arguments.html#common-launch-commands)
## GPU Device Selection and Configuration
### CUDA_VISIBLE_DEVICES Basic Usage
> [!TIP]
> - In any situation, you can specify visible GPU devices by adding the `CUDA_VISIBLE_DEVICES` environment variable at the beginning of the command line. For example:
> ```bash
> CUDA_VISIBLE_DEVICES=1 mineru -p <input_path> -o <output_path>
> ```
> - This specification method is effective for all command line calls, including `mineru`, `mineru-sglang-server`, `mineru-gradio`, and `mineru-api`, and applies to both `pipeline` and `vlm` backends.
### Common Device Configuration Examples
> [!TIP]
> Here are some common `CUDA_VISIBLE_DEVICES` setting examples:
> ```bash
> CUDA_VISIBLE_DEVICES=1 # Only device 1 will be seen
> CUDA_VISIBLE_DEVICES=0,1 # Devices 0 and 1 will be visible
> CUDA_VISIBLE_DEVICES="0,1" # Same as above, quotation marks are optional
> CUDA_VISIBLE_DEVICES=0,2,3 # Devices 0, 2, 3 will be visible; device 1 is masked
> CUDA_VISIBLE_DEVICES="" # No GPU will be visible
> ```
## Practical Application Scenarios
> [!TIP]
> Here are some possible usage scenarios:
>
> - If you have multiple graphics cards and need to specify cards 0 and 1, using multi-card parallelism to start `sglang-server`, you can use the following command:
> ```bash
> CUDA_VISIBLE_DEVICES=0,1 mineru-sglang-server --port 30000 --dp-size 2
> ```
>
> - If you have multiple GPUs and need to specify GPU 0–3, and start the `sglang-server` using multi-GPU data parallelism and tensor parallelism, you can use the following command:
> ```bash
> CUDA_VISIBLE_DEVICES=0,1,2,3 mineru-sglang-server --port 30000 --dp-size 2 --tp-size 2
> ```
>
> - If you have multiple graphics cards and need to start two `fastapi` services on cards 0 and 1, listening on different ports respectively, you can use the following commands:
> ```bash
> # In terminal 1
> CUDA_VISIBLE_DEVICES=0 mineru-api --host 127.0.0.1 --port 8000
> # In terminal 2
> CUDA_VISIBLE_DEVICES=1 mineru-api --host 127.0.0.1 --port 8001
> ```
# Command Line Tools Usage Instructions
## View Help Information
To view help information for the MinerU command line tools, use the `--help` parameter. The help output of each tool is shown below:
```bash
mineru --help
Usage: mineru [OPTIONS]
Options:
-v, --version Show version and exit
-p, --path PATH Input file path or directory (required)
-o, --output PATH Output directory (required)
-m, --method [auto|txt|ocr] Parsing method: auto (default), txt, ocr (pipeline backend only)
-b, --backend [pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client]
Parsing backend (default: pipeline)
-l, --lang [ch|ch_server|ch_lite|en|korean|japan|chinese_cht|ta|te|ka|latin|arabic|east_slavic|cyrillic|devanagari]
Specify document language (improves OCR accuracy, pipeline backend only)
-u, --url TEXT Service address when using sglang-client
-s, --start INTEGER Starting page number for parsing (0-based)
-e, --end INTEGER Ending page number for parsing (0-based)
-f, --formula BOOLEAN Enable formula parsing (default: enabled)
-t, --table BOOLEAN Enable table parsing (default: enabled)
-d, --device TEXT Inference device (e.g., cpu/cuda/cuda:0/npu/mps, pipeline backend only)
--vram INTEGER Maximum GPU VRAM usage per process (GB) (pipeline backend only)
--source [huggingface|modelscope|local]
Model source, default: huggingface
--help Show help information
```
```bash
mineru-api --help
Usage: mineru-api [OPTIONS]
Options:
--host TEXT Server host (default: 127.0.0.1)
--port INTEGER Server port (default: 8000)
--reload Enable auto-reload (development mode)
--help Show this message and exit.
```
```bash
mineru-gradio --help
Usage: mineru-gradio [OPTIONS]
Options:
--enable-example BOOLEAN Enable example files for input. The example
files to be input need to be placed in the
`example` folder within the directory where
the command is currently executed.
--enable-sglang-engine BOOLEAN Enable SgLang engine backend for faster
processing.
--enable-api BOOLEAN Enable gradio API for serving the
application.
--max-convert-pages INTEGER Set the maximum number of pages to convert
from PDF to Markdown.
--server-name TEXT Set the server name for the Gradio app.
--server-port INTEGER Set the server port for the Gradio app.
--latex-delimiters-type [a|b|all]
Set the type of LaTeX delimiters to use in
Markdown rendering: 'a' for type '$', 'b' for
type '()[]', 'all' for both types.
--help Show this message and exit.
```
## Environment Variables Description
Some parameters of MinerU command line tools have equivalent environment variable configurations. Generally, environment variable configurations have higher priority than command line parameters and take effect across all command line tools.
Here are the environment variables and their descriptions:
- `MINERU_DEVICE_MODE`: Specifies the inference device. Supports device types such as `cpu/cuda/cuda:0/npu/mps`; only effective for the `pipeline` backend.
- `MINERU_VIRTUAL_VRAM_SIZE`: Specifies the maximum GPU VRAM usage per process, in GB; only effective for the `pipeline` backend.
- `MINERU_MODEL_SOURCE`: Specifies the model source. Supports `huggingface/modelscope/local` and defaults to `huggingface`; set it to `modelscope` or `local` to use ModelScope or local models instead.
- `MINERU_TOOLS_CONFIG_JSON`: Specifies the configuration file path. Defaults to `mineru.json` in the user directory; set this variable to point to a different configuration file.
- `MINERU_FORMULA_ENABLE`: Controls whether formula parsing is enabled. Defaults to `true`; set it to `false` to disable formula parsing.
- `MINERU_TABLE_ENABLE`: Controls whether table parsing is enabled. Defaults to `true`; set it to `false` to disable table parsing.
# Usage Guide
This section provides comprehensive usage instructions for the project. The following sections guide you step by step from basic to advanced usage:
## Table of Contents
- [Quick Usage](./quick_usage.md) - Quick setup and basic usage
- [Model Source Configuration](./model_source.md) - Detailed configuration instructions for model sources
- [Command Line Tools](./cli_tools.md) - Detailed parameter descriptions for command line tools
- [Advanced Optimization Parameters](./advanced_cli_parameters.md) - Advanced parameter descriptions for command line tool adaptation
## Getting Started
We recommend reading the documentation in the order listed above, which will help you better understand and use the project features.
If you encounter issues during usage, please check the [FAQ](../faq/index.md).
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment