Merge pull request #969 from opendatalab/release-0.9.3

Release 0.9.3

Merge pull request #969 from opendatalab/release-0.9.3
Release 0.9.3
845a3ff0 · Xiaomeng Zhao · GitHub · d0558abb · 6083e109 · 845a3ff0
Unverified Commit 845a3ff0 authored Nov 15, 2024 by Xiaomeng Zhao Committed by GitHub Nov 15, 2024
20 changed files
--- a/next_docs/zh_cn/conf.py
+++ b/next_docs/zh_cn/conf.py
@@ -15,7 +15,8 @@ import subprocess
 import sys
 from sphinx.ext import autodoc
+from docutils import nodes
+from docutils.parsers.rst import Directive
 def install(package):
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
@@ -33,8 +34,8 @@ sys.path.insert(0, os.path.abspath('../..'))
 # -- Project information -----------------------------------------------------
 project = 'MinerU'
-copyright = '2024, OpenDataLab'
+copyright = '2024, MinerU Contributors'
-author = 'MinerU Contributors'
+author = 'OpenDataLab'
 # The full version, including alpha/beta/rc tags
 version_file = '../../magic_pdf/libs/version.py'
@@ -58,10 +59,20 @@ extensions = [
    'sphinx_copybutton',
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
+    'sphinx.ext.inheritance_diagram',
    'myst_parser',
    'sphinxarg.ext',
+    'sphinxcontrib.autodoc_pydantic',
 ]
+# class hierarchy diagram
+inheritance_graph_attrs = dict(rankdir="LR", size='"8.0, 12.0"', fontsize=14, ratio='compress')
+inheritance_node_attrs = dict(shape='ellipse', fontsize=14, height=0.75)
+inheritance_edge_attrs = dict(arrow='vee')
+autodoc_pydantic_model_show_json = True
+autodoc_pydantic_model_show_config_summary = False
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
@@ -84,7 +95,7 @@ language = 'zh_CN'
 html_theme = 'sphinx_book_theme'
 html_logo = '_static/image/logo.png'
 html_theme_options = {
-    'path_to_docs': 'docs/zh_cn',
+    'path_to_docs': 'next_docs/zh_cn',
    'repository_url': 'https://github.com/opendatalab/MinerU',
    'use_repository_button': True,
 }
@@ -120,3 +131,21 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):
 autodoc.ClassDocumenter = MockedClassDocumenter
 navigation_with_keys = False
+# add custom directive 
+class VideoDirective(Directive):
+    required_arguments = 1
+    optional_arguments = 0
+    final_argument_whitespace = True
+    option_spec = {}
+    def run(self):
+        url = self.arguments[0]
+        video_node = nodes.raw('', f'<iframe width="560" height="315" src="{url}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>', format='html')
+        return [video_node]
+def setup(app):
+    app.add_directive('video', VideoDirective)
\ No newline at end of file
--- a/next_docs/zh_cn/index.rst
+++ b/next_docs/zh_cn/index.rst
@@ -3,7 +3,7 @@
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.
-欢迎来到 MinerU 的中文文档
+欢迎来到 MinerU 文档库
 ==============================================
 .. figure:: ./_static/image/logo.png
@@ -14,7 +14,7 @@
 .. raw:: html
   <p style="text-align:center">
-   <strong> 一站式开源高质量数据提取工具
+   <strong> 一站式、高质量的开源文档提取工具
   </strong>
   </p>
@@ -24,3 +24,58 @@
   <a class="github-button" href="https://github.com/opendatalab/MinerU/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
   <a class="github-button" href="https://github.com/opendatalab/MinerU/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
   </p>
+项目介绍
+--------------------
+MinerU是一款将PDF转化为机器可读格式的工具（如markdown、json），可以很方便地抽取为任意格式。
+MinerU诞生于\ `书生-浦语 <https://github.com/InternLM/InternLM>`__\ 的预训练过程中，我们将会集中精力解决科技文献中的符号转化问题，希望在大模型时代为科技发展做出贡献。
+相比国内外知名商用产品MinerU还很年轻，如果遇到问题或者结果不及预期请到\ `issue <https://github.com/opendatalab/MinerU/issues>`__\ 提交问题，同时\ **附上相关PDF**\ 。
+.. video:: https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
+主要功能
+--------
+-  删除页眉、页脚、脚注、页码等元素，确保语义连贯
+-  输出符合人类阅读顺序的文本，适用于单栏、多栏及复杂排版
+-  保留原文档的结构，包括标题、段落、列表等
+-  提取图像、图片描述、表格、表格标题及脚注
+-  自动识别并转换文档中的公式为LaTeX格式
+-  自动识别并转换文档中的表格为LaTeX或HTML格式
+-  自动检测扫描版PDF和乱码PDF，并启用OCR功能
+-  OCR支持84种语言的检测与识别
+-  支持多种输出格式，如多模态与NLP的Markdown、按阅读顺序排序的JSON、含有丰富信息的中间格式等
+-  支持多种可视化结果，包括layout可视化、span可视化等，便于高效确认输出效果与质检
+-  支持CPU和GPU环境
+-  兼容Windows、Linux和Mac平台
+用户指南
+-------------
+.. toctree::
+   :maxdepth: 2
+   :caption: 用户指南
+   user_guide
+API 接口
+-------------
+本章节主要介绍函数、类、类方法的细节信息
+目前只提供英文版本的接口文档，请切换到英文版本的接口文档！
+附录
+------------------
+.. toctree::
+   :maxdepth: 1
+   :caption: 附录
+   additional_notes/known_issues
+   additional_notes/faq
+   additional_notes/glossary
--- a/next_docs/zh_cn/user_guide.rst
+++ b/next_docs/zh_cn/user_guide.rst
+.. toctree::
+    :maxdepth: 2
+    user_guide/install
+    user_guide/quick_start
+    user_guide/tutorial
+    user_guide/data
--- a/next_docs/zh_cn/user_guide/data.rst
+++ b/next_docs/zh_cn/user_guide/data.rst
+数据
+=========
+.. toctree::
+   :maxdepth: 2
+   :caption: 数据
+   data/dataset
+   data/read_api
+   data/data_reader_writer 
+   data/io
--- a/next_docs/zh_cn/user_guide/data/data_reader_writer.rst
+++ b/next_docs/zh_cn/user_guide/data/data_reader_writer.rst
+数据读取和写入类 
+=================
+旨在从不同的媒介读取或写入字节。如果 MinerU 没有提供合适的类，你可以实现新的类以满足个人场景的需求。实现新的类非常容易，唯一的要求是继承自 DataReader 或 DataWriter。
+.. code:: python
+    class SomeReader(DataReader):
+        def read(self, path: str) -> bytes:
+            pass
+        def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+            pass
+    class SomeWriter(DataWriter):
+        def write(self, path: str, data: bytes) -> None:
+            pass
+        def write_string(self, path: str, data: str) -> None:
+            pass
+读者可能会对 io 和本节的区别感到好奇。乍一看，这两部分非常相似。io 提供基本功能，而本节则更注重应用层面。用户可以构建自己的类以满足特定应用需求，这些类可能共享相同的基本 IO 功能。这就是为什么我们有 io。
+重要类
+------------
+.. code:: python
+    class FileBasedDataReader(DataReader):
+        def __init__(self, parent_dir: str = ''):
+            pass
+    class FileBasedDataWriter(DataWriter):
+        def __init__(self, parent_dir: str = '') -> None:
+            pass
+类 FileBasedDataReader 使用单个参数 parent_dir 初始化。这意味着 FileBasedDataReader 提供的每个方法将具有以下特性：
+#. 从绝对路径文件读取内容，parent_dir 将被忽略。
+#. 从相对路径读取文件，首先将路径与 parent_dir 连接，然后从合并后的路径读取内容。
+.. note::
+    `FileBasedDataWriter` 与 `FileBasedDataReader` 具有相同的行为。
+.. code:: python
+    class MultiS3Mixin:
+        def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
+            pass
+    class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
+        pass
+MultiBucketS3DataReader 提供的所有读取相关方法将具有以下特性：
+#. 从完整的 S3 格式路径读取对象，例如 s3://test_bucket/test_object，default_prefix 将被忽略。
+#. 从相对路径读取对象，首先将路径与 default_prefix 连接并去掉 bucket_name，然后读取内容。bucket_name 是将 default_prefix 用分隔符 ``/`` 分割后的第一个元素。
+.. note::
+    MultiBucketS3DataWriter 与 MultiBucketS3DataReader 具有类似的行为。
+.. code:: python
+    class S3DataReader(MultiBucketS3DataReader):
+        pass
+S3DataReader 基于 MultiBucketS3DataReader 构建，但仅支持单个桶。S3DataWriter 也是类似的情况。
+读取示例
+---------
+.. code:: python
+    from magic_pdf.data.data_reader_writer import * 
+    # 文件相关的
+    file_based_reader1 = FileBasedDataReader('')
+    ## 将读取文件 abc 
+    file_based_reader1.read('abc') 
+    file_based_reader2 = FileBasedDataReader('/tmp')
+    ## 将读取 /tmp/abc
+    file_based_reader2.read('abc')
+    ## 将读取 /var/logs/message.txt
+    file_based_reader2.read('/var/logs/message.txt')
+    # 多桶 S3 相关的
+    multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", [S3Config(
+            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=test_bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        )])
+    ## 将读取 s3://test_bucket1/test_prefix/abc
+    multi_bucket_s3_reader1.read('abc')
+    ## 将读取 s3://test_bucket1/efg
+    multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
+    ## 将读取 s3://test_bucket2/abc
+    multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
+    # S3 相关的
+    s3_reader1 = S3DataReader(
+        default_prefix_without_bucket = "test_prefix",
+        bucket = "test_bucket",
+        ak = "ak",
+        sk = "sk",
+        endpoint_url = "localhost"
+    )
+    ## 将读取 s3://test_bucket/test_prefix/abc 
+    s3_reader1.read('abc')
+    ## 将读取 s3://test_bucket/efg
+    s3_reader1.read('s3://test_bucket/efg')
+写入示例
+----------
+.. code:: python
+    from magic_pdf.data.data_reader_writer import *
+    # 文件相关的
+    file_based_writer1 = FileBasedDataWriter('')
+    ## 将写入 123 到 abc
+    file_based_writer1.write('abc', '123'.encode()) 
+    ## 将写入 123 到 abc
+    file_based_writer1.write_string('abc', '123') 
+    file_based_writer2 = FileBasedDataWriter('/tmp')
+    ## 将写入 123 到 /tmp/abc
+    file_based_writer2.write_string('abc', '123')
+    ## 将写入 123 到 /var/logs/message.txt
+    file_based_writer2.write_string('/var/logs/message.txt', '123')
+    # 多桶 S3 相关的
+    multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", [S3Config(
+            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=test_bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        )])
+    ## 将写入 123 到 s3://test_bucket1/test_prefix/abc
+    multi_bucket_s3_writer1.write_string('abc', '123')
+    ## 将写入 123 到 s3://test_bucket1/test_prefix/abc
+    multi_bucket_s3_writer1.write('abc', '123'.encode())
+    ## 将写入 123 到 s3://test_bucket1/efg
+    multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
+    ## 将写入 123 到 s3://test_bucket2/abc
+    multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
+    # S3 相关的
+    s3_writer1 = S3DataWriter(
+        default_prefix_without_bucket = "test_prefix",
+        bucket = "test_bucket",
+        ak = "ak",
+        sk = "sk",
+        endpoint_url = "localhost"
+    )
+    ## 将写入 123 到 s3://test_bucket/test_prefix/abc 
+    s3_writer1.write('abc', '123'.encode())
+    ## 将写入 123 到 s3://test_bucket/test_prefix/abc 
+    s3_writer1.write_string('abc', '123')
+    ## 将写入 123 到 s3://test_bucket/efg
+    s3_writer1.write('s3://test_bucket/efg', '123'.encode())
--- a/next_docs/zh_cn/user_guide/data/dataset.rst
+++ b/next_docs/zh_cn/user_guide/data/dataset.rst
+数据集
+======
+导入数据类
+-----------
+数据集
+^^^^^^^^
+每个 PDF 或图像将形成一个 Dataset。众所周知，PDF 有两种类别，分别对应 :ref:`TXT <digital_method_section>` 和 :ref:`OCR <ocr_method_section>` 两种解析方法。从图像中可以获得 ImageDataset，它是 Dataset 的子类；从 PDF 文件中可以获得 PymuDocDataset。ImageDataset 和 PymuDocDataset 之间的区别在于 ImageDataset 仅支持 OCR 解析方法，而 PymuDocDataset 支持 OCR 和 TXT 两种方法。
+.. note::
+    实际上，有些 PDF 可能是由图像生成的，这意味着它们不支持 `TXT` 方法。目前，由用户保证不会调用 `TXT` 方法来解析图像生成的 PDF
+PDF 解析方法
+---------------
+.. _ocr_method_section:
+OCR
+^^^^
+通过 光学字符识别 技术提取字符。
+.. _digital_method_section:
+TXT
+^^^^^^^^
+通过第三方库提取字符，目前我们使用的是 pymupdf。
--- a/next_docs/zh_cn/user_guide/data/io.rst
+++ b/next_docs/zh_cn/user_guide/data/io.rst
+IO
+====
+旨在从不同的媒介读取或写入字节。目前，我们提供了 S3Reader 和 S3Writer 用于兼容 AWS S3 的媒介，以及 HttpReader 和 HttpWriter 用于远程 HTTP 文件。如果 MinerU 没有提供合适的类，你可以实现新的类以满足个人场景的需求。实现新的类非常容易，唯一的要求是继承自 IOReader 或 IOWriter。
+.. code:: python
+    class SomeReader(IOReader):
+        def read(self, path: str) -> bytes:
+            pass
+        def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+            pass
+    class SomeWriter(IOWriter):
+        def write(self, path: str, data: bytes) -> None:
+            pass
--- a/next_docs/zh_cn/user_guide/data/read_api.rst
+++ b/next_docs/zh_cn/user_guide/data/read_api.rst
+read_api
+=========
+从文件或目录读取内容以创建 Dataset。目前，我们提供了几个覆盖某些场景的函数。如果你有新的、大多数用户都会遇到的场景，可以在官方 GitHub 问题页面上发布详细描述。同时，实现你自己的读取相关函数也非常容易。
+重要函数
+---------
+read_jsonl
+^^^^^^^^^^^^^^^^
+从本地机器或远程 S3 上的 JSONL 文件读取内容。如果你想了解更多关于 JSONL 的信息，请参阅 :doc:`../../additional_notes/glossary`。
+.. code:: python
+    from magic_pdf.data.io.read_api import *
+    # 从本地机器读取 JSONL
+    datasets = read_jsonl("tt.jsonl", None)
+    # 从远程 S3 读取 JSONL
+    datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
+read_local_pdfs
+^^^^^^^^^^^^^^^^
+从路径或目录读取 PDF 文件。
+.. code:: python
+    from magic_pdf.data.io.read_api import *
+    # 读取 PDF 路径
+    datasets = read_local_pdfs("tt.pdf")
+    # 读取目录下的 PDF 文件
+    datasets = read_local_pdfs("pdfs/")
+read_local_images
+^^^^^^^^^^^^^^^^^^^
+从路径或目录读取图像。
+.. code:: python
+    from magic_pdf.data.io.read_api import *
+    # 从图像路径读取
+    datasets = read_local_images("tt.png")
+    # 从目录读取以 suffixes 数组中指定后缀结尾的文件
+    datasets = read_local_images("images/", suffixes=["png", "jpg"])
--- a/next_docs/zh_cn/user_guide/install.rst
+++ b/next_docs/zh_cn/user_guide/install.rst
+安装
+==============
+.. toctree::
+   :maxdepth: 1
+   :caption: 安装文档
+   install/install
+   install/boost_with_cuda
+   install/download_model_weight_files
--- a/next_docs/zh_cn/user_guide/install/boost_with_cuda.rst
+++ b/next_docs/zh_cn/user_guide/install/boost_with_cuda.rst
+使用 CUDA 加速
+================
+如果您的设备支持 CUDA 并符合主线环境的 GPU 要求，您可以使用 GPU 加速。请选择适合您系统的指南：
+-  :ref:`ubuntu_22_04_lts_section`
+-  :ref:`windows_10_or_11_section`
+-  使用 Docker 快速部署
+.. admonition:: Important
+    :class: tip
+    Docker 需要至少 16GB 显存的 GPU，并且所有加速功能默认启用。
+    在运行此 Docker 容器之前，您可以使用以下命令检查您的设备是否支持 Docker 上的 CUDA 加速。
+    .. code-block:: sh
+      docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+.. code:: sh
+    wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
+    docker build -t mineru:latest .
+    docker run --rm -it --gpus=all mineru:latest /bin/bash
+    magic-pdf --help
+.. _ubuntu_22_04_lts_section:
+Ubuntu 22.04 LTS
+----------------
+1. 检测是否已安装 nvidia 驱动
+---------------------------
+.. code:: bash
+   nvidia-smi
+如果看到类似如下的信息，说明已经安装了 nvidia 驱动，可以跳过步骤2
+.. admonition:: Important
+    :class: tip
+    ``CUDA Version`` 显示的版本号应 >=12.1，如显示的版本号小于12.1，请升级驱动
+.. code:: text
+   +---------------------------------------------------------------------------------------+
+   | NVIDIA-SMI 537.34                 Driver Version: 537.34       CUDA Version: 12.2     |
+   |-----------------------------------------+----------------------+----------------------+
+   | GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
+   | Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
+   |                                         |                      |               MIG M. |
+   |=========================================+======================+======================|
+   |   0  NVIDIA GeForce RTX 3060 Ti   WDDM  | 00000000:01:00.0  On |                  N/A |
+   |  0%   51C    P8              12W / 200W |   1489MiB /  8192MiB |      5%      Default |
+   |                                         |                      |                  N/A |
+   +-----------------------------------------+----------------------+----------------------+
+2. 安装驱动
+-----------
+如没有驱动，则通过如下命令
+.. code:: bash
+   sudo apt-get update
+   sudo apt-get install nvidia-driver-545
+安装专有驱动，安装完成后，重启电脑
+.. code:: bash
+   reboot
+3. 安装 anaconda
+----------------
+如果已安装 conda，可以跳过本步骤
+.. code:: bash
+   wget -U NoSuchBrowser/1.0 https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
+   bash Anaconda3-2024.06-1-Linux-x86_64.sh
+最后一步输入yes，关闭终端重新打开
+4. 使用 conda 创建环境
+---------------------
+需指定 python 版本为3.10
+.. code:: bash
+   conda create -n MinerU python=3.10
+   conda activate MinerU
+5. 安装应用
+-----------
+.. code:: bash
+   pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
+.. admonition:: Important
+    :class: tip
+    下载完成后，务必通过以下命令确认magic-pdf的版本是否正确
+    .. code:: bash
+       magic-pdf --version
+    如果版本号小于0.7.0，请到issue中向我们反馈
+6. 下载模型
+-----------
+详细参考 :doc:`download_model_weight_files`
+7. 了解配置文件存放的位置
+-------------------------
+完成\ `6.下载模型 <#6-下载模型>`__\ 步骤后，脚本会自动生成用户目录下的magic-pdf.json文件，并自动配置默认模型路径。您可在【用户目录】下找到magic-pdf.json文件。
+.. admonition:: Tip
+    :class: tip
+    linux用户目录为 “/home/用户名”
+8. 第一次运行
+-------------
+从仓库中下载样本文件，并测试
+.. code:: bash
+   wget https://gitee.com/myhloli/MinerU/raw/master/demo/small_ocr.pdf
+   magic-pdf -p small_ocr.pdf -o ./output
+9. 测试CUDA加速
+---------------
+如果您的显卡显存大于等于 **8GB**，可以进行以下流程，测试 CUDA 解析加速效果
+**1.修改【用户目录】中配置文件 magic-pdf.json 中 "device-mode" 的值**
+.. code:: json
+   {
+     "device-mode":"cuda"
+   }
+**2.运行以下命令测试 cuda 加速效果**
+.. code:: bash
+   magic-pdf -p small_ocr.pdf -o ./output
+.. admonition:: Tip
+    :class: tip
+    CUDA 加速是否生效可以根据 log 中输出的各个阶段 cost 耗时来简单判断，通常情况下， ``layout detection cost`` 和 ``mfr time`` 应提速10倍以上。
+10. 为 ocr 开启 cuda 加速
+---------------------
+**1.下载paddlepaddle-gpu, 安装完成后会自动开启ocr加速**
+.. code:: bash
+   python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
+**2.运行以下命令测试ocr加速效果**
+.. code:: bash
+   magic-pdf -p small_ocr.pdf -o ./output
+.. admonition:: Tip
+    :class: tip
+    CUDA 加速是否生效可以根据 log 中输出的各个阶段 cost 耗时来简单判断，通常情况下， ``ocr cost`` 应提速10倍以上。
+.. _windows_10_or_11_section:
+Windows 10/11
+--------------
+1. 安装 cuda 和 cuDNN
+------------------
+需要安装的版本 CUDA 11.8 + cuDNN 8.7.0
+-  CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
+-  cuDNN v8.7.0 (November 28th, 2022), for CUDA 11.x https://developer.nvidia.com/rdp/cudnn-archive
+2. 安装 anaconda
+---------------
+如果已安装 conda，可以跳过本步骤
+下载链接：https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2024.06-1-Windows-x86_64.exe
+3. 使用 conda 创建环境
+---------------------
+需指定python版本为3.10
+.. code:: bash
+   conda create -n MinerU python=3.10
+   conda activate MinerU
+4. 安装应用
+-----------
+.. code:: bash
+   pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
+.. admonition:: Important
+    :class: tip
+    下载完成后，务必通过以下命令确认magic-pdf的版本是否正确
+    .. code:: bash
+      magic-pdf --version
+    如果版本号小于0.7.0，请到issue中向我们反馈
+5. 下载模型
+-----------
+详细参考 :doc:`download_model_weight_files`
+6. 了解配置文件存放的位置
+-------------------------
+完成\ `5.下载模型 <#5-下载模型>`__\ 步骤后，脚本会自动生成用户目录下的magic-pdf.json文件，并自动配置默认模型路径。您可在【用户目录】下找到 magic-pdf.json 文件。
+.. admonition:: Tip
+    :class: tip
+    windows 用户目录为 “C:/Users/用户名”
+7. 第一次运行
+-------------
+从仓库中下载样本文件，并测试
+.. code:: powershell
+    wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
+    magic-pdf -p small_ocr.pdf -o ./output
+8. 测试 CUDA 加速
+---------------
+如果您的显卡显存大于等于 **8GB**，可以进行以下流程，测试 CUDA 解析加速效果
+**1.覆盖安装支持cuda的torch和torchvision**
+.. code:: bash
+   pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
+.. admonition:: Important
+    :class: tip
+    务必在命令中指定以下版本
+    .. code:: bash
+      torch==2.3.1 torchvision==0.18.1
+    这是我们支持的最高版本，如果不指定版本会自动安装更高版本导致程序无法运行
+**2.修改【用户目录】中配置文件 magic-pdf.json 中 "device-mode" 的值**
+.. code:: json
+   {
+     "device-mode":"cuda"
+   }
+**3.运行以下命令测试cuda加速效果**
+.. code:: bash
+   magic-pdf -p small_ocr.pdf -o ./output
+.. admonition:: Tip
+    :class: tip
+    CUDA 加速是否生效可以根据 log 中输出的各个阶段的耗时来简单判断，通常情况下， ``layout detection time`` 和 ``mfr time`` 应提速10倍以上。
+9. 为 ocr 开启 cuda 加速
+--------------------
+**1.下载paddlepaddle-gpu, 安装完成后会自动开启ocr加速**
+.. code:: bash
+   pip install paddlepaddle-gpu==2.6.1
+**2.运行以下命令测试ocr加速效果**
+.. code:: bash
+   magic-pdf -p small_ocr.pdf -o ./output
+.. admonition:: Tip
+    :class: tip
+    CUDA 加速是否生效可以根据 log 中输出的各个阶段 cost 耗时来简单判断，通常情况下， ``ocr time`` 应提速10倍以上。
--- a/next_docs/zh_cn/user_guide/install/download_model_weight_files.rst
+++ b/next_docs/zh_cn/user_guide/install/download_model_weight_files.rst
+下载模型权重文件
+==================
+模型下载分为初始下载和更新到模型目录。请参考相应的文档以获取如何操作的指示。
+首次下载模型文件
+-----------------
+模型文件可以从 Hugging Face 或 Model Scope下载，由于网络原因，国内用户访问HF可能会失败，请使用 ModelScope。
+方法一：从 Hugging Face 下载模型
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+使用python脚本 从Hugging Face下载模型文件
+.. code:: bash
+   pip install huggingface_hub
+   wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
+   python download_models_hf.py
+python脚本会自动下载模型文件并配置好配置文件中的模型目录
+方法二：从 ModelScope 下载模型
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+使用python脚本从 ModelScope 下载模型文件
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. code:: bash
+   pip install modelscope
+   wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py -O download_models.py
+   python download_models.py
+python脚本会自动下载模型文件并配置好配置文件中的模型目录
+配置文件可以在用户目录中找到，文件名为\ ``magic-pdf.json``
+.. admonition:: Tip
+    :class: tip
+    windows的用户目录为 “C:\\Users\\用户名”, linux用户目录为 “/home/用户名”, macOS用户目录为 “/Users/用户名”
+此前下载过模型，如何更新
+--------------------
+1. 通过 git lfs 下载过模型
+^^^^^^^^^^^^^^^^^^^^^^^
+.. admonition:: Important
+    :class: tip
+    由于部分用户反馈通过git lfs下载模型文件遇到下载不全和模型文件损坏情况，现已不推荐使用该方式下载。
+    0.9.x及以后版本由于PDF-Extract-Kit 1.0更换仓库和新增layout排序模型，不能通过 ``git pull``\命令更新，需要使用python脚本一键更新。
+当magic-pdf <= 0.8.1时，如此前通过 git lfs 下载过模型文件，可以进入到之前的下载目录中，通过 ``git pull`` 命令更新模型。
+2. 通过 Hugging Face 或 Model Scope 下载过模型
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+如此前通过 HuggingFace 或 Model Scope 下载过模型，可以重复执行此前的模型下载 python 脚本，将会自动将模型目录更新到最新版本。
\ No newline at end of file
--- a/next_docs/zh_cn/user_guide/install/install.rst
+++ b/next_docs/zh_cn/user_guide/install/install.rst
+安装
+=====
+如果您遇到任何安装问题，请首先查阅 :doc:`../../additional_notes/faq`。如果解析结果不如预期，可参考 :doc:`../../additional_notes/known_issues`。
+.. admonition:: Warning
+    :class: tip
+    **安装前必看——软硬件环境支持说明**
+    为了确保项目的稳定性和可靠性，我们在开发过程中仅对特定的软硬件环境进行优化和测试。这样当用户在推荐的系统配置上部署和运行项目时，能够获得最佳的性能表现和最少的兼容性问题。
+    通过集中资源和精力于主线环境，我们团队能够更高效地解决潜在的BUG，及时开发新功能。
+    在非主线环境中，由于硬件、软件配置的多样性，以及第三方依赖项的兼容性问题，我们无法100%保证项目的完全可用性。因此，对于希望在非推荐环境中使用本项目的用户，我们建议先仔细阅读文档以及 :doc:`../../additional_notes/faq` ，大多数问题已经在 :doc:`../../additional_notes/faq` 中有对应的解决方案，除此之外我们鼓励社区反馈问题，以便我们能够逐步扩大支持范围。
+.. raw:: html
+    <style>
+        table, th, td {
+        border: 1px solid black;
+        border-collapse: collapse;
+        }
+    </style>
+    <table>
+        <tr>
+            <td colspan="3" rowspan="2">操作系统</td>
+        </tr>
+        <tr>
+            <td>Ubuntu 22.04 LTS</td>
+            <td>Windows 10 / 11</td>
+            <td>macOS 11+</td>
+        </tr>
+        <tr>
+            <td colspan="3">CPU</td>
+            <td>x86_64(暂不支持ARM Linux)</td>
+            <td>x86_64(暂不支持ARM Windows)</td>
+            <td>x86_64 / arm64</td>
+        </tr>
+        <tr>
+            <td colspan="3">内存</td>
+            <td colspan="3">大于等于16GB，推荐32G以上</td>
+        </tr>
+        <tr>
+            <td colspan="3">python版本</td>
+            <td colspan="3">3.10 (请务必通过conda创建3.10虚拟环境)</td>
+        </tr>
+        <tr>
+            <td colspan="3">Nvidia Driver 版本</td>
+            <td>latest(专有驱动)</td>
+            <td>latest</td>
+            <td>None</td>
+        </tr>
+        <tr>
+            <td colspan="3">CUDA环境</td>
+            <td>自动安装[12.1(pytorch)+11.8(paddle)]</td>
+            <td>11.8(手动安装)+cuDNN v8.7.0(手动安装)</td>
+            <td>None</td>
+        </tr>
+        <tr>
+            <td rowspan="2">GPU硬件支持列表</td>
+            <td colspan="2">最低要求 8G+显存</td>
+            <td colspan="2">3060ti/3070/4060<br>
+            8G显存可开启layout、公式识别和ocr加速</td>
+            <td rowspan="2">None</td>
+        </tr>
+        <tr>
+            <td colspan="2">推荐配置 10G+显存</td>
+            <td colspan="2">3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090<br>
+            10G显存及以上可以同时开启layout、公式识别和ocr加速和表格识别加速<br>
+            </td>
+        </tr>
+    </table>
+创建环境
+~~~~~~~~~~
+.. code-block:: shell
+    conda create -n MinerU python=3.10
+    conda activate MinerU
+    pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
+下载模型权重文件
+~~~~~~~~~~~~~~~
+.. code-block:: shell
+    pip install huggingface_hub
+    wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
+    python download_models_hf.py
+MinerU 已安装，查看 :doc:`../quick_start` 或阅读 :doc:`boost_with_cuda` 以加速推理。
--- a/next_docs/zh_cn/user_guide/quick_start.rst
+++ b/next_docs/zh_cn/user_guide/quick_start.rst
+快速开始 
+==============
+从这里开始学习 MinerU 基本使用方法。若还没有安装，请参考安装文档进行安装
+.. toctree::
+    :maxdepth: 1
+    :caption: 快速开始
+    quick_start/command_line
+    quick_start/to_markdown
--- a/next_docs/zh_cn/user_guide/quick_start/command_line.rst
+++ b/next_docs/zh_cn/user_guide/quick_start/command_line.rst
+命令行
+========
+.. code:: bash
+   magic-pdf --help
+   Usage: magic-pdf [OPTIONS]
+   Options:
+     -v, --version                display the version and exit
+     -p, --path PATH              local pdf filepath or directory  [required]
+     -o, --output-dir PATH        output local directory  [required]
+     -m, --method [ocr|txt|auto]  the method for parsing pdf. ocr: using ocr
+                                  technique to extract information from pdf. txt:
+                                  suitable for the text-based pdf only and
+                                  outperform ocr. auto: automatically choose the
+                                  best method for parsing pdf from ocr and txt.
+                                  without method specified, auto will be used by
+                                  default.
+     -l, --lang TEXT              Input the languages in the pdf (if known) to
+                                  improve OCR accuracy.  Optional. You should
+                                  input "Abbreviation" with language form url: ht
+                                  tps://paddlepaddle.github.io/PaddleOCR/en/ppocr
+                                  /blog/multi_languages.html#5-support-languages-
+                                  and-abbreviations
+     -d, --debug BOOLEAN          Enables detailed debugging information during
+                                  the execution of the CLI commands.
+     -s, --start INTEGER          The starting page for PDF parsing, beginning
+                                  from 0.
+     -e, --end INTEGER            The ending page for PDF parsing, beginning from
+                                  0.
+     --help                       Show this message and exit.
+   ## show version
+   magic-pdf -v
+   ## command line example
+   magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
+``{some_pdf}`` 可以是单个 PDF 文件或者一个包含多个 PDF 文件的目录。 解析的结果文件存放在目录 ``{some_output_dir}`` 下。 生成的结果文件列表如下所示：
+.. code:: text
+   ├── some_pdf.md                          # markdown 文件
+   ├── images                               # 存放图片目录
+   ├── some_pdf_layout.pdf                  # layout 绘图 （包含layout阅读顺序）
+   ├── some_pdf_middle.json                 # minerU 中间处理结果
+   ├── some_pdf_model.json                  # 模型推理结果
+   ├── some_pdf_origin.pdf                  # 原 pdf 文件
+   ├── some_pdf_spans.pdf                   # 最小粒度的bbox位置信息绘图
+   └── some_pdf_content_list.json           # 按阅读顺序排列的富文本json
+.. admonition:: Tip
+   :class: tip
+   欲知更多有关结果文件的信息，请参考 :doc:`../tutorial/output_file_description`
--- a/next_docs/zh_cn/user_guide/quick_start/to_markdown.rst
+++ b/next_docs/zh_cn/user_guide/quick_start/to_markdown.rst
+转换为 Markdown 文件
+========================
+.. code:: python
+    import os
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
+    from magic_pdf.pipe.OCRPipe import OCRPipe
+    ## args
+    model_list = []
+    pdf_file_name = "abc.pdf"  # replace with the real pdf path
+    ## prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    os.makedirs(local_image_dir, exist_ok=True)
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    ) # create 00
+    image_dir = str(os.path.basename(local_image_dir))
+    reader1 = FileBasedDataReader("")
+    pdf_bytes = reader1.read(pdf_file_name)   # read the pdf content
+    pipe = OCRPipe(pdf_bytes, model_list, image_writer)
+    pipe.pipe_classify()
+    pipe.pipe_analyze()
+    pipe.pipe_parse()
+    pdf_info = pipe.pdf_mid_data["pdf_info"]
+    md_content = pipe.pipe_mk_markdown(
+        image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
+    )
+    if isinstance(md_content, list):
+        md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
+    else:
+        md_writer.write_string(f"{pdf_file_name}.md", md_content)
+前去 :doc:`../data/data_reader_writer` 获取更多有关 **读写** 示例
--- a/next_docs/zh_cn/user_guide/tutorial.rst
+++ b/next_docs/zh_cn/user_guide/tutorial.rst
+教程
+===========
+让我们通过构建一个最小项目来学习 MinerU 
+.. toctree::
+    :maxdepth: 1
+    :caption: 教程
+    tutorial/output_file_description
--- a/next_docs/zh_cn/user_guide/tutorial/output_file_description.rst
+++ b/next_docs/zh_cn/user_guide/tutorial/output_file_description.rst
+输出文件格式介绍
+===============
+``magic-pdf`` 命令执行后除了输出和 markdown
+有关的文件以外，还会生成若干个和 markdown
+无关的文件。现在将一一介绍这些文件
+some_pdf_layout.pdf
+~~~~~~~~~~~~~~~~~~~
+每一页的 layout 均由一个或多个框组成。
+每个框左上角的数字表明它们的序号。此外 layout.pdf
+框内用不同的背景色块圈定不同的内容块。
+.. figure:: ../../_static/image/layout_example.png
+   :alt: layout 页面示例
+   layout 页面示例
+some_pdf_spans.pdf
+~~~~~~~~~~~~~~~~~~
+根据 span 类型的不同，采用不同颜色线框绘制页面上所有
+span。该文件可以用于质检，可以快速排查出文本丢失、行间公式未识别等问题。
+.. figure:: ../../_static/image/spans_example.png
+   :alt: span 页面示例
+   span 页面示例
+some_pdf_model.json
+~~~~~~~~~~~~~~~~~~~
+结构定义
+^^^^^^^^
+.. code:: python
+   from pydantic import BaseModel, Field
+   from enum import IntEnum
+   class CategoryType(IntEnum):
+        title = 0               # 标题
+        plain_text = 1          # 文本
+        abandon = 2             # 包括页眉页脚页码和页面注释
+        figure = 3              # 图片
+        figure_caption = 4      # 图片描述
+        table = 5               # 表格
+        table_caption = 6       # 表格描述
+        table_footnote = 7      # 表格注释
+        isolate_formula = 8     # 行间公式
+        formula_caption = 9     # 行间公式的标号
+        embedding = 13          # 行内公式
+        isolated = 14           # 行间公式
+        text = 15               # ocr 识别结果
+   class PageInfo(BaseModel):
+       page_no: int = Field(description="页码序号，第一页的序号是 0", ge=0)
+       height: int = Field(description="页面高度", gt=0)
+       width: int = Field(description="页面宽度", ge=0)
+   class ObjectInferenceResult(BaseModel):
+       category_id: CategoryType = Field(description="类别", ge=0)
+       poly: list[float] = Field(description="四边形坐标, 分别是 左上，右上，右下，左下 四点的坐标")
+       score: float = Field(description="推理结果的置信度")
+       latex: str | None = Field(description="latex 解析结果", default=None)
+       html: str | None = Field(description="html 解析结果", default=None)
+   class PageInferenceResults(BaseModel):
+        layout_dets: list[ObjectInferenceResult] = Field(description="页面识别结果", ge=0)
+        page_info: PageInfo = Field(description="页面元信息")
+   # 所有页面的推理结果按照页码顺序依次放到列表中即为 minerU 推理结果
+   inference_result: list[PageInferenceResults] = []
+poly 坐标的格式 [x0, y0, x1, y1, x2, y2, x3, y3],
+分别表示左上、右上、右下、左下四点的坐标 |poly 坐标示意图|
+示例数据
+^^^^^^^^
+.. code:: json
+   [
+       {
+           "layout_dets": [
+               {
+                   "category_id": 2,
+                   "poly": [
+                       99.1906967163086,
+                       100.3119125366211,
+                       730.3707885742188,
+                       100.3119125366211,
+                       730.3707885742188,
+                       245.81326293945312,
+                       99.1906967163086,
+                       245.81326293945312
+                   ],
+                   "score": 0.9999997615814209
+               }
+           ],
+           "page_info": {
+               "page_no": 0,
+               "height": 2339,
+               "width": 1654
+           }
+       },
+       {
+           "layout_dets": [
+               {
+                   "category_id": 5,
+                   "poly": [
+                       99.13092803955078,
+                       2210.680419921875,
+                       497.3183898925781,
+                       2210.680419921875,
+                       497.3183898925781,
+                       2264.78076171875,
+                       99.13092803955078,
+                       2264.78076171875
+                   ],
+                   "score": 0.9999997019767761
+               }
+           ],
+           "page_info": {
+               "page_no": 1,
+               "height": 2339,
+               "width": 1654
+           }
+       }
+   ]
+some_pdf_middle.json
+~~~~~~~~~~~~~~~~~~~~
+-----------+----------------------------------------------------------+
+| 字段名    | 解释                                                     |
+===========+==========================================================+
+| pdf_info  | list，每个                                               |
+|           | 元素都是一个dict,这个dict是每一页pdf的解析结果，详见下表 |
+-----------+----------------------------------------------------------+
+| \_p       | ocr \| txt，用来标识本次解析的中间态使用的模式           |
+| arse_type |                                                          |
+-----------+----------------------------------------------------------+
+| \_ver     | string, 表示本次解析使用的 magic-pdf 的版本号            |
+| sion_name |                                                          |
+-----------+----------------------------------------------------------+
+**pdf_info** 字段结构说明
+--------------+-------------------------------------------------------+
+| 字段名       | 解释                                                  |
+==============+=======================================================+
+| pr           | pdf预处理后，未分段的中间结果                         |
+| eproc_blocks |                                                       |
+--------------+-------------------------------------------------------+
+| l            | 布局分割的结果，                                      |
+| ayout_bboxes | 含有布局的方向（垂直、水平），和bbox，按阅读顺序排序  |
+--------------+-------------------------------------------------------+
+| page_idx     | 页码，从0开始                                         |
+--------------+-------------------------------------------------------+
+| page_size    | 页面的宽度和高度                                      |
+--------------+-------------------------------------------------------+
+| \            | 布局树状结构                                          |
+| _layout_tree |                                                       |
+--------------+-------------------------------------------------------+
+| images       | list，每个元素是一个dict，每个dict表示一个img_block   |
+--------------+-------------------------------------------------------+
+| tables       | list，每个元素是一个dict，每个dict表示一个table_block |
+--------------+-------------------------------------------------------+
+| interli      | list，每个元素                                        |
+| ne_equations | 是一个dict，每个dict表示一个interline_equation_block  |
+--------------+-------------------------------------------------------+
+| disc         | List, 模型返回的需要drop的block信息                   |
+| arded_blocks |                                                       |
+--------------+-------------------------------------------------------+
+| para_blocks  | 将preproc_blocks进行分段之后的结果                    |
+--------------+-------------------------------------------------------+
+上表中 ``para_blocks``
+是个dict的数组，每个dict是一个block结构，block最多支持一次嵌套
+**block**
+外层block被称为一级block，一级block中的字段包括
+====== ===============================================
+字段名 解释
+====== ===============================================
+type   block类型（table|image）
+bbox   block矩形框坐标
+blocks list，里面的每个元素都是一个dict格式的二级block
+====== ===============================================
+一级block只有“table”和“image”两种类型，其余block均为二级block
+二级block中的字段包括
+-----+----------------------------------------------------------------+
+| 字  | 解释                                                           |
+| 段  |                                                                |
+| 名  |                                                                |
+=====+================================================================+
+| t   | block类型                                                      |
+| ype |                                                                |
+-----+----------------------------------------------------------------+
+| b   | block矩形框坐标                                                |
+| box |                                                                |
+-----+----------------------------------------------------------------+
+| li  | list，每个元素都是一个dict表示的line，用来描述一行信息的构成   |
+| nes |                                                                |
+-----+----------------------------------------------------------------+
+二级block的类型详解
+================== ==============
+type               desc
+================== ==============
+image_body         图像的本体
+image_caption      图像的描述文本
+image_footnote     图像的脚注
+table_body         表格本体
+table_caption      表格的描述文本
+table_footnote     表格的脚注
+text               文本块
+title              标题块
+index              目录块
+list               列表块
+interline_equation 行间公式块
+================== ==============
+**line**
+line 的字段格式如下
+----+-----------------------------------------------------------------+
+| 字 | 解释                                                            |
+| 段 |                                                                 |
+| 名 |                                                                 |
+====+=================================================================+
+| bb | line的矩形框坐标                                                |
+| ox |                                                                 |
+----+-----------------------------------------------------------------+
+| s  | list，                                                          |
+| pa | 每个元素都是一个dict表示的span，用来描述一个最小组成单元的构成  |
+| ns |                                                                 |
+----+-----------------------------------------------------------------+
+**span**
+------------+---------------------------------------------------------+
+| 字段名     | 解释                                                    |
+============+=========================================================+
+| bbox       | span的矩形框坐标                                        |
+------------+---------------------------------------------------------+
+| type       | span的类型                                              |
+------------+---------------------------------------------------------+
+| content \| | 文本类型的span使用content，图表类使用img_path           |
+| img_path   | 用来存储实际的文本或者截图路径信息                      |
+------------+---------------------------------------------------------+
+span 的类型有如下几种
+================== ========
+type               desc
+================== ========
+image              图片
+table              表格
+text               文本
+inline_equation    行内公式
+interline_equation 行间公式
+================== ========
+**总结**
+span是所有元素的最小存储单元
+para_blocks内存储的元素为区块信息
+区块结构为
+一级block(如有)->二级block->line->span
+.. _示例数据-1:
+示例数据
+^^^^^^^^
+.. code:: json
+   {
+       "pdf_info": [
+           {
+               "preproc_blocks": [
+                   {
+                       "type": "text",
+                       "bbox": [
+                           52,
+                           61.956024169921875,
+                           294,
+                           82.99800872802734
+                       ],
+                       "lines": [
+                           {
+                               "bbox": [
+                                   52,
+                                   61.956024169921875,
+                                   294,
+                                   72.0000228881836
+                               ],
+                               "spans": [
+                                   {
+                                       "bbox": [
+                                           54.0,
+                                           61.956024169921875,
+                                           296.2261657714844,
+                                           72.0000228881836
+                                       ],
+                                       "content": "dependent on the service headway and the reliability of the departure ",
+                                       "type": "text",
+                                       "score": 1.0
+                                   }
+                               ]
+                           }
+                       ]
+                   }
+               ],
+               "layout_bboxes": [
+                   {
+                       "layout_bbox": [
+                           52,
+                           61,
+                           294,
+                           731
+                       ],
+                       "layout_label": "V",
+                       "sub_layout": []
+                   }
+               ],
+               "page_idx": 0,
+               "page_size": [
+                   612.0,
+                   792.0
+               ],
+               "_layout_tree": [],
+               "images": [],
+               "tables": [],
+               "interline_equations": [],
+               "discarded_blocks": [],
+               "para_blocks": [
+                   {
+                       "type": "text",
+                       "bbox": [
+                           52,
+                           61.956024169921875,
+                           294,
+                           82.99800872802734
+                       ],
+                       "lines": [
+                           {
+                               "bbox": [
+                                   52,
+                                   61.956024169921875,
+                                   294,
+                                   72.0000228881836
+                               ],
+                               "spans": [
+                                   {
+                                       "bbox": [
+                                           54.0,
+                                           61.956024169921875,
+                                           296.2261657714844,
+                                           72.0000228881836
+                                       ],
+                                       "content": "dependent on the service headway and the reliability of the departure ",
+                                       "type": "text",
+                                       "score": 1.0
+                                   }
+                               ]
+                           }
+                       ]
+                   }
+               ]
+           }
+       ],
+       "_parse_type": "txt",
+       "_version_name": "0.6.1"
+   }
+.. |poly 坐标示意图| image:: ../../_static/image/poly.png
--- a/projects/gradio_app/app.py
+++ b/projects/gradio_app/app.py
@@ -191,7 +191,7 @@ if __name__ == "__main__":
        gr.HTML(header)
        with gr.Row():
            with gr.Column(variant='panel', scale=5):
-                file = gr.File(label="Please upload a PDF or image", file_types=[".pdf", ".png", ".jpeg", "jpg"])
+                file = gr.File(label="Please upload a PDF or image", file_types=[".pdf", ".png", ".jpeg", ".jpg"])
                max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
                with gr.Row():
                    layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="layoutlmv3")

--- a/requirements-docker.txt
+++ b/requirements-docker.txt
@@ -12,7 +12,11 @@ matplotlib
 ultralytics
 paddleocr==2.7.3
 paddlepaddle==3.0.0b1
-pypandoc
+struct-eqtable==0.3.2
-struct-eqtable==0.1.0
+einops
+accelerate
+doclayout_yolo==0.0.2
+rapidocr-paddle
+rapid_table
 doclayout-yolo==0.0.2
 detectron2
--- a/setup.py
+++ b/setup.py
@@ -47,6 +47,9 @@ if __name__ == '__main__':
                     "einops",  # struct-eqtable依赖
                     "accelerate",  # struct-eqtable依赖
                     "doclayout_yolo==0.0.2",  # doclayout_yolo
+                     "rapidocr-paddle",  # rapidocr-paddle
+                     "rapid_table",  # rapid_table
+                     "PyYAML",  # yaml
                     "detectron2"
                     ],
        },