Merge pull request #1261 from opendatalab/release-0.10.6

Release 0.10.6

Merge pull request #1261 from opendatalab/release-0.10.6
Release 0.10.6
b4f7b53e · Xiaomeng Zhao · GitHub · a962824b · d3b51aa5 · b4f7b53e
Unverified Commit b4f7b53e authored Dec 11, 2024 by Xiaomeng Zhao Committed by GitHub Dec 11, 2024
6 changed files
--- a/next_docs/zh_cn/user_guide/tutorial/output_file_description.rst
+++ b/next_docs/zh_cn/user_guide/tutorial/output_file_description.rst
@@ -137,49 +137,45 @@ poly 坐标的格式 [x0, y0, x1, y1, x2, y2, x3, y3],
 some_pdf_middle.json
 ~~~~~~~~~~~~~~~~~~~~

-+-----------+----------------------------------------------------------+
-| 字段名    | 解释                                                     |
-+===========+==========================================================+
-| pdf_info  | list，每个                                               |
-|           | 元素都是一个dict,这个dict是每一页pdf的解析结果，详见下表 |
-+-----------+----------------------------------------------------------+
-|              | ocr \| txt，用来标识本次解析的中间态使用的模式           |
-| \_parse_type |                                                          |
-+-----------+----------------------------------------------------------+
-|                | string, 表示本次解析使用的 magic-pdf 的版本号            |
-| \_version_name |                                                          |
-+-----------+----------------------------------------------------------+
+--------------------+----------------------------------------------------------+
+| 字段名              | 解释                                                    |
+====================+==========================================================+
+| pdf_info           | list，每个元素都是一个                                   |
+|                    | dict，这个dict是每一页pdf的解析结果，详见下表            |
+--------------------+----------------------------------------------------------+
+| \_parse_type       | ocr \| txt，用来标识本次解析的中间态使用的模式           |
+--------------------+----------------------------------------------------------+
+| \_version_name     | string，表示本次解析使用的 magic-pdf 的版本号            |
+--------------------+----------------------------------------------------------+

 **pdf_info** 字段结构说明

-+--------------+-------------------------------------------------------+
-| 字段名       | 解释                                                  |
-+==============+=======================================================+
-|                 | pdf预处理后，未分段的中间结果                         |
-| preeproc_blocks |                                                       |
-+--------------+-------------------------------------------------------+
-|               | 布局分割的结果，                                      |
-| layout_bboxes | 含有布局的方向（垂直、水平），和bbox，按阅读顺序排序  |
-+--------------+-------------------------------------------------------+
-| page_idx     | 页码，从0开始                                         |
-+--------------+-------------------------------------------------------+
-| page_size    | 页面的宽度和高度                                      |
-+--------------+-------------------------------------------------------+
-| \            | 布局树状结构                                          |
-| _layout_tree |                                                       |
-+--------------+-------------------------------------------------------+
-| images       | list，每个元素是一个dict，每个dict表示一个img_block   |
-+--------------+-------------------------------------------------------+
-| tables       | list，每个元素是一个dict，每个dict表示一个table_block |
-+--------------+-------------------------------------------------------+
-|                     | list，每个元素                                        |
-| interline_equations | 是一个dict，每个dict表示一个interline_equation_block  |
-+--------------+-------------------------------------------------------+
-|                  | List, 模型返回的需要drop的block信息                   |
-| discarded_blocks |                                                       |
-+--------------+-------------------------------------------------------+
-| para_blocks  | 将preproc_blocks进行分段之后的结果                    |
-+--------------+-------------------------------------------------------+
+---------------------+-------------------------------------------------------+
+| 字段名               | 解释                                                 |
+=====================+=======================================================+
+| preproc_blocks      | pdf预处理后，未分段的中间结果                         |
+---------------------+-------------------------------------------------------+
+|                     | 布局分割的结果，                                      |
+| layout_bboxes       | 含有布局的方向（垂直、水平），和bbox，按阅读顺序排序  |
+---------------------+-------------------------------------------------------+
+| page_idx            | 页码，从0开始                                         |
+---------------------+-------------------------------------------------------+
+| page_size           | 页面的宽度和高度                                      |
+---------------------+-------------------------------------------------------+
+| \_layout_tree       | 布局树状结构                                          |
+---------------------+-------------------------------------------------------+
+| images              | list，每个元素是一个dict，每个dict表示一个img_block   |
+---------------------+-------------------------------------------------------+
+| tables              | list，每个元素是一个dict，每个dict表示一个table_block |
+---------------------+-------------------------------------------------------+
+|                     | list，每个元素是一个                                  |
+| interline_equations | dict，每个dict表示一个interline_equation_block        |
+---------------------+-------------------------------------------------------+
+|                     | List, 模型返回的需要drop的block信息                   |
+| discarded_blocks    |                                                       |
+---------------------+-------------------------------------------------------+
+| para_blocks         | 将preproc_blocks进行分段之后的结果                    |
+---------------------+-------------------------------------------------------+

 上表中 ``para_blocks``
 是个dict的数组，每个dict是一个block结构，block最多支持一次嵌套
@@ -200,20 +196,18 @@ blocks list，里面的每个元素都是一个dict格式的二级block

 二级block中的字段包括

-+-----+----------------------------------------------------------------+
-| 字  | 解释                                                           |
-| 段  |                                                                |
-| 名  |                                                                |
-+=====+================================================================+
-|      | block类型                                                      |
-| type |                                                                |
-+-----+----------------------------------------------------------------+
-|      | block矩形框坐标                                                |
-| bbox |                                                                |
-+-----+----------------------------------------------------------------+
-|       | list，每个元素都是一个dict表示的line，用来描述一行信息的构成   |
-| lines |                                                                |
-+-----+----------------------------------------------------------------+
+----------+----------------------------------------------------------------+
+| 字       | 解释                                                           |
+| 段       |                                                                |
+| 名       |                                                                |
+==========+================================================================+
+|          | block类型                                                      |
+| type     |                                                                |
+----------+----------------------------------------------------------------+
+| bbox     | block矩形框坐标                                                |
+----------+----------------------------------------------------------------+
+| lines    | list，每个元素都是一个dict表示的line，用来描述一行信息的构成   |
+----------+----------------------------------------------------------------+

 二级block的类型详解

@@ -237,22 +231,21 @@ interline_equation 行间公式块

 line 的 字段格式如下

-+----+-----------------------------------------------------------------+
-| 字 | 解释                                                            |
-| 段 |                                                                 |
-| 名 |                                                                 |
-+====+=================================================================+
-| bbox  | line的矩形框坐标                                                |
-|       |                                                                 |
-+----+-----------------------------------------------------------------+
-| spans  | list，                                                       |
-|        | 每个元素都是一个dict表示的span，用来描述一个最小组成单元的构成  |
-+----+-----------------------------------------------------------------+
+-----------+-----------------------------------------------------------------+
+| 字        | 解释                                                            |
+| 段        |                                                                 |
+| 名        |                                                                 |
+===========+=================================================================+
+| bbox      | line的矩形框坐标                                                |
+-----------+-----------------------------------------------------------------+
+| spans     | list，                                                          |
+|           | 每个元素都是一个dict表示的span，用来描述一个最小组成单元的构成  |
+-----------+-----------------------------------------------------------------+

 **span**

 +------------+---------------------------------------------------------+
-| 字段名     | 解释                                                    |
+| 字段名      | 解释                                                   |
 +============+=========================================================+
 | bbox       | span的矩形框坐标                                        |
 +------------+---------------------------------------------------------+

--- a/next_docs/zh_cn/user_guide/tutorial/pipeline.rst
+++ b/next_docs/zh_cn/user_guide/tutorial/pipeline.rst
+
+流水线管道
+===========
+
+
+极简示例
+^^^^^^^^
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.data.dataset import PymuDocDataset
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+
+    # args
+    pdf_file_name = "abc.pdf"  # replace with the real pdf path
+    name_without_suff = pdf_file_name.split(".")[0]
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+    image_dir = str(os.path.basename(local_image_dir))
+
+    # read bytes
+    reader1 = FileBasedDataReader("")
+    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
+
+    # proc
+    ## Create Dataset Instance
+    ds = PymuDocDataset(pdf_bytes)
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
+
+
+运行以上的代码，会得到如下的结果
+
+.. code:: bash 
+
+    output/
+    ├── abc.md
+    └── images
+
+
+除去初始化环境，如建立目录、导入依赖库等逻辑。真正将 ``pdf`` 转换为 ``markdown`` 的代码片段如下
+
+.. code:: python
+
+    # read bytes
+    reader1 = FileBasedDataReader("")
+    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
+
+    # proc
+    ## Create Dataset Instance
+    ds = PymuDocDataset(pdf_bytes)
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
+
+
+``ds.apply(doc_analyze, ocr=True)`` 会生成 ``InferenceResult`` 对象。 ``InferenceResult`` 对象执行 ``pipe_ocr_mode`` 方法会生成 ``PipeResult`` 对象。
+``PipeResult`` 对象执行 ``dump_md`` 会在指定位置生成 ``markdown`` 文件。
+
+
+pipeline 的执行过程如下图所示
+
+.. image:: ../../_static/image/pipeline.drawio.svg 
+
+.. raw:: html 
+
+    <br> </br>
+
+目前划分出数据、推理、程序处理三个阶段，分别对应着图上的 ``Dataset``， ``InferenceResult``， ``PipeResult`` 这三个实体。通过 ``apply`` ， ``doc_analyze`` 或 ``pipe_ocr_mode`` 等方法链接在一起。
+
+
+.. admonition:: Tip
+    :class: tip
+
+    要想获得更多有关 Dataset、InferenceResult、PipeResult 的使用示例，请前往 :doc:`../quick_start/to_markdown`
+
+    要想获得更多有关 Dataset、InferenceResult、PipeResult 的细节信息请前往英文版 MinerU 文档进行查看!
+
+
+
+管道组合
+^^^^^^^^^
+
+.. code:: python
+
+    class Dataset(ABC):
+        @abstractmethod
+        def apply(self, proc: Callable, *args, **kwargs):
+            """Apply callable method which.
+
+            Args:
+                proc (Callable): invoke proc as follows:
+                    proc(self, *args, **kwargs)
+
+            Returns:
+                Any: return the result generated by proc
+            """
+            pass
+
+    class InferenceResult(InferenceResultBase):
+
+        def apply(self, proc: Callable, *args, **kwargs):
+            """Apply callable method which.
+
+            Args:
+                proc (Callable): invoke proc as follows:
+                    proc(inference_result, *args, **kwargs)
+
+            Returns:
+                Any: return the result generated by proc
+            """
+            return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
+
+        def pipe_ocr_mode(
+            self,
+            imageWriter: DataWriter,
+            start_page_id=0,
+            end_page_id=None,
+            debug_mode=False,
+            lang=None,
+            ) -> PipeResult:
+            pass
+
+    class PipeResult:
+        def apply(self, proc: Callable, *args, **kwargs):
+            """Apply callable method which.
+
+            Args:
+                proc (Callable): invoke proc as follows:
+                    proc(pipeline_result, *args, **kwargs)
+
+            Returns:
+                Any: return the result generated by proc
+            """
+            return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
+
+``Dataset`` 、 ``InferenceResult`` 和 ``PipeResult`` 类均有 ``apply`` method。可用于组合不同阶段的运算过程。
+如下所示，``MinerU`` 提供一套组合这些类的计算过程。
+
+.. code:: python 
+
+    # proc
+    ## Create Dataset Instance
+    ds = PymuDocDataset(pdf_bytes)
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
+
+用户可以根据自己的需求，自行实现一些组合用的函数。比如用户通过 ``apply`` 方法实现一个统计 ``pdf`` 文件页数的功能。
+
+.. code:: python 
+
+    from magic_pdf.data.data_reader_writer import  FileBasedDataReader
+    from magic_pdf.data.dataset import PymuDocDataset
+
+    # args
+    pdf_file_name = "abc.pdf"  # replace with the real pdf path
+
+    # read bytes
+    reader1 = FileBasedDataReader("")
+    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
+
+    # proc
+    ## Create Dataset Instance
+    ds = PymuDocDataset(pdf_bytes)
+
+    def count_page(ds)-> int:
+        return len(ds)
+
+    print("page number: ", ds.apply(count_page)) # will output the page count of `abc.pdf`
--- a/requirements-docker.txt
+++ b/requirements-docker.txt
@@ -7,9 +7,9 @@ numpy>=1.21.6,<2.0.0
 fast-langdetect==0.2.0
 scikit-learn>=1.0.2
 pdfminer.six==20231228
-unimernet==0.2.1
+unimernet==0.2.2
 matplotlib
-ultralytics
+ultralytics>=8.3.48
 paddleocr==2.7.3
 paddlepaddle==3.0.0b1
 struct-eqtable==0.3.2

--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ numpy>=1.21.6,<2.0.0
 pydantic>=2.7.2,<2.8.0
 PyMuPDF>=1.24.9
 scikit-learn>=1.0.2
-torch>=2.2.2,<=2.3.1
+torch>=2.2.2
 transformers
-# pdfminer.six==20231228
+pdfminer.six==20231228
 # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
--- a/setup.py
+++ b/setup.py
@@ -36,10 +36,12 @@ if __name__ == '__main__':
                     "paddlepaddle==3.0.0b1;platform_system=='Linux'",
                     "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",
                     ],
-            "full": ["unimernet==0.2.1",  # unimernet升级0.2.1
+            "full": ["unimernet==0.2.2",  # unimernet升级0.2.2,移除torchtext的依赖
+                     "torch>=2.2.2,<=2.3.1",  # torch2.4.0及之后版本未测试，先卡住版本上限
+                     "torchvision>=0.17.2,<=0.18.1",  # torchvision 受torch版本约束
                     "matplotlib<=3.9.0;platform_system=='Windows'",  # 3.9.1及之后不提供windows的预编译包，避免一些没有编译环境的windows设备安装失败
                     "matplotlib;platform_system=='Linux' or platform_system=='Darwin'",  # linux 和 macos 不应限制matplotlib的最高版本，以避免无法更新导致的一些bug
-                     "ultralytics",  # yolov8,公式检测
+                     "ultralytics>=8.3.48",  # yolov8,公式检测
                     "paddleocr==2.7.3",  # 2.8.0及2.8.1版本与detectron2有冲突，需锁定2.7.3
                     "paddlepaddle==3.0.0b1;platform_system=='Linux'",  # 解决linux的段异常问题
                     "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",  # windows版本3.0.0b1效率下降，需锁定2.6.1

--- a/tests/test_cli/test_cli_sdk.py
+++ b/tests/test_cli/test_cli_sdk.py
@@ -7,8 +7,11 @@ from lib import common
 import time
 import magic_pdf.model as model_config
 from magic_pdf.pipe.UNIPipe import UNIPipe
-from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
-from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
+import os
+from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
+from magic_pdf.config.make_content_config import DropMode, MakeMode
+from magic_pdf.pipe.OCRPipe import OCRPipe
 model_config.__use_inside_model__ = True
 pdf_res_path = conf.conf['pdf_res_path']
 code_path = conf.conf['code_path']
@@ -41,7 +44,7 @@ class TestCli:
            pdf_bytes = open(pdf_path, 'rb').read()
            local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
            image_dir = str(os.path.basename(local_image_dir))
-            image_writer = DiskReaderWriter(local_image_dir)
+            image_writer = FileBasedDataWriter(local_image_dir)
            model_json = list()
            jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
@@ -77,7 +80,7 @@ class TestCli:
            pdf_bytes = open(pdf_path, 'rb').read()
            local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
            image_dir = str(os.path.basename(local_image_dir))
-            image_writer = DiskReaderWriter(local_image_dir)
+            image_writer = FileBasedDataWriter(local_image_dir)
            model_json = list()
            jso_useful_key = {'_pdf_type': 'ocr', 'model_list': model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
@@ -112,7 +115,7 @@ class TestCli:
            pdf_bytes = open(pdf_path, 'rb').read()
            local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
            image_dir = str(os.path.basename(local_image_dir))
-            image_writer = DiskReaderWriter(local_image_dir)
+            image_writer = FileBasedDataWriter(local_image_dir)
            model_json = list()
            jso_useful_key = {'_pdf_type': 'txt', 'model_list': model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
@@ -284,12 +287,13 @@ class TestCli:
        pdf_endpoint = os.environ.get('pdf_endpoint', "")
        s3_pdf_path = conf.conf["s3_pdf_path"]
        image_dir = "s3://" + pdf_bucket + "/mineru/test/output"
-        print (image_dir)
-        s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
-        s3image_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint, parent_path=image_dir)
-        pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
-        jso_useful_key = {"_pdf_type": "", "model_list": []}
-        pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
+        prefix = "mineru/test/output"
+        reader = S3DataReader(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
+        # = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
+        image_writer = S3DataWriter(prefix, pdf_bucket, pdf_ak, pdf_sk, pdf_endpoint)
+        pdf_bytes = reader.read(s3_pdf_path)
+        model_list = []
+        pipe = OCRPipe(pdf_bytes, model_list, image_writer)
        pipe.pipe_classify()
        pipe.pipe_analyze()
        pipe.pipe_parse()
@@ -427,3 +431,4 @@ class TestCli:
 
 if __name__ == '__main__':
    pytest.main()
+