Unverified commit fa113b57, authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #1178 from icecraft/refactor/add_user_api

Refactor/add user api
parents 1c10dc55 e4ed6023
@@ -114,7 +114,7 @@ autodoc_mock_imports = [
'sentencepiece',
'vllm.cuda_utils',
'vllm._C',
-'numpy',
+# 'numpy',
'tqdm',
]
@@ -12,17 +12,17 @@ Local File Example
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-from magic_pdf.config.make_content_config import DropMode, MakeMode
-from magic_pdf.pipe.OCRPipe import OCRPipe
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-## args
-model_list = []
+# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
name_without_suff = pdf_file_name.split(".")[0]
-## prepare env
+# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
@@ -30,27 +30,31 @@ Local File Example
)
image_dir = str(os.path.basename(local_image_dir))
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
# proc
+## Create Dataset Instance
+ds = PymuDocDataset(pdf_bytes)
-pipe = OCRPipe(pdf_bytes, model_list, image_writer)
+## inference
+infer_result = ds.apply(doc_analyze, ocr=True)
-pipe.pipe_classify()
-pipe.pipe_analyze()
-pipe.pipe_parse()
+### draw model result on each page
+infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
-pdf_info = pipe.pdf_mid_data["pdf_info"]
+## pipeline
+pipe_result = infer_result.pipe_ocr_mode(image_writer)
+### draw layout result on each page
+pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
-md_content = pipe.pipe_mk_markdown(
-    image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
-)
+### draw spans result on each page
+pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
-if isinstance(md_content, list):
-    md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
-else:
-    md_writer.write_string(f"{pdf_file_name}.md", md_content)
+### dump markdown
+pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
S3 File Example
@@ -61,8 +65,8 @@ S3 File Example
import os
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
-from magic_pdf.config.make_content_config import DropMode, MakeMode
-from magic_pdf.pipe.OCRPipe import OCRPipe
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
ak = "{Your S3 access key}" # replace with real s3 access key
@@ -74,29 +78,39 @@ S3 File Example
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
-## args
-model_list = []
-pdf_file_name = f"s3://{bucket_name}/{fake pdf path}"  # replace with the real s3 path
+# args
+pdf_file_name = (
+    "s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf"  # replace with the real s3 path
+)
# prepare env
local_dir = "output"
name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
# read bytes
pdf_bytes = reader.read(pdf_file_name) # read the pdf content
# proc
+## Create Dataset Instance
+ds = PymuDocDataset(pdf_bytes)
-pipe = OCRPipe(pdf_bytes, model_list, image_writer)
+## inference
+infer_result = ds.apply(doc_analyze, ocr=True)
-pipe.pipe_classify()
-pipe.pipe_analyze()
-pipe.pipe_parse()
+### draw model result on each page
+infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf'))  # dump to local
-pdf_info = pipe.pdf_mid_data["pdf_info"]
+## pipeline
+pipe_result = infer_result.pipe_ocr_mode(image_writer)
-md_content = pipe.pipe_mk_markdown(
-    "unittest/tmp/images", drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
-)
+### draw layout result on each page
+pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf'))  # dump to local
+### draw spans result on each page
+pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf'))  # dump to local
-if isinstance(md_content, list):
-    writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
-else:
-    writer.write_string(f"{pdf_file_name}.md", md_content)
+### dump markdown
+pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")  # dump to remote s3
-Check :doc:`../data/data_reader_writer` for more [reader | writer] examples
+Check :doc:`../data/data_reader_writer` for more [reader | writer] examples and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details
@@ -7,4 +7,6 @@ From the beginning to the end, Show how to using mineru via a minimal project
.. toctree::
:maxdepth: 1
-tutorial/output_file_description
\ No newline at end of file
+tutorial/output_file_description
+tutorial/pipeline
Pipeline
==========
Minimal Example
^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
name_without_suff = pdf_file_name.split(".")[0]
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
image_dir = str(os.path.basename(local_image_dir))
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
Running the above code produces the following output
.. code:: bash
output/
├── abc.md
└── images
Setting aside environment setup (creating directories and importing dependencies), the code that actually converts the pdf to markdown is as follows
.. code:: python
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
``ds.apply(doc_analyze, ocr=True)`` returns an ``InferenceResult`` object. Calling ``pipe_ocr_mode`` on the ``InferenceResult`` produces a ``PipeResult`` object.
The ``PipeResult`` object, upon executing ``dump_md``, writes a ``markdown`` file to the specified location.
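The same chain can also be unrolled into its intermediate objects. A minimal sketch, assuming the ``ds``, ``image_writer``, ``md_writer``, ``name_without_suff``, and ``image_dir`` variables defined in the example above:

.. code:: python

   ## inference: Dataset -> InferenceResult
   infer_result = ds.apply(doc_analyze, ocr=True)
   ## pipeline: InferenceResult -> PipeResult
   pipe_result = infer_result.pipe_ocr_mode(image_writer)
   ## dump markdown: PipeResult -> markdown file on disk
   pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)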
The pipeline execution process is illustrated in the following diagram
.. image:: ../../_static/image/pipeline.drawio.svg
.. raw:: html
<br> </br>
Currently, the process is divided into three stages: data, inference, and processing, which correspond to the ``Dataset``, ``InferenceResult``, and ``PipeResult`` entities in the diagram.
These stages are linked together through calls such as ``apply(doc_analyze, ...)`` or ``pipe_ocr_mode``.
.. admonition:: Tip
:class: tip
For more examples on how to use ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../quick_start/to_markdown`
For more detailed information about ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../../api/dataset`, :doc:`../../api/model_operators`, :doc:`../../api/pipe_operators`
Pipeline Composition
^^^^^^^^^^^^^^^^^^^^^
.. code:: python
class Dataset(ABC):
@abstractmethod
def apply(self, proc: Callable, *args, **kwargs):
"""Apply callable method which.
Args:
proc (Callable): invoke proc as follows:
proc(self, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
pass
class InferenceResult(InferenceResultBase):
def apply(self, proc: Callable, *args, **kwargs):
"""Apply callable method which.
Args:
proc (Callable): invoke proc as follows:
proc(inference_result, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
def pipe_ocr_mode(
self,
imageWriter: DataWriter,
start_page_id=0,
end_page_id=None,
debug_mode=False,
lang=None,
) -> PipeResult:
pass
class PipeResult:
def apply(self, proc: Callable, *args, **kwargs):
"""Apply callable method which.
Args:
proc (Callable): invoke proc as follows:
proc(pipeline_result, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
The ``Dataset``, ``InferenceResult``, and ``PipeResult`` classes all have an ``apply`` method, which can be used to chain different stages of the computation.
As shown below, ``MinerU`` provides a set of methods to compose these classes.
.. code:: python
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
Users can implement their own functions for chaining as needed. For example, a user could use the ``apply`` method to create a function that counts the number of pages in a ``pdf`` file.
.. code:: python
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
def count_page(ds) -> int:
return len(ds)
print("page number: ", ds.apply(count_page)) # will output the page count of `abc.pdf`
+numpy==1.26.4
+click==8.1.7
+fast-langdetect==0.2.2
+Brotli==1.1.0
boto3>=1.28.43
loguru>=0.6.0
myst-parser
@@ -9,4 +13,4 @@ sphinx-argparse>=0.5.2
sphinx-book-theme>=1.1.3
sphinx-copybutton>=0.5.2
sphinx_rtd_theme>=3.0.1
-autodoc_pydantic>=2.2.0
\ No newline at end of file
+autodoc_pydantic>=2.2.0
Convert To Markdown File
========================
Local File Example
-^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-from magic_pdf.config.make_content_config import DropMode, MakeMode
-from magic_pdf.pipe.OCRPipe import OCRPipe
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-## args
-model_list = []
+# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
name_without_suff = pdf_file_name.split(".")[0]
-## prepare env
+# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
@@ -30,39 +28,43 @@
)
image_dir = str(os.path.basename(local_image_dir))
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
# proc
+## Create Dataset Instance
+ds = PymuDocDataset(pdf_bytes)
-pipe = OCRPipe(pdf_bytes, model_list, image_writer)
+## inference
+infer_result = ds.apply(doc_analyze, ocr=True)
-pipe.pipe_classify()
-pipe.pipe_analyze()
-pipe.pipe_parse()
+### draw model result on each page
+infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
-pdf_info = pipe.pdf_mid_data["pdf_info"]
+## pipeline
+pipe_result = infer_result.pipe_ocr_mode(image_writer)
+### draw layout result on each page
+pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
-md_content = pipe.pipe_mk_markdown(
-    image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
-)
+### draw spans result on each page
+pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
-if isinstance(md_content, list):
-    md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
-else:
-    md_writer.write_string(f"{pdf_file_name}.md", md_content)
+### dump markdown
+pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
-Object Storage Usage Example
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Object Storage File Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
-from magic_pdf.config.make_content_config import DropMode, MakeMode
-from magic_pdf.pipe.OCRPipe import OCRPipe
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
ak = "{Your S3 access key}" # replace with real s3 access key
@@ -74,30 +76,39 @@
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
-## args
-model_list = []
-pdf_file_name = f"s3://{bucket_name}/{fake pdf path}"  # replace with the real s3 path
+# args
+pdf_file_name = (
+    "s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf"  # replace with the real s3 path
+)
# prepare env
local_dir = "output"
name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
# read bytes
pdf_bytes = reader.read(pdf_file_name) # read the pdf content
# proc
+## Create Dataset Instance
+ds = PymuDocDataset(pdf_bytes)
-pipe = OCRPipe(pdf_bytes, model_list, image_writer)
+## inference
+infer_result = ds.apply(doc_analyze, ocr=True)
-pipe.pipe_classify()
-pipe.pipe_analyze()
-pipe.pipe_parse()
+### draw model result on each page
+infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf'))  # dump to local
-pdf_info = pipe.pdf_mid_data["pdf_info"]
+## pipeline
+pipe_result = infer_result.pipe_ocr_mode(image_writer)
-md_content = pipe.pipe_mk_markdown(
-    "unittest/tmp/images", drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
-)
+### draw layout result on each page
+pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf'))  # dump to local
-if isinstance(md_content, list):
-    writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
-else:
-    writer.write_string(f"{pdf_file_name}.md", md_content)
+### draw spans result on each page
+pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf'))  # dump to local
+### dump markdown
+pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")  # dump to remote s3
See :doc:`../data/data_reader_writer` for more **read / write** examples
@@ -9,3 +9,5 @@
:caption: Tutorial
tutorial/output_file_description
+tutorial/pipeline
@@ -137,49 +137,45 @@ the format of poly coordinates: [x0, y0, x1, y1, x2, y2, x3, y3],
some_pdf_middle.json
~~~~~~~~~~~~~~~~~~~~
+----------------+------------------------------------------------------------+
| Field          | Description                                                |
+================+============================================================+
| pdf_info       | list; each element is a dict holding the parse result of  |
|                | one pdf page, see the table below                          |
+----------------+------------------------------------------------------------+
| \_parse_type   | ocr \| txt, marks the mode used for this parse             |
+----------------+------------------------------------------------------------+
| \_version_name | string, the magic-pdf version used for this parse          |
+----------------+------------------------------------------------------------+
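For orientation, a skeleton of ``some_pdf_middle.json`` restricted to the fields above might look like the following sketch (written as a Python literal; every value is a placeholder, not real output):

.. code:: python

   some_pdf_middle = {
       "pdf_info": [
           {},  # parse result of page 0, see the next table
       ],
       "_parse_type": "ocr",      # or "txt"
       "_version_name": "x.y.z",  # placeholder version string
   }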
**pdf_info** field structure
+---------------------+----------------------------------------------------------+
| Field               | Description                                              |
+=====================+==========================================================+
| preproc_blocks      | intermediate result after pdf preprocessing, not yet     |
|                     | segmented into paragraphs                                |
+---------------------+----------------------------------------------------------+
| layout_bboxes       | layout segmentation result, with the layout direction    |
|                     | (vertical, horizontal) and bbox, sorted in reading order |
+---------------------+----------------------------------------------------------+
| page_idx            | page index, starting from 0                              |
+---------------------+----------------------------------------------------------+
| page_size           | page width and height                                    |
+---------------------+----------------------------------------------------------+
| \_layout_tree       | layout tree structure                                    |
+---------------------+----------------------------------------------------------+
| images              | list; each element is a dict representing an img_block   |
+---------------------+----------------------------------------------------------+
| tables              | list; each element is a dict representing a table_block  |
+---------------------+----------------------------------------------------------+
| interline_equations | list; each element is a dict representing an             |
|                     | interline_equation_block                                 |
+---------------------+----------------------------------------------------------+
| discarded_blocks    | list; block info the model marked to be dropped          |
+---------------------+----------------------------------------------------------+
| para_blocks         | the result of segmenting preproc_blocks into paragraphs  |
+---------------------+----------------------------------------------------------+
In the table above, ``para_blocks`` is an array of dicts; each dict is a block structure. A block supports at most one level of nesting.
@@ -200,20 +196,18 @@ blocks: a list whose elements are dict-format second-level blocks
The fields of a second-level block include
+----------+----------------------------------------------------------------+
| Field    | Description                                                    |
+==========+================================================================+
| type     | block type                                                     |
+----------+----------------------------------------------------------------+
| bbox     | block rectangle coordinates                                    |
+----------+----------------------------------------------------------------+
| lines    | list; each element is a dict representing a line, which        |
|          | describes one line of content                                  |
+----------+----------------------------------------------------------------+
Second-level block types in detail
@@ -237,22 +231,21 @@ interline_equation: interline equation block
The fields of a line are as follows
+-----------+-----------------------------------------------------------------+
| Field     | Description                                                     |
+===========+=================================================================+
| bbox      | rectangle coordinates of the line                               |
+-----------+-----------------------------------------------------------------+
| spans     | list; each element is a dict representing a span, the minimal   |
|           | unit of composition                                             |
+-----------+-----------------------------------------------------------------+
**span**
+------------+---------------------------------------------------------+
| Field      | Description                                             |
+============+=========================================================+
| bbox       | rectangle coordinates of the span                       |
+------------+---------------------------------------------------------+
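Putting the block, line, and span tables together, one second-level block nests as in the sketch below (a hypothetical illustration; the block type and all bbox values are placeholders):

.. code:: python

   block = {                        # second-level block
       "type": "text",              # block type (placeholder)
       "bbox": [52, 61, 294, 226],  # placeholder coordinates
       "lines": [
           {                        # line: bbox + spans
               "bbox": [52, 61, 294, 75],
               "spans": [
                   {"bbox": [54, 61, 290, 75]},  # span, plus further span fields
               ],
           },
       ],
   }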
Pipeline
==========
Minimal Example
^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
name_without_suff = pdf_file_name.split(".")[0]
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
image_dir = str(os.path.basename(local_image_dir))
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
Running the above code produces the following output
.. code:: bash
output/
├── abc.md
└── images
Setting aside environment setup (creating directories and importing dependencies), the code that actually converts the ``pdf`` to ``markdown`` is as follows
.. code:: python
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
``ds.apply(doc_analyze, ocr=True)`` returns an ``InferenceResult`` object. Calling ``pipe_ocr_mode`` on the ``InferenceResult`` produces a ``PipeResult`` object.
The ``PipeResult`` object, upon executing ``dump_md``, writes a ``markdown`` file to the specified location.
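As in the English tutorial, the chain can be unrolled into its intermediate objects. A minimal sketch, assuming the variables from the example above:

.. code:: python

   infer_result = ds.apply(doc_analyze, ocr=True)           # Dataset -> InferenceResult
   pipe_result = infer_result.pipe_ocr_mode(image_writer)   # InferenceResult -> PipeResult
   pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)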
The pipeline execution process is illustrated in the following diagram
.. image:: ../../_static/image/pipeline.drawio.svg
.. raw:: html
<br> </br>
Currently, the process is divided into three stages: data, inference, and processing, corresponding to the ``Dataset``, ``InferenceResult``, and ``PipeResult`` entities in the diagram. These stages are linked together through calls such as ``apply(doc_analyze, ...)`` or ``pipe_ocr_mode``.
.. admonition:: Tip
:class: tip
For more examples of how to use ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../quick_start/to_markdown`
For more detailed information about ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to the English MinerU documentation!
Pipeline Composition
^^^^^^^^^^^^^^^^^^^^^
.. code:: python
class Dataset(ABC):
@abstractmethod
def apply(self, proc: Callable, *args, **kwargs):
"""Apply callable method which.
Args:
proc (Callable): invoke proc as follows:
proc(self, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
pass
class InferenceResult(InferenceResultBase):
def apply(self, proc: Callable, *args, **kwargs):
"""Apply callable method which.
Args:
proc (Callable): invoke proc as follows:
proc(inference_result, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
def pipe_ocr_mode(
self,
imageWriter: DataWriter,
start_page_id=0,
end_page_id=None,
debug_mode=False,
lang=None,
) -> PipeResult:
pass
class PipeResult:
def apply(self, proc: Callable, *args, **kwargs):
"""Apply callable method which.
Args:
proc (Callable): invoke proc as follows:
proc(pipeline_result, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
The ``Dataset``, ``InferenceResult``, and ``PipeResult`` classes all have an ``apply`` method, which can be used to chain different stages of the computation.
As shown below, ``MinerU`` provides a set of methods for composing these classes.
.. code:: python
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
Users can implement their own composition functions as needed. For example, a user can use the ``apply`` method to implement a function that counts the number of pages in a ``pdf`` file.
.. code:: python
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
def count_page(ds) -> int:
return len(ds)
print("page number: ", ds.apply(count_page)) # will output the page count of `abc.pdf`