Convert to Markdown
===================

Local File Example
^^^^^^^^^^^^^^^^^^
.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.config.enums import SupportedPdfParseMethod

    # args
    pdf_file_name = "abc.pdf"  # replace with the real pdf path
    name_without_suff = pdf_file_name.split(".")[0]

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # read bytes
    reader1 = FileBasedDataReader("")
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ## inference
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)

        ## pipeline
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        infer_result = ds.apply(doc_analyze, ocr=False)

        ## pipeline
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

    ### draw layout result on each page
    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

    ### draw spans result on each page
    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))

    ### dump markdown
    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

    ### dump content list
    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)

Object Storage Example
^^^^^^^^^^^^^^^^^^^^^^
.. code:: python

    import os

    from magic_pdf.config.enums import SupportedPdfParseMethod
    from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

    bucket_name = "{Your S3 Bucket Name}"  # replace with real bucket name
    ak = "{Your S3 access key}"  # replace with real s3 access key
    sk = "{Your S3 secret key}"  # replace with real s3 secret key
    endpoint_url = "{Your S3 endpoint_url}"  # replace with real s3 endpoint_url

    reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url)  # replace `unittest/tmp` with the real s3 prefix
    writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
    image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)

    # args
    pdf_file_name = (
        "s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf"  # replace with the real s3 path
    )

    # prepare env
    local_dir = "output"
    name_without_suff = os.path.basename(pdf_file_name).split(".")[0]

    # read bytes
    pdf_bytes = reader.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ## inference
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)

        ## pipeline
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        infer_result = ds.apply(doc_analyze, ocr=False)

        ## pipeline
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf'))  # dump to local

    ### draw layout result on each page
    pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf'))  # dump to local

    ### draw spans result on each page
    pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf'))  # dump to local

    ### dump markdown
    pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")  # dump to remote s3

    ### dump content list
    pipe_result.dump_content_list(writer, f"{name_without_suff}_content_list.json", "unittest/tmp/images")  # dump to remote s3

Go to :doc:`../data/data_reader_writer` for more **read/write** examples.
Tutorial
========
Let's learn MinerU by building a minimal project.

.. toctree::
    :maxdepth: 1
    :caption: Tutorial

    tutorial/output_file_description
    tutorial/pipeline

Output File Formats
===================
Besides the markdown-related files, the ``magic-pdf`` command also
generates several files unrelated to markdown. These files are
described one by one below.
some_pdf_layout.pdf
~~~~~~~~~~~~~~~~~~~
The layout of each page consists of one or more boxes. The number at
the top-left corner of each box indicates its reading order. In
addition, layout.pdf highlights different content blocks with
different background colors.

.. figure:: ../../_static/image/layout_example.png
    :alt: layout page example

    layout page example
some_pdf_spans.pdf
~~~~~~~~~~~~~~~~~~
All spans on a page are outlined with frames of different colors
according to the span type. This file can be used for quality
inspection, making it easy to spot problems such as missing text or
unrecognized interline equations.

.. figure:: ../../_static/image/spans_example.png
    :alt: span page example

    span page example
some_pdf_model.json
~~~~~~~~~~~~~~~~~~~
Structure Definition
^^^^^^^^^^^^^^^^^^^^
.. code:: python

    from pydantic import BaseModel, Field
    from enum import IntEnum

    class CategoryType(IntEnum):
        title = 0               # title
        plain_text = 1          # text
        abandon = 2             # headers, footers, page numbers, and page annotations
        figure = 3              # image
        figure_caption = 4      # image caption
        table = 5               # table
        table_caption = 6       # table caption
        table_footnote = 7      # table footnote
        isolate_formula = 8     # interline formula
        formula_caption = 9     # interline formula label
        embedding = 13          # inline formula
        isolated = 14           # interline formula
        text = 15               # ocr recognition result

    class PageInfo(BaseModel):
        page_no: int = Field(description="page number; the first page is 0", ge=0)
        height: int = Field(description="page height", gt=0)
        width: int = Field(description="page width", ge=0)

    class ObjectInferenceResult(BaseModel):
        category_id: CategoryType = Field(description="category", ge=0)
        poly: list[float] = Field(description="quadrilateral coordinates: top-left, top-right, bottom-right, bottom-left")
        score: float = Field(description="confidence of the inference result")
        latex: str | None = Field(description="latex parsing result", default=None)
        html: str | None = Field(description="html parsing result", default=None)

    class PageInferenceResults(BaseModel):
        layout_dets: list[ObjectInferenceResult] = Field(description="page recognition results", ge=0)
        page_info: PageInfo = Field(description="page meta information")

    # The inference results of all pages, ordered by page number, form the
    # MinerU inference result.
    inference_result: list[PageInferenceResults] = []

The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
giving the top-left, top-right, bottom-right, and bottom-left corners in
order. |poly coordinate diagram|
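For example, converting a ``poly`` into an axis-aligned bounding box
takes only a few lines. This is a minimal sketch; the helper name
``poly_to_bbox`` is ours and not part of ``magic_pdf``:

.. code:: python

    def poly_to_bbox(poly: list[float]) -> list[float]:
        """Convert [x0, y0, x1, y1, x2, y2, x3, y3] to [xmin, ymin, xmax, ymax]."""
        xs, ys = poly[0::2], poly[1::2]
        return [min(xs), min(ys), max(xs), max(ys)]

    # first detection of the sample data below (rounded):
    poly_to_bbox([99.19, 100.31, 730.37, 100.31, 730.37, 245.81, 99.19, 245.81])
    # -> [99.19, 100.31, 730.37, 245.81]
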
Sample Data
^^^^^^^^^^^
.. code:: json

    [
        {
            "layout_dets": [
                {
                    "category_id": 2,
                    "poly": [
                        99.1906967163086,
                        100.3119125366211,
                        730.3707885742188,
                        100.3119125366211,
                        730.3707885742188,
                        245.81326293945312,
                        99.1906967163086,
                        245.81326293945312
                    ],
                    "score": 0.9999997615814209
                }
            ],
            "page_info": {
                "page_no": 0,
                "height": 2339,
                "width": 1654
            }
        },
        {
            "layout_dets": [
                {
                    "category_id": 5,
                    "poly": [
                        99.13092803955078,
                        2210.680419921875,
                        497.3183898925781,
                        2210.680419921875,
                        497.3183898925781,
                        2264.78076171875,
                        99.13092803955078,
                        2264.78076171875
                    ],
                    "score": 0.9999997019767761
                }
            ],
            "page_info": {
                "page_no": 1,
                "height": 2339,
                "width": 1654
            }
        }
    ]

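Assuming the pydantic models from the structure definition above are in
scope (and pydantic v2, which provides ``model_validate``), a minimal
sketch for loading and validating such a file could look like this:

.. code:: python

    import json

    with open("some_pdf_model.json", encoding="utf-8") as f:
        raw = json.load(f)

    # validate each page against the PageInferenceResults model defined above
    pages = [PageInferenceResults.model_validate(page) for page in raw]
    for page in pages:
        print(page.page_info.page_no, len(page.layout_dets))
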
some_pdf_middle.json
~~~~~~~~~~~~~~~~~~~~

+--------------------+----------------------------------------------------------+
| Field              | Description                                              |
+====================+==========================================================+
| pdf_info           | list; each element is a dict holding the parse result    |
|                    | of one pdf page; see the table below                     |
+--------------------+----------------------------------------------------------+
| \_parse_type       | ocr \| txt; the mode used for this parse                 |
+--------------------+----------------------------------------------------------+
| \_version_name     | string; the magic-pdf version used for this parse        |
+--------------------+----------------------------------------------------------+
**pdf_info** field structure

+---------------------+-------------------------------------------------------+
| Field               | Description                                           |
+=====================+=======================================================+
| preproc_blocks      | intermediate, unsegmented result of pdf preprocessing |
+---------------------+-------------------------------------------------------+
| layout_bboxes       | layout segmentation results, including the layout     |
|                     | direction (vertical, horizontal) and bbox, sorted in  |
|                     | reading order                                         |
+---------------------+-------------------------------------------------------+
| page_idx            | page index, starting from 0                           |
+---------------------+-------------------------------------------------------+
| page_size           | width and height of the page                          |
+---------------------+-------------------------------------------------------+
| \_layout_tree       | layout tree structure                                 |
+---------------------+-------------------------------------------------------+
| images              | list; each element is a dict representing an          |
|                     | img_block                                             |
+---------------------+-------------------------------------------------------+
| tables              | list; each element is a dict representing a           |
|                     | table_block                                           |
+---------------------+-------------------------------------------------------+
| interline_equations | list; each element is a dict representing an          |
|                     | interline_equation_block                              |
+---------------------+-------------------------------------------------------+
| discarded_blocks    | list; block information the model decided to drop     |
+---------------------+-------------------------------------------------------+
| para_blocks         | the result of segmenting preproc_blocks into          |
|                     | paragraphs                                            |
+---------------------+-------------------------------------------------------+
In the table above, ``para_blocks`` is an array of dicts; each dict is a
block structure. A block supports at most one level of nesting.

**block**

The outer block is called a first-level block. Its fields are:
====== ==================================================
Field  Description
====== ==================================================
type   block type (table|image)
bbox   block bounding box coordinates
blocks list; each element is a dict, a second-level block
====== ==================================================
A first-level block has only the types "table" and "image"; all other
blocks are second-level blocks.

The fields of a second-level block are:
+----------+----------------------------------------------------------------+
| Field    | Description                                                    |
+==========+================================================================+
| type     | block type                                                     |
+----------+----------------------------------------------------------------+
| bbox     | block bounding box coordinates                                 |
+----------+----------------------------------------------------------------+
| lines    | list; each element is a dict representing a line, which        |
|          | describes the composition of one line of content               |
+----------+----------------------------------------------------------------+
The second-level block types in detail:
================== ========================
type               desc
================== ========================
image_body         image body
image_caption      image caption text
image_footnote     image footnote
table_body         table body
table_caption      table caption text
table_footnote     table footnote
text               text block
title              title block
index              index block
list               list block
interline_equation interline equation block
================== ========================
**line**

The fields of a line are:
+-----------+-----------------------------------------------------------------+
| Field     | Description                                                     |
+===========+=================================================================+
| bbox      | bounding box coordinates of the line                            |
+-----------+-----------------------------------------------------------------+
| spans     | list; each element is a dict representing a span, the           |
|           | smallest unit of content                                        |
+-----------+-----------------------------------------------------------------+
**span**

+------------+---------------------------------------------------------+
| Field      | Description                                             |
+============+=========================================================+
| bbox       | bounding box coordinates of the span                    |
+------------+---------------------------------------------------------+
| type       | span type                                               |
+------------+---------------------------------------------------------+
| content \| | text spans use content, chart spans use img_path, to    |
| img_path   | store the actual text or the path of the screenshot     |
+------------+---------------------------------------------------------+
The span types are:

================== ==================
type               desc
================== ==================
image              image
table              table
text               text
inline_equation    inline equation
interline_equation interline equation
================== ==================
**Summary**

A span is the smallest storage unit of all elements.

The elements stored in para_blocks are block-level information.

The block structure is:

first-level block (if any) -> second-level block -> line -> span
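Given that structure, extracting all text from ``para_blocks`` is a
straightforward traversal. The following is a minimal sketch; the helper
is ours, not part of ``magic_pdf``:

.. code:: python

    import json

    def collect_text(middle_json_path: str) -> str:
        with open(middle_json_path, encoding="utf-8") as f:
            middle = json.load(f)
        parts = []
        for page in middle["pdf_info"]:
            for block in page["para_blocks"]:
                # a first-level block (table|image) nests second-level blocks
                subs = block["blocks"] if "blocks" in block else [block]
                for sub in subs:
                    for line in sub.get("lines", []):
                        for span in line["spans"]:
                            if "content" in span:
                                parts.append(span["content"])
        return " ".join(parts)
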
.. _示例数据-1:
Sample Data
^^^^^^^^^^^
.. code:: json

    {
        "pdf_info": [
            {
                "preproc_blocks": [
                    {
                        "type": "text",
                        "bbox": [52, 61.956024169921875, 294, 82.99800872802734],
                        "lines": [
                            {
                                "bbox": [52, 61.956024169921875, 294, 72.0000228881836],
                                "spans": [
                                    {
                                        "bbox": [54.0, 61.956024169921875, 296.2261657714844, 72.0000228881836],
                                        "content": "dependent on the service headway and the reliability of the departure ",
                                        "type": "text",
                                        "score": 1.0
                                    }
                                ]
                            }
                        ]
                    }
                ],
                "layout_bboxes": [
                    {
                        "layout_bbox": [52, 61, 294, 731],
                        "layout_label": "V",
                        "sub_layout": []
                    }
                ],
                "page_idx": 0,
                "page_size": [612.0, 792.0],
                "_layout_tree": [],
                "images": [],
                "tables": [],
                "interline_equations": [],
                "discarded_blocks": [],
                "para_blocks": [
                    {
                        "type": "text",
                        "bbox": [52, 61.956024169921875, 294, 82.99800872802734],
                        "lines": [
                            {
                                "bbox": [52, 61.956024169921875, 294, 72.0000228881836],
                                "spans": [
                                    {
                                        "bbox": [54.0, 61.956024169921875, 296.2261657714844, 72.0000228881836],
                                        "content": "dependent on the service headway and the reliability of the departure ",
                                        "type": "text",
                                        "score": 1.0
                                    }
                                ]
                            }
                        ]
                    }
                ]
            }
        ],
        "_parse_type": "txt",
        "_version_name": "0.6.1"
    }

.. |poly coordinate diagram| image:: ../../_static/image/poly.png

Pipeline
========
Minimal Example
^^^^^^^^^^^^^^^
.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

    # args
    pdf_file_name = "abc.pdf"  # replace with the real pdf path
    name_without_suff = pdf_file_name.split(".")[0]

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # read bytes
    reader1 = FileBasedDataReader("")
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)

Running the code above produces the following result:
.. code:: bash

    output/
    ├── abc.md
    └── images

Leaving aside environment setup, such as creating directories and importing dependencies, the code that actually converts the ``pdf`` to ``markdown`` is the following snippet:
.. code::

    # read bytes
    reader1 = FileBasedDataReader("")
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)

``ds.apply(doc_analyze, ocr=True)`` produces an ``InferenceResult`` object. Calling ``pipe_ocr_mode`` on the ``InferenceResult`` produces a ``PipeResult`` object, and calling ``dump_md`` on the ``PipeResult`` writes a ``markdown`` file to the specified location.

The execution of the pipeline is illustrated in the figure below:

.. image:: ../../_static/image/pipeline.drawio.svg
.. raw:: html

    <br/>

The process is currently divided into three stages: data, inference, and post-processing, corresponding to the ``Dataset``, ``InferenceResult``, and ``PipeResult`` entities in the figure. They are chained together through methods such as ``apply``, ``doc_analyze``, and ``pipe_ocr_mode``.

.. admonition:: Tip
    :class: tip

    For more usage examples of Dataset, InferenceResult, and PipeResult, see :doc:`../quick_start/to_markdown`.

    For more details about Dataset, InferenceResult, and PipeResult, see the English MinerU documentation.

Composing Pipelines
^^^^^^^^^^^^^^^^^^^
.. code:: python

    class Dataset(ABC):
        @abstractmethod
        def apply(self, proc: Callable, *args, **kwargs):
            """Apply the callable ``proc``.

            Args:
                proc (Callable): invoked as proc(self, *args, **kwargs)

            Returns:
                Any: the result generated by proc
            """
            pass

    class InferenceResult(InferenceResultBase):
        def apply(self, proc: Callable, *args, **kwargs):
            """Apply the callable ``proc``.

            Args:
                proc (Callable): invoked as proc(inference_result, *args, **kwargs)

            Returns:
                Any: the result generated by proc
            """
            return proc(copy.deepcopy(self._infer_res), *args, **kwargs)

        def pipe_ocr_mode(
            self,
            imageWriter: DataWriter,
            start_page_id=0,
            end_page_id=None,
            debug_mode=False,
            lang=None,
        ) -> PipeResult:
            pass

    class PipeResult:
        def apply(self, proc: Callable, *args, **kwargs):
            """Apply the callable ``proc``.

            Args:
                proc (Callable): invoked as proc(pipeline_result, *args, **kwargs)

            Returns:
                Any: the result generated by proc
            """
            return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)

The ``Dataset``, ``InferenceResult``, and ``PipeResult`` classes all provide an ``apply`` method, which can be used to compose computations from different stages. As shown below, ``MinerU`` ships one such composition of these classes:
.. code:: python

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)

Users can implement their own composition functions as needed. For example, here the ``apply`` method is used to count the pages of a ``pdf`` file:
.. code:: python

    from magic_pdf.data.data_reader_writer import FileBasedDataReader
    from magic_pdf.data.dataset import PymuDocDataset

    # args
    pdf_file_name = "abc.pdf"  # replace with the real pdf path

    # read bytes
    reader1 = FileBasedDataReader("")
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    def count_page(ds) -> int:
        return len(ds)

    print("page number: ", ds.apply(count_page))  # will output the page count of `abc.pdf`

## Project List

- [llama_index_rag](./llama_index_rag/README.md): Build a lightweight RAG system based on llama_index
- ~~[web_demo](./web_demo/README.md): MinerU online [demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/) localized deployment version~~ (Deprecated)
- Projects compatible with version 2.0:
  - [gradio_app](./gradio_app/README.md): Web application based on Gradio
- Projects not yet compatible with version 2.0:
  - [web_api](./web_api/README.md): Web API based on FastAPI
  - [multi_gpu](./multi_gpu/README.md): Multi-GPU parallel processing based on LitServe
## Project List

- [llama_index_rag](./llama_index_rag/README_zh-CN.md): Build a lightweight RAG system based on llama_index
- ~~[web_demo](./web_demo/README_zh-CN.md): MinerU online [demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/) localized deployment version~~ (Deprecated)
- Projects compatible with version 2.0:
  - [gradio_app](./gradio_app/README_zh-CN.md): Web application based on Gradio
- Projects not yet compatible with version 2.0:
  - [web_api](./web_api/README.md): Web API based on FastAPI
  - [multi_gpu](./multi_gpu/README.md): Multi-GPU parallel processing based on LitServe
## Installation
MinerU
```bash
git clone https://github.com/opendatalab/MinerU.git
cd MinerU
conda create -n MinerU python=3.10
conda activate MinerU
pip install .[full] --extra-index-url https://wheels.myhloli.com
```
Third-party software
```bash
# install
pip install llama-index-vector-stores-elasticsearch==0.2.0
pip install llama-index-embeddings-dashscope==0.2.0
pip install llama-index-core==0.10.68
pip install einops==0.7.0
pip install transformers-stream-generator==0.0.5
pip install accelerate==0.33.0
# uninstall
pip uninstall transformer-engine
```
## Environment Configuration
```
export DASHSCOPE_API_KEY={some_key}
export ES_USER={some_es_user}
export ES_PASSWORD={some_es_password}
export ES_URL=http://{es_url}:9200
```
For instructions on obtaining a DASHSCOPE_API_KEY, refer to the [documentation](https://help.aliyun.com/zh/dashscope/opening-service).
## Usage
### Data Ingestion
```bash
python data_ingestion.py -p some.pdf  # load data from a single pdf
# or
python data_ingestion.py -p /opt/data/some_pdf_directory/  # load data from all pdfs under the directory {some_pdf_directory}
```
### Query
```bash
python query.py --question '{the_question_you_want_to_ask}'
```
## Example
````bash
# Start the es service
docker compose up -d
# or
docker-compose up -d
# Set environment variables
export ES_USER=elastic
export ES_PASSWORD=llama_index
export ES_URL=http://127.0.0.1:9200
export DASHSCOPE_API_KEY={some_key}
# Ingest data
python data_ingestion.py -p example/data/declaration_of_the_rights_of_man_1789.pdf
# Ask a question
python query.py -q 'how about the rights of men'
## outputs
Please answer the question based on the content within ```:
```
I. Men are born, and always continue, free and equal in respect of their rights. Civil distinctions, therefore, can be founded only on public utility.
```
My question is:how about the rights of men。
question: how about the rights of men
answer: The statement implies that men are born free and equal in terms of their rights. Civil distinctions should only be based on public utility. However, it does not specify what those rights are. It is up to society and individual countries to determine and protect the specific rights of their citizens.
````
## Development
`MinerU` provides a `RAG` integration interface, allowing users to specify a single input `pdf` file or a directory. `MinerU` will automatically parse the input files and return an iterable interface for retrieving the data.
### API Interface
```python
from magic_pdf.integrations.rag.type import Node


class RagPageReader:
    def get_rel_map(self) -> list[ElementRelation]:
        # Retrieve the relationships between nodes
        pass

    ...


class RagDocumentReader:
    ...


class DataReader:
    def __init__(self, path_or_directory: str, method: str, output_dir: str):
        pass

    def get_documents_count(self) -> int:
        """Get the number of pdf documents."""
        pass

    def get_document_result(self, idx: int) -> RagDocumentReader | None:
        """Retrieve the parsed content of a specific pdf."""
        pass

    def get_document_filename(self, idx: int) -> Path:
        """Retrieve the path of a specific pdf."""
        pass
```
Type Definitions
```python
class Node(BaseModel):
    category_type: CategoryType = Field(description='Category')
    text: str | None = Field(description='Text content', default=None)
    image_path: str | None = Field(description='Path to image or table (table may be stored as an image)', default=None)
    anno_id: int = Field(description='Unique ID', default=-1)
    latex: str | None = Field(description='LaTeX output for equations or tables', default=None)
    html: str | None = Field(description='HTML output for tables', default=None)
```
Tables can be stored in one of three formats: image, LaTeX, or HTML.
`anno_id` is a globally unique ID for each Node. It can be used later to match this Node with other Nodes. The relationships between nodes can be retrieved using the `get_rel_map` method. Users can use `anno_id` to link nodes and construct a RAG index that includes node relationships.
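
For instance, a minimal sketch for walking the parsed nodes and their relationships might look like the following; the paths and the `'ocr'` method argument are illustrative:

```python
from magic_pdf.integrations.rag.api import DataReader

reader = DataReader('some.pdf', 'ocr', '/tmp/magic_pdf/rag_demo')  # illustrative paths

for idx in range(reader.get_documents_count()):
    doc = reader.get_document_result(idx)
    if doc is None:  # parsing failed for this pdf
        continue
    for page in iter(doc):
        # relationships (e.g. caption <-> body) between the nodes of this page
        for rel in page.get_rel_map():
            print(rel)
        for node in iter(page):
            print(node.anno_id, node.category_type, node.text)
```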
### Node Relationship Matrix
| | image_body | table_body |
| -------------- | ---------- | ---------- |
| image_caption | sibling | |
| table_caption | | sibling |
| table_footnote | | sibling |
<details open="open">
  <summary><h2 style="display: inline-block">Table of Contents</h2></summary>
  <ol>
    <li><a href="#introduction">Introduction</a></li>
    <li><a href="#installation">Installation</a></li>
    <li><a href="#example">Example</a></li>
    <li><a href="#development">Development</a></li>
  </ol>
</details>
## Introduction

`MinerU` provides a data `API` that lets users import data into a `RAG` system. This project demonstrates how to build a lightweight `RAG` system based on Tongyi Qianwen (Qwen).

<p align="center">
  <img src="rag_data_api.png" width="300px" style="vertical-align:middle;">
</p>
## Installation

Environment requirements:
```text
NVIDIA A100 80GB,
Centos 7 3.10.0-957.el7.x86_64
Client: Docker Engine - Community
Version: 24.0.5
API version: 1.43
Go version: go1.20.6
Git commit: ced0996
Built: Fri Jul 21 20:39:02 2023
OS/Arch: linux/amd64
Context: default
Server: Docker Engine - Community
Engine:
Version: 24.0.5
API version: 1.43 (minimum version 1.12)
Go version: go1.20.6
Git commit: a61e2b4
Built: Fri Jul 21 20:38:05 2023
OS/Arch: linux/amd64
Experimental: false
containerd:
Version: 1.6.25
GitCommit: d8f198a4ed8892c764191ef7b3b06d8a2eeb5c7f
runc:
Version: 1.1.10
GitCommit: v1.1.10-0-g18a0cb0
docker-init:
Version: 0.19.0
GitCommit: de40ad0
```
Refer to the [documentation](../../README_zh-CN.md) to install MinerU.

Third-party software:
```bash
# install
pip install modelscope==1.14.0
pip install llama-index-vector-stores-elasticsearch==0.2.0
pip install llama-index-embeddings-dashscope==0.2.0
pip install llama-index-core==0.10.68
pip install einops==0.7.0
pip install transformers-stream-generator==0.0.5
pip install accelerate==0.33.0
# uninstall
pip uninstall transformer-engine
```
## Example

````bash
cd projects/llama_index_rag

# Start the es service
docker compose up -d
# or
docker-compose up -d

# Set environment variables
export ES_USER=elastic
export ES_PASSWORD=llama_index
export ES_URL=http://127.0.0.1:9200
export DASHSCOPE_API_KEY={some_key}
# For instructions on obtaining a DASHSCOPE_API_KEY, see
# https://help.aliyun.com/zh/dashscope/opening-service

# Query before ingesting any data: Qwen returns its default answer
python query.py -q 'how about the rights of men'
## outputs
question: how about the rights of men
answer: The topic of men's rights often refers to discussions around legal, social, and political issues that affect men specifically or differently from women. Movements related to men's rights advocate for addressing areas where men face discrimination or unique challenges, such as:
Child Custody: Ensuring that men have equal opportunities for custody of their children following divorce or separation.
Domestic Violence: Recognizing that men can also be victims of domestic abuse and ensuring they have access to support services.
Mental Health and Suicide Rates: Addressing the higher rates of suicide among men and providing mental health resources.
Military Conscription: In some countries, only men are required to register for military service, which is seen as a gender-based obligation.
Workplace Safety: Historically, more men than women have been employed in high-risk occupations, leading to higher workplace injury and death rates.
Parental Leave: Advocating for paternity leave policies that allow men to take time off work for family care.
Men's rights activism often intersects with broader discussions on gender equality and aims to promote fairness and equity across genders. It's important to note that while advocating for these issues, it should be done in a way that does not detract from or oppose the goals of gender equality and the rights of other groups. The focus should be on creating a fair society where everyone has equal opportunities and protections under the law.
# Ingest data
python data_ingestion.py -p example/data/
# or
python data_ingestion.py -p example/data/declaration_of_the_rights_of_man_1789.pdf

# Query after ingesting the data: Qwen now answers based on the context retrieved by the RAG system
python query.py -q 'how about the rights of men'

## outputs
Please answer the question based on the content within ```:
```
I. Men are born, and always continue, free and equal in respect of their rights. Civil distinctions, therefore, can be founded only on public utility.
```
My question is: how about the rights of men。
question: how about the rights of men
answer: The statement implies that men are born free and equal in terms of their rights. Civil distinctions should only be based on public utility. However, it does not specify what those rights are. It is up to society and individual countries to determine and protect the specific rights of their citizens.
````
## Development

`MinerU` provides a `RAG` integration interface. Users can specify a single input `pdf` file or a directory; `MinerU` automatically parses the input files and returns an iterable interface for retrieving the data.
### API Interface

```python
from magic_pdf.integrations.rag.type import Node


class RagPageReader:
    def get_rel_map(self) -> list[ElementRelation]:
        # Retrieve the relationships between nodes
        pass

    ...


class RagDocumentReader:
    ...


class DataReader:
    def __init__(self, path_or_directory: str, method: str, output_dir: str):
        pass

    def get_documents_count(self) -> int:
        """Get the number of pdf documents."""
        pass

    def get_document_result(self, idx: int) -> RagDocumentReader | None:
        """Retrieve the parsed content of a specific pdf."""
        pass

    def get_document_filename(self, idx: int) -> Path:
        """Retrieve the path of a specific pdf."""
        pass
```
Type Definitions

```python
class Node(BaseModel):
    category_type: CategoryType = Field(description='Category')
    text: str | None = Field(description='Text content', default=None)
    image_path: str | None = Field(description='Path to image or table (table may be stored as an image)', default=None)
    anno_id: int = Field(description='Unique ID', default=-1)
    latex: str | None = Field(description='LaTeX output for equations or tables', default=None)
    html: str | None = Field(description='HTML output for tables', default=None)
```
Tables can be stored in one of three formats: image, LaTeX, or HTML.

`anno_id` is a globally unique ID for each Node. It can later be used to match this Node with other Nodes. The relationships between nodes can be retrieved via the `get_rel_map` method; users can use `anno_id` to link nodes and build a RAG index that includes node relationships.
### Node Relationship Matrix
| | image_body | table_body |
| -------------- | ---------- | ---------- |
| image_caption | sibling | |
| table_caption | | sibling |
| table_footnote | | sibling |
import os

import click
from llama_index.core.schema import TextNode
from llama_index.embeddings.dashscope import (DashScopeEmbedding,
                                              DashScopeTextEmbeddingModels,
                                              DashScopeTextEmbeddingType)
from llama_index.vector_stores.elasticsearch import ElasticsearchStore

from magic_pdf.integrations.rag.api import DataReader

es_vec_store = ElasticsearchStore(
    index_name='rag_index',
    es_url=os.getenv('ES_URL', 'http://127.0.0.1:9200'),
    es_user=os.getenv('ES_USER', 'elastic'),
    es_password=os.getenv('ES_PASSWORD', 'llama_index'),
)


# Create embeddings
# text_type=`document` to build index
def embed_node(node):
    embedder = DashScopeEmbedding(
        model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V2,
        text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT,
    )
    result_embeddings = embedder.get_text_embedding(node.text)
    node.embedding = result_embeddings
    return node


@click.command()
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local pdf filepath or directory',
)
def cli(path):
    output_dir = '/tmp/magic_pdf/integrations/rag/'
    os.makedirs(output_dir, exist_ok=True)
    documents = DataReader(path, 'ocr', output_dir)

    # build nodes
    nodes = []
    for idx in range(documents.get_documents_count()):
        doc = documents.get_document_result(idx)
        if doc is None:  # something went wrong when parsing this pdf!
            continue
        for page in iter(doc):  # iterate pages from first to last
            for element in iter(page):  # iterate the elements of each page
                if element.text is None:
                    continue
                nodes.append(
                    embed_node(
                        TextNode(text=element.text,
                                 metadata={'purpose': 'demo'})))
    es_vec_store.add(nodes)


if __name__ == '__main__':
    cli()
services:
  es:
    container_name: es
    image: docker.elastic.co/elasticsearch/elasticsearch:8.11.3
    volumes:
      - esdata01:/usr/share/elasticsearch/data
    ports:
      - 9200:9200
    environment:
      - node.name=es
      - ELASTIC_PASSWORD=llama_index
      - bootstrap.memory_lock=false
      - discovery.type=single-node
      - xpack.security.enabled=true
      - xpack.security.http.ssl.enabled=false
      - xpack.security.transport.ssl.enabled=false
    ulimits:
      memlock:
        soft: -1
        hard: -1
    restart: always

volumes:
  esdata01:
    driver: local
import os

import click
from llama_index.core.vector_stores.types import VectorStoreQuery
from llama_index.embeddings.dashscope import (DashScopeEmbedding,
                                              DashScopeTextEmbeddingModels,
                                              DashScopeTextEmbeddingType)
from llama_index.vector_stores.elasticsearch import (AsyncDenseVectorStrategy,
                                                     ElasticsearchStore)
# initialize qwen 7B model
from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

es_vector_store = ElasticsearchStore(
    index_name='rag_index',
    es_url=os.getenv('ES_URL', 'http://127.0.0.1:9200'),
    es_user=os.getenv('ES_USER', 'elastic'),
    es_password=os.getenv('ES_PASSWORD', 'llama_index'),
    retrieval_strategy=AsyncDenseVectorStrategy(),
)


def embed_text(text):
    embedder = DashScopeEmbedding(
        model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V2,
        text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT,
    )
    return embedder.get_text_embedding(text)


def search(vector_store: ElasticsearchStore, query: str):
    query_vec = VectorStoreQuery(query_embedding=embed_text(query))
    result = vector_store.query(query_vec)
    return '\n'.join([node.text for node in result.nodes])


@click.command()
@click.option(
    '-q',
    '--question',
    'question',
    required=True,
    help='ask what you want to know!',
)
def cli(question):
    tokenizer = AutoTokenizer.from_pretrained('qwen/Qwen-7B-Chat',
                                              revision='v1.0.5',
                                              trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained('qwen/Qwen-7B-Chat',
                                                 revision='v1.0.5',
                                                 device_map='auto',
                                                 trust_remote_code=True,
                                                 fp32=True).eval()
    model.generation_config = GenerationConfig.from_pretrained(
        'Qwen/Qwen-7B-Chat', revision='v1.0.5', trust_remote_code=True)

    # define a prompt template for the vectorDB-enhanced LLM generation
    def answer_question(question, context, model):
        if context == '':
            prompt = question
        else:
            prompt = f'''请基于```内的内容回答问题。
```
{context}
```
我的问题是:{question}
'''
        print(prompt)
        response, history = model.chat(tokenizer, prompt, history=None)
        return response

    answer = answer_question(question, search(es_vector_store, question),
                             model)
    print(f'question: {question}\n'
          f'answer: {answer}')


"""
python query.py -q 'how about the rights of men'
"""

if __name__ == '__main__':
    cli()
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
node_modules
dist
dist-ssr
*.local
# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?
# MinerU web
## Table of Contents
- [Local Frontend Development](#local-frontend-development)
- [Technology Stack](#technology-stack)
## Local Frontend Development
### Prerequisites
- Node.js 18.x
- pnpm
### Installation Steps
1. Install Node.js 18
- Visit the [Node.js official website](https://nodejs.org/) to download and install Node.js version 18.x
2. Install pnpm

   ```bash
   npm install -g pnpm
   ```

3. Clone the repository

   ```bash
   git clone https://github.com/opendatalab/MinerU
   cd ./projects/web
   ```
4. Install dependencies
```
pnpm install
```
5. Run the development server
```
pnpm run dev
```
6. ⚠️ Note: this command is for local development only; do not use it for deployment!
Open your browser and visit http://localhost:5173 (or another address output in the console)
7. Ensure that the backend service in ./projects/web_demo is running
8. If you encounter an error when executing `pnpm install`, you can switch to an alternative package manager.
```
npm install -g yarn
yarn
yarn start
```
## Building the Project
```
pnpm run build
```
## Technology Stack
- React
- Tailwind CSS
- TypeScript
- zustand
- ahooks
# MinerU web

## Table of Contents

- [Local Frontend Development](#local-frontend-development)
- [Technology Stack](#technology-stack)

## Local Frontend Development

### Prerequisites

- Node.js 18.x
- pnpm

### Installation Steps

1. Install Node.js 18

   - Visit the [Node.js official website](https://nodejs.org/) to download and install Node.js version 18.x

2. Install pnpm

   ```bash
   npm install -g pnpm
   ```

3. Clone the repository

   ```bash
   git clone https://github.com/opendatalab/MinerU
   cd ./projects/web
   ```

4. Install dependencies

   ```
   pnpm install
   ```

5. Run the development server

   ```
   pnpm run dev
   ```

6. ⚠️ Note: this command is for local development only; do not use it for deployment!

   Open your browser and visit http://localhost:5173 (or another address output in the console)

7. Ensure that the backend service in ./projects/web_demo is running

8. If you encounter an error when executing `pnpm install`, you can switch to an alternative package manager

   ```
   npm install -g yarn
   yarn
   yarn start
   ```

## Building the Project

To build a production version, run:

```
pnpm run build
```

## Technology Stack

- React
- Tailwind CSS
- TypeScript
- zustand
- ahooks
import js from '@eslint/js'
import globals from 'globals'
import reactHooks from 'eslint-plugin-react-hooks'
import reactRefresh from 'eslint-plugin-react-refresh'
import tseslint from 'typescript-eslint'
export default tseslint.config(
  { ignores: ['dist'] },
  {
    extends: [js.configs.recommended, ...tseslint.configs.recommended],
    files: ['**/*.{ts,tsx}'],
    languageOptions: {
      ecmaVersion: 2020,
      globals: globals.browser,
    },
    plugins: {
      'react-hooks': reactHooks,
      'react-refresh': reactRefresh,
    },
    rules: {
      ...reactHooks.configs.recommended.rules,
      'react-refresh/only-export-components': [
        'warn',
        { allowConstantExport: true },
      ],
    },
  },
)
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <link rel="icon" type="image/svg+xml" href="/logo.svg" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>MinerU</title>
  </head>
  <body>
    <div id="root"></div>
    <script type="module" src="/src/main.tsx"></script>
  </body>
</html>
{
  "name": "my-react-app",
  "private": true,
  "version": "0.0.0",
  "type": "module",
  "scripts": {
    "dev": "vite --host ",
    "build": "tsc --noEmit && vite build",
    "lint": "eslint .",
    "preview": "vite preview"
  },
  "dependencies": {
    "@ant-design/icons": "^5.4.0",
    "@codemirror/view": "^6.33.0",
    "@tanstack/react-query": "^5.56.2",
    "@types/lodash": "^4.17.7",
    "@types/qs": "^6.9.15",
    "@types/react-copy-to-clipboard": "^5.0.7",
    "@types/react-syntax-highlighter": "^15.5.13",
    "@uiw/codemirror-extensions-langs": "^4.23.0",
    "@uiw/react-codemirror": "^4.23.0",
    "ahooks": "^3.8.1",
    "antd": "^5.20.3",
    "axios": "^1.7.5",
    "canvas": "^2.11.2",
    "classnames": "^2.5.1",
    "js-cookie": "^3.0.5",
    "lodash": "^4.17.21",
    "path2d": "^0.2.1",
    "qs": "^6.13.0",
    "react": "^18.3.1",
    "react-copy-to-clipboard": "^5.1.0",
    "react-dom": "^18.3.1",
    "react-intl": "^6.6.8",
    "react-markdown": "^9.0.1",
    "react-query": "^3.39.3",
    "react-router-dom": "^6.26.1",
    "react-syntax-highlighter": "^15.5.0",
    "rehype-katex": "^7.0.1",
    "rehype-raw": "^7.0.0",
    "remark-gfm": "^4.0.0",
    "remark-math": "^6.0.0",
    "zustand": "^4.5.5"
  },
  "devDependencies": {
    "@eslint/js": "^9.9.0",
    "@types/js-cookie": "^3.0.6",
    "@types/node": "^22.5.1",
    "@types/react": "^18.3.3",
    "@types/react-dom": "^18.3.0",
    "@vitejs/plugin-react": "^4.3.1",
    "autoprefixer": "^10.4.20",
    "eslint": "^9.9.0",
    "eslint-plugin-react-hooks": "^5.1.0-rc.0",
    "eslint-plugin-react-refresh": "^0.4.9",
    "globals": "^15.9.0",
    "less": "^4.2.0",
    "postcss": "^8.4.41",
    "sass-embedded": "^1.77.8",
    "tailwindcss": "^3.4.10",
    "ts-prune": "^0.10.3",
    "typescript": "^5.5.3",
    "typescript-eslint": "^8.0.1",
    "vite": "^5.4.1"
  }
}