Unverified Commit 4bb54393 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1427 from opendatalab/release-1.0.0

Release 1.0.0
parents 04f084ac 1c9f9942
......@@ -4,7 +4,9 @@
:maxdepth: 2
user_guide/install
user_guide/usage
user_guide/quick_start
user_guide/tutorial
user_guide/data
user_guide/inference_result
user_guide/pipe_result
......@@ -87,56 +87,70 @@ Read Examples
.. code:: python
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config
# file based related
# file based related
file_based_reader1 = FileBasedDataReader('')
## will read file abc
file_based_reader1.read('abc')
## will read file abc
file_based_reader1.read('abc')
file_based_reader2 = FileBasedDataReader('/tmp')
## will read /tmp/abc
file_based_reader2.read('abc')
## will read /var/logs/message.txt
file_based_reader2.read('/var/logs/message.txt')
## will read /tmp/logs/message.txt
file_based_reader2.read('/tmp/logs/message.txt')
# multi bucket s3 releated
multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
bucket = "bucket" # replace with real bucket
ak = "ak" # replace with real access key
sk = "sk" # replace with real secret key
endpoint_url = "endpoint_url" # replace with real endpoint_url
bucket_2 = "bucket_2" # replace with real bucket
ak_2 = "ak_2" # replace with real access key
sk_2 = "sk_2" # replace with real secret key
endpoint_url_2 = "endpoint_url_2" # replace with real endpoint_url
test_prefix = 'test/unittest'
multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
## will read s3://test_bucket1/test_prefix/abc
## will read s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_reader1.read('abc')
## will read s3://test_bucket1/efg
multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
## will read s3://{bucket}/{test_prefix}/efg
multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')
## will read s3://test_bucket2/abc
multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
## will read s3://{bucket2}/{test_prefix}/abc
multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')
# s3 related
s3_reader1 = S3DataReader(
default_prefix_without_bucket = "test_prefix"
bucket: "test_bucket",
ak: "ak",
sk: "sk",
endpoint_url: "localhost"
test_prefix,
bucket,
ak,
sk,
endpoint_url
)
## will read s3://test_bucket/test_prefix/abc
## will read s3://{bucket}/{test_prefix}/abc
s3_reader1.read('abc')
## will read s3://test_bucket/efg
s3_reader1.read('s3://test_bucket/efg')
## will read s3://{bucket}/efg
s3_reader1.read(f's3://{bucket}/efg')
Write Examples
......@@ -144,65 +158,79 @@ Write Examples
.. code:: python
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
from magic_pdf.data.schemas import S3Config
# file based related
file_based_writer1 = FileBasedDataWriter('')
# file based related
file_based_writer1 = FileBasedDataWriter("")
## will write 123 to abc
file_based_writer1.write('abc', '123'.encode())
file_based_writer1.write("abc", "123".encode())
## will write 123 to abc
file_based_writer1.write_string('abc', '123')
file_based_writer1.write_string("abc", "123")
file_based_writer2 = FileBasedDataWriter('/tmp')
file_based_writer2 = FileBasedDataWriter("/tmp")
## will write 123 to /tmp/abc
file_based_writer2.write_string('abc', '123')
file_based_writer2.write_string("abc", "123")
## will write 123 to /var/logs/message.txt
file_based_writer2.write_string('/var/logs/message.txt', '123')
## will write 123 to /tmp/logs/message.txt
file_based_writer2.write_string("/tmp/logs/message.txt", "123")
# multi bucket s3 releated
multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
## will write 123 to s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write_string('abc', '123')
bucket = "bucket" # replace with real bucket
ak = "ak" # replace with real access key
sk = "sk" # replace with real secret key
endpoint_url = "endpoint_url" # replace with real endpoint_url
bucket_2 = "bucket_2" # replace with real bucket
ak_2 = "ak_2" # replace with real access key
sk_2 = "sk_2" # replace with real secret key
endpoint_url_2 = "endpoint_url_2" # replace with real endpoint_url
test_prefix = "test/unittest"
multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
f"{bucket}/{test_prefix}",
[
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
],
)
## will write 123 to s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write('abc', '123'.encode())
## will write 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write_string("abc", "123")
## will write 123 to s3://test_bucket1/efg
multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
## will write 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write("abc", "123".encode())
## will write 123 to s3://test_bucket2/abc
multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
## will write 123 to s3://{bucket}/{test_prefix}/efg
multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())
## will write 123 to s3://{bucket_2}/{test_prefix}/abc
multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())
# s3 related
s3_writer1 = S3DataWriter(
default_prefix_without_bucket = "test_prefix"
bucket: "test_bucket",
ak: "ak",
sk: "sk",
endpoint_url: "localhost"
)
s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
## will write 123 to s3://{bucket}/{test_prefix}/abc
s3_writer1.write("abc", "123".encode())
## will write 123 to s3://test_bucket/test_prefix/abc
s3_writer1.write('abc', '123'.encode())
## will write 123 to s3://{bucket}/{test_prefix}/abc
s3_writer1.write_string("abc", "123")
## will write 123 to s3://test_bucket/test_prefix/abc
s3_writer1.write_string('abc', '123')
## will write 123 to s3://{bucket}/efg
s3_writer1.write(f"s3://{bucket}/efg", "123".encode())
## will write 123 to s3://test_bucket/efg
s3_writer1.write('s3://test_bucket/efg', '123'.encode())
Check :doc:`../../api/data_reader_writer` for more details
......@@ -18,24 +18,50 @@ Read the contet from jsonl which may located on local machine or remote s3. if y
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config
# read jsonl from local machine
datasets = read_jsonl("tt.jsonl", None)
# read jsonl from local machine
datasets = read_jsonl("tt.jsonl", None) # replace with real jsonl file
# read jsonl from remote s3
datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
bucket = "bucket_1" # replace with real s3 bucket
ak = "access_key_1" # replace with real s3 access key
sk = "secret_key_1" # replace with real s3 secret key
endpoint_url = "endpoint_url_1" # replace with real s3 endpoint url
bucket_2 = "bucket_2" # replace with real s3 bucket
ak_2 = "access_key_2" # replace with real s3 access key
sk_2 = "secret_key_2" # replace with real s3 secret key
endpoint_url_2 = "endpoint_url_2" # replace with real s3 endpoint url
s3configs = [
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
]
s3_reader = MultiBucketS3DataReader(bucket, s3configs)
datasets = read_jsonl(f"s3://bucket_1/tt.jsonl", s3_reader) # replace with real s3 jsonl file
read_local_pdfs
^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^
Read pdf from path or directory.
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
# read pdf path
datasets = read_local_pdfs("tt.pdf")
......@@ -51,13 +77,30 @@ Read images from path or directory
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
# read from image path
datasets = read_local_images("tt.png") # replace with real file path
# read files from directory that endswith suffix in suffixes array
datasets = read_local_images("images/", suffixes=[".png", ".jpg"]) # replace with real directory
read_local_office
^^^^^^^^^^^^^^^^^^^^
Read MS-Office files from path or directory
.. code:: python
from magic_pdf.data.read_api import *
# read from image path
datasets = read_local_images("tt.png")
datasets = read_local_office("tt.doc") # replace with real file path
# read files from directory that endswith suffix in suffixes array
datasets = read_local_images("images/", suffixes=["png", "jpg"])
datasets = read_local_office("docs/") # replace with real directory
Check :doc:`../../api/read_api` for more details
\ No newline at end of file
Inference Result
==================
.. admonition:: Tip
:class: tip
Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
The **InferenceResult** class is a container for storing model inference results and implements a series of methods related to these results, such as draw_model, dump_model.
Checkout :doc:`../api/model_operators` for more details about **InferenceResult**
Model Inference Result
-----------------------
Structure Definition
^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
from pydantic import BaseModel, Field
from enum import IntEnum
class CategoryType(IntEnum):
title = 0 # Title
plain_text = 1 # Text
abandon = 2 # Includes headers, footers, page numbers, and page annotations
figure = 3 # Image
figure_caption = 4 # Image description
table = 5 # Table
table_caption = 6 # Table description
table_footnote = 7 # Table footnote
isolate_formula = 8 # Block formula
formula_caption = 9 # Formula label
embedding = 13 # Inline formula
isolated = 14 # Block formula
text = 15 # OCR recognition result
class PageInfo(BaseModel):
page_no: int = Field(description="Page number, the first page is 0", ge=0)
height: int = Field(description="Page height", gt=0)
width: int = Field(description="Page width", ge=0)
class ObjectInferenceResult(BaseModel):
category_id: CategoryType = Field(description="Category", ge=0)
poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
score: float = Field(description="Confidence of the inference result")
latex: str | None = Field(description="LaTeX parsing result", default=None)
html: str | None = Field(description="HTML parsing result", default=None)
class PageInferenceResults(BaseModel):
layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results", ge=0)
page_info: PageInfo = Field(description="Page metadata")
Example
^^^^^^^^^^^
.. code:: json
[
{
"layout_dets": [
{
"category_id": 2,
"poly": [
99.1906967163086,
100.3119125366211,
730.3707885742188,
100.3119125366211,
730.3707885742188,
245.81326293945312,
99.1906967163086,
245.81326293945312
],
"score": 0.9999997615814209
}
],
"page_info": {
"page_no": 0,
"height": 2339,
"width": 1654
}
},
{
"layout_dets": [
{
"category_id": 5,
"poly": [
99.13092803955078,
2210.680419921875,
497.3183898925781,
2210.680419921875,
497.3183898925781,
2264.78076171875,
99.13092803955078,
2264.78076171875
],
"score": 0.9999997019767761
}
],
"page_info": {
"page_no": 1,
"height": 2339,
"width": 1654
}
}
]
The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
representing the coordinates of the top-left, top-right, bottom-right,
and bottom-left points respectively. |Poly Coordinate Diagram|
Inference Result
-------------------------
.. code:: python
from magic_pdf.operators.models import InferenceResult
from magic_pdf.data.dataset import Dataset
dataset : Dataset = some_data_set # not real dataset
# The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
model_inference_result: list[PageInferenceResults] = []
Inference_result = InferenceResult(model_inference_result, dataset)
some_model.pdf
^^^^^^^^^^^^^^^^^^^^
.. figure:: ../_static/image/inference_result.png
.. |Poly Coordinate Diagram| image:: ../_static/image/poly.png
......@@ -8,5 +8,5 @@ Installation
install/install
install//boost_with_cuda
install/download_model_weight_files
install/config
......@@ -9,25 +9,7 @@ appropriate guide based on your system:
- :ref:`ubuntu_22_04_lts_section`
- :ref:`windows_10_or_11_section`
- Quick Deployment with Docker
.. admonition:: Important
:class: tip
Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
.. code-block:: bash
bash docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
.. code:: sh
wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
docker build -t mineru:latest .
docker run --rm -it --gpus=all mineru:latest /bin/bash
magic-pdf --help
.. _ubuntu_22_04_lts_section:
......
Config
=========
File **magic-pdf.json** is typically located in the **${HOME}** directory under a Linux system or in the **C:\Users\{username}** directory under a Windows system.
.. admonition:: Tip
:class: tip
You can override the default location of config file via the following command:
export MINERU_TOOLS_CONFIG_JSON=new_magic_pdf.json
magic-pdf.json
----------------
.. code:: json
{
"bucket_info":{
"bucket-name-1":["ak", "sk", "endpoint"],
"bucket-name-2":["ak", "sk", "endpoint"]
},
"models-dir":"/tmp/models",
"layoutreader-model-dir":"/tmp/layoutreader",
"device-mode":"cpu",
"layout-config": {
"model": "layoutlmv3"
},
"formula-config": {
"mfd_model": "yolo_v8_mfd",
"mfr_model": "unimernet_small",
"enable": true
},
"table-config": {
"model": "rapid_table",
"enable": false,
"max_time": 400
},
"config_version": "1.0.0"
}
bucket_info
^^^^^^^^^^^^^^
Store the access_key, secret_key and endpoint of AWS S3 Compatible storage config
Example:
.. code:: text
{
"image_bucket":[{access_key}, {secret_key}, {endpoint}],
"video_bucket":[{access_key}, {secret_key}, {endpoint}]
}
models-dir
^^^^^^^^^^^^
Store the models downloaded from **huggingface** or **modelscope**. You do not need to modify this field if you download the model using the scripts shipped with **MinerU**
layoutreader-model-dir
^^^^^^^^^^^^^^^^^^^^^^^
Store the models downloaded from **huggingface** or **modelscope**. You do not need to modify this field if you download the model using the scripts shipped with **MinerU**
device-mode
^^^^^^^^^^^^^^
This field has two options: **cpu** or **cuda**.
**cpu**: inference via cpu
**cuda**: using cuda to accelerate inference
layout-config
^^^^^^^^^^^^^^^
.. code:: json
{
"model": "layoutlmv3"
}
The layout model cannot be disabled at the moment, and we have only one kind of layout model currently.
formula-config
^^^^^^^^^^^^^^^^
.. code:: json
{
"mfd_model": "yolo_v8_mfd",
"mfr_model": "unimernet_small",
"enable": true
}
mfd_model
""""""""""
Specify the formula detection model, options are ['yolo_v8_mfd']
mfr_model
""""""""""
Specify the formula recognition model, options are ['unimernet_small']
Check `UniMERNet <https://github.com/opendatalab/UniMERNet>`_ for more details
enable
""""""""
on-off flag, options are [true, false]. **true** means enable formula inference, **false** means disable formula inference
table-config
^^^^^^^^^^^^^^^^
.. code:: json
{
"model": "rapid_table",
"enable": false,
"max_time": 400
}
model
""""""""
Specify the table inference model, options are ['rapid_table', 'tablemaster', 'struct_eqtable']
max_time
"""""""""
Since table recognition is a time-consuming process, we set a timeout period. If the process exceeds this time, the table recognition will be terminated.
enable
"""""""
on-off flag, options are [true, false]. **true** means enable table inference, **false** means disable table inference
config_version
^^^^^^^^^^^^^^^^
The version of config schema.
.. admonition:: Tip
:class: tip
Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest details
......@@ -4,6 +4,7 @@ Install
If you encounter any installation issues, please first consult the :doc:`../../additional_notes/faq`.
If the parsing results are not as expected, refer to the :doc:`../../additional_notes/known_issues`.
Also you can try `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ without installation.
.. admonition:: Warning
:class: tip
......@@ -88,7 +89,7 @@ If the parsing results are not as expected, refer to the :doc:`../../additional_
Create an environment
~~~~~~~~~~~~~~~~~~~~~
---------------------------
.. code-block:: shell
......@@ -98,7 +99,7 @@ Create an environment
Download model weight files
~~~~~~~~~~~~~~~~~~~~~~~~~~
------------------------------
.. code-block:: shell
......@@ -107,4 +108,32 @@ Download model weight files
python download_models_hf.py
MinerU is now installed. Check out :doc:`../quick_start` or read :doc:`boost_with_cuda` to accelerate inference
\ No newline at end of file
Install LibreOffice[Optional]
----------------------------------
This section is required for handling the **doc**, **docx**, **ppt**, and **pptx** filetypes. You can **skip** this section if you do not need to process those filetypes.
Linux/Macos Platform
""""""""""""""""""""""
.. code::
apt-get/yum/brew install libreoffice
Windows Platform
""""""""""""""""""""
.. code::
install libreoffice
append "install_dir\LibreOffice\program" to ENVIRONMENT PATH
.. tip::
MinerU is now installed. Check out :doc:`../usage/command_line` to convert your first pdf, **or** read the following sections for more details about installation
Pipe Result
==============
.. admonition:: Tip
:class: tip
Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
The **PipeResult** class is a container for storing pipeline processing results and implements a series of methods related to these results, such as draw_layout, draw_span.
Checkout :doc:`../api/pipe_operators` for more details about **PipeResult**
Structure Definitions
-------------------------------
**some_pdf_middle.json**
+----------------+--------------------------------------------------------------+
| Field Name | Description |
| | |
+================+==============================================================+
| pdf_info | list, each element is a dict representing the parsing result |
| | of each PDF page, see the table below for details |
+----------------+--------------------------------------------------------------+
| \_ | ocr \| txt, used to indicate the mode used in this |
| parse_type | intermediate parsing state |
| | |
+----------------+--------------------------------------------------------------+
| \_version_name | string, indicates the version of magic-pdf used in this |
| | parsing |
| | |
+----------------+--------------------------------------------------------------+
**pdf_info**
Field structure description
+-------------------------+------------------------------------------------------------+
| Field | Description |
| Name | |
+=========================+============================================================+
| preproc_blocks | Intermediate result after PDF preprocessing, not yet |
| | segmented |
+-------------------------+------------------------------------------------------------+
| layout_bboxes | Layout segmentation results, containing layout direction |
| | (vertical, horizontal), and bbox, sorted by reading order |
+-------------------------+------------------------------------------------------------+
| page_idx | Page number, starting from 0 |
| | |
+-------------------------+------------------------------------------------------------+
| page_size | Page width and height |
| | |
+-------------------------+------------------------------------------------------------+
| \_layout_tree | Layout tree structure |
| | |
+-------------------------+------------------------------------------------------------+
| images | list, each element is a dict representing an img_block |
+-------------------------+------------------------------------------------------------+
| tables | list, each element is a dict representing a table_block |
+-------------------------+------------------------------------------------------------+
| interline_equation | list, each element is a dict representing an |
| | interline_equation_block |
| | |
+-------------------------+------------------------------------------------------------+
| discarded_blocks | List, block information returned by the model that needs |
| | to be dropped |
| | |
+-------------------------+------------------------------------------------------------+
| para_blocks | Result after segmenting preproc_blocks |
| | |
+-------------------------+------------------------------------------------------------+
In the above table, ``para_blocks`` is an array of dicts, each dict
representing a block structure. A block can support up to one level of
nesting.
**block**
The outer block is referred to as a first-level block, and the fields in
the first-level block include:
+------------------------+-------------------------------------------------------------+
| Field | Description |
| Name | |
+========================+=============================================================+
| type | Block type (table|image) |
+------------------------+-------------------------------------------------------------+
| bbox | Block bounding box coordinates |
+------------------------+-------------------------------------------------------------+
| blocks | list, each element is a dict representing a second-level |
| | block |
+------------------------+-------------------------------------------------------------+
There are only two types of first-level blocks: “table” and “image”. All
other blocks are second-level blocks.
The fields in a second-level block include:
+----------------------+----------------------------------------------------------------+
| Field | Description |
| Name | |
+======================+================================================================+
| | Block type |
| type | |
+----------------------+----------------------------------------------------------------+
| | Block bounding box coordinates |
| bbox | |
+----------------------+----------------------------------------------------------------+
| | list, each element is a dict representing a line, used to |
| lines | describe the composition of a line of information |
+----------------------+----------------------------------------------------------------+
Detailed explanation of second-level block types
================== ======================
type Description
================== ======================
image_body Main body of the image
image_caption Image description text
table_body Main body of the table
table_caption Table description text
table_footnote Table footnote
text Text block
title Title block
interline_equation Block formula
================== ======================
**line**
The field format of a line is as follows:
+---------------------+----------------------------------------------------------------+
| Field | Description |
| Name | |
+=====================+================================================================+
| | Bounding box coordinates of the line |
| bbox | |
+---------------------+----------------------------------------------------------------+
| spans | list, each element is a dict representing a span, used to |
| | describe the composition of the smallest unit |
+---------------------+----------------------------------------------------------------+
**span**
+---------------------+-----------------------------------------------------------+
| Field | Description |
| Name | |
+=====================+===========================================================+
| bbox | Bounding box coordinates of the span |
+---------------------+-----------------------------------------------------------+
| type | Type of the span |
+---------------------+-----------------------------------------------------------+
| content | Text spans use content, chart spans use img_path to store |
| \| | the actual text or screenshot path information |
| img_path | |
+---------------------+-----------------------------------------------------------+
The types of spans are as follows:
================== ==============
type Description
================== ==============
image Image
table Table
text Text
inline_equation Inline formula
interline_equation Block formula
================== ==============
**Summary**
A span is the smallest storage unit for all elements.
The elements stored within para_blocks are block information.
The block structure is as follows:
First-level block (if any) -> Second-level block -> Line -> Span
.. _example-1:
example
^^^^^^^
.. code:: json
{
"pdf_info": [
{
"preproc_blocks": [
{
"type": "text",
"bbox": [
52,
61.956024169921875,
294,
82.99800872802734
],
"lines": [
{
"bbox": [
52,
61.956024169921875,
294,
72.0000228881836
],
"spans": [
{
"bbox": [
54.0,
61.956024169921875,
296.2261657714844,
72.0000228881836
],
"content": "dependent on the service headway and the reliability of the departure ",
"type": "text",
"score": 1.0
}
]
}
]
}
],
"layout_bboxes": [
{
"layout_bbox": [
52,
61,
294,
731
],
"layout_label": "V",
"sub_layout": []
}
],
"page_idx": 0,
"page_size": [
612.0,
792.0
],
"_layout_tree": [],
"images": [],
"tables": [],
"interline_equations": [],
"discarded_blocks": [],
"para_blocks": [
{
"type": "text",
"bbox": [
52,
61.956024169921875,
294,
82.99800872802734
],
"lines": [
{
"bbox": [
52,
61.956024169921875,
294,
72.0000228881836
],
"spans": [
{
"bbox": [
54.0,
61.956024169921875,
296.2261657714844,
72.0000228881836
],
"content": "dependent on the service headway and the reliability of the departure ",
"type": "text",
"score": 1.0
}
]
}
]
}
]
}
],
"_parse_type": "txt",
"_version_name": "0.6.1"
}
Pipeline Result
------------------
.. code:: python
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
from magic_pdf.operators.pipes import PipeResult
from magic_pdf.data.dataset import Dataset
res = pdf_parse_union(*args, **kwargs)
res['_parse_type'] = PARSE_TYPE_OCR
res['_version_name'] = __version__
if 'lang' in kwargs and kwargs['lang'] is not None:
res['lang'] = kwargs['lang']
dataset : Dataset = some_dataset # not real dataset
pipeResult = PipeResult(res, dataset)
some_pdf_layout.pdf
~~~~~~~~~~~~~~~~~~~
Each page layout consists of one or more boxes. The number at the top
left of each box indicates its sequence number. Additionally, in
``layout.pdf``, different content blocks are highlighted with different
background colors.
.. figure:: ../_static/image/layout_example.png
:alt: layout example
layout example
some_pdf_spans.pdf
~~~~~~~~~~~~~~~~~~
All spans on the page are drawn with different colored line frames
according to the span type. This file can be used for quality control,
allowing for quick identification of issues such as missing text or
unrecognized inline formulas.
.. figure:: ../_static/image/spans_example.png
:alt: spans example
spans example
......@@ -2,12 +2,11 @@
Quick Start
==============
Eager to get started? This page gives a good introduction to MinerU. Follow Installation to set up a project and install MinerU first.
Want to learn about the usage methods under different scenarios? This page gives good examples of multiple usage cases that match your needs.
.. toctree::
:maxdepth: 1
quick_start/command_line
quick_start/to_markdown
quick_start/convert_pdf
quick_start/convert_image
quick_start/convert_ms_office
Convert Image
===============
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file have correct suffix
magic-pdf -p a.png -o output -m auto
API
^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_images
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_file = "some_image.jpg" # replace with real image file
input_file_name = input_file.split(".")[0]
ds = read_local_images(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
Convert Doc
=============
.. admonition:: Warning
:class: tip
When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: python
# replace with real ms-office file, we support MS-DOC, MS-DOCX, MS-PPT, MS-PPTX now
magic-pdf -p a.doc -o output -m auto
API
^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office
from magic_pdf.config.enums import SupportedPdfParseMethod
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_file = "some_doc.doc" # replace with real ms-office file, we support MS-DOC, MS-DOCX, MS-PPT, MS-PPTX now
input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0]
## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir)
else:
ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir)
Convert PDF
============
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file have correct suffix
magic-pdf -p a.pdf -o output -m auto
API
^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
name_without_suff = pdf_file_name.split(".")[0]
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{name_without_suff}.md", image_dir
)
else:
ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{name_without_suff}.md", image_dir
)
......@@ -7,6 +7,5 @@ From the beginning to the end, Show how to using mineru via a minimal project
.. toctree::
:maxdepth: 1
tutorial/output_file_description
tutorial/pipeline
......@@ -28,7 +28,6 @@ Minimal Example
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
image_dir = str(os.path.basename(local_image_dir))
# read bytes
reader1 = FileBasedDataReader("")
......@@ -85,8 +84,6 @@ These stages are linked together through methods like ``apply``, ``doc_analyze``
.. admonition:: Tip
:class: tip
For more examples on how to use ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../quick_start/to_markdown`
For more detailed information about ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../../api/dataset`, :doc:`../../api/model_operators`, :doc:`../../api/pipe_operators`
......
Usage
========
.. toctree::
:maxdepth: 1
usage/command_line
usage/api
usage/docker
Api Usage
===========
PDF
----
Local File Example
^^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
name_without_suff = pdf_file_name.split(".")[0]
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
### get model inference result
model_inference_result = infer_result.get_infer_res()
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
### get markdown content
md_content = pipe_result.get_markdown(image_dir)
### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir)
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
### get middle json
middle_json_content = pipe_result.get_middle_json()
### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
S3 File Example
^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
ak = "{Your S3 access key}" # replace with real s3 access key
sk = "{Your S3 secret key}" # replace with real s3 secret key
endpoint_url = "{Your S3 endpoint_url}" # replace with real s3 endpoint_url
reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url) # replace `unittest/tmp` with the real s3 prefix
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
md_writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
# args
pdf_file_name = (
f"s3://{bucket_name}/unittest/tmp/bug5-11.pdf" # replace with the real s3 path
)
# prepare env
local_dir = "output"
name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
# read bytes
pdf_bytes = reader.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
### get model inference result
model_inference_result = infer_result.get_infer_res()
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
### get markdown content
md_content = pipe_result.get_markdown(image_dir)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir)
### get middle json
middle_json_content = pipe_result.get_middle_json()
### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
MS-Office
----------
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_file = "some_ppt.ppt" # replace with real ms-office file
input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0]
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
This code snippet can be used to process **ppt**, **pptx**, **doc**, **docx** files
Image
---------
Single Image File
^^^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_images
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_file = "some_image.jpg" # replace with real image file
input_file_name = input_file.split(".")[0]
ds = read_local_images(input_file)[0]
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
Directory That Contains Images
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_images
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_directory = "some_image_dir/" # replace with real directory that contains images
dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])
count = 0
for ds in dss:
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{count}.md", image_dir
)
count += 1
Check :doc:`../data/data_reader_writer` for more [reader | writer] examples and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details
......@@ -10,7 +10,8 @@ Command Line
Options:
-v, --version display the version and exit
-p, --path PATH local pdf filepath or directory [required]
-p, --path PATH local filepath or directory. support PDF, PPT,
PPTX, DOC, DOCX, PNG, JPG files [required]
-o, --output-dir PATH output local directory [required]
-m, --method [ocr|txt|auto] the method for parsing pdf. ocr: using ocr
technique to extract information from pdf. txt:
......@@ -40,6 +41,20 @@ Command Line
## command line example
magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
.. admonition:: Important
:class: tip
The file name must end with one of the following suffixes:
.pdf
.png
.jpg
.ppt
.pptx
.doc
.docx
``{some_pdf}`` can be a single PDF file or a directory containing
multiple PDFs. The results will be saved in the ``{some_output_dir}``
directory. The output file list is as follows:
......@@ -57,6 +72,6 @@ directory. The output file list is as follows:
.. admonition:: Tip
:class: tip
For more information about the output files, please refer to the :doc:`../tutorial/output_file_description`
For more information about the output files, please refer to the :doc:`../inference_result` or :doc:`../pipe_result`
Docker
=======
.. admonition:: Important
:class: tip
Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
.. code-block:: bash
docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
.. code:: sh
wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
docker build -t mineru:latest .
docker run --rm -it --gpus=all mineru:latest /bin/bash
magic-pdf --help
......@@ -8,6 +8,7 @@ myst-parser
Pillow==8.4.0
pydantic>=2.7.2,<2.8.0
PyMuPDF>=1.24.9
pdfminer.six==20231228
sphinx
sphinx-argparse>=0.5.2
sphinx-book-theme>=1.1.3
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment