Unverified Commit bdacf291 authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #1257 from icecraft/docs/refactor_en_docs

Docs/refactor en docs
parents 2df3e901 302a6950
......@@ -55,7 +55,7 @@ class FileBasedDataWriter(DataWriter):
if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
fn_path = os.path.join(self._parent_dir, path)
if not os.path.exists(os.path.dirname(fn_path)):
if not os.path.exists(os.path.dirname(fn_path)) and os.path.dirname(fn_path) != "":
os.makedirs(os.path.dirname(fn_path), exist_ok=True)
with open(fn_path, 'wb') as f:
......
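The new guard matters when a bare filename is written through a writer created with an empty parent directory: os.path.dirname then returns '', and os.makedirs('') raises FileNotFoundError. A minimal sketch of the failing case (hypothetical path):

import os

fn_path = 'output.md'                  # bare relative filename, no parent directory
assert os.path.dirname(fn_path) == ''  # dirname of a bare name is the empty string
# os.makedirs('', exist_ok=True)       # would raise FileNotFoundError, hence the added check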
import json
import os
import tempfile
import shutil
from pathlib import Path
from magic_pdf.config.exceptions import EmptyData, InvalidParams
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
MultiBucketS3DataReader)
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
def read_jsonl(
s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
......@@ -58,23 +60,68 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
list[PymuDocDataset]: each pdf file will be converted to a PymuDocDataset
"""
if os.path.isdir(path):
reader = FileBasedDataReader(path)
return [
PymuDocDataset(reader.read(doc_path.name))
for doc_path in Path(path).glob('*.pdf')
]
reader = FileBasedDataReader()
ret = []
for root, _, files in os.walk(path):
for file in files:
suffix = file.split('.')
if suffix[-1] == 'pdf':
ret.append(PymuDocDataset(reader.read(os.path.join(root, file))))
return ret
else:
reader = FileBasedDataReader()
bits = reader.read(path)
return [PymuDocDataset(bits)]
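Switching from Path.glob('*.pdf') to os.walk means PDFs in nested sub-directories are now picked up as well; a usage sketch, assuming a hypothetical layout:

# docs/a.pdf
# docs/reports/b.pdf   <- found only by the os.walk version
datasets = read_local_pdfs('docs/')
assert len(datasets) == 2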
def read_local_office(path: str) -> list[PymuDocDataset]:
"""Read ms-office file (ppt, pptx, doc, docx) from path or directory.
Args:
path (str): ms-office file or directory that contains ms-office files
Returns:
list[PymuDocDataset]: each ms-office file will be converted to a PymuDocDataset
Raises:
ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
FileNotFoundError: File not found
Exception: Unknown Exception raised
"""
suffixes = ['.ppt', '.pptx', '.doc', '.docx']
fns = []
ret = []
if os.path.isdir(path):
for root, _, files in os.walk(path):
for file in files:
suffix = Path(file).suffix
if suffix in suffixes:
fns.append((os.path.join(root, file)))
else:
fns.append(path)
reader = FileBasedDataReader()
temp_dir = tempfile.mkdtemp()
for fn in fns:
try:
convert_file_to_pdf(fn, temp_dir)
except ConvertToPdfError as e:
raise e
except FileNotFoundError as e:
raise e
except Exception as e:
raise e
fn_path = Path(fn)
pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
ret.append(PymuDocDataset(reader.read(pdf_fn)))
shutil.rmtree(temp_dir)
return ret
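A usage sketch for the new helper; conversion shells out to LibreOffice, so the soffice binary must be on PATH (paths hypothetical):

datasets = read_local_office('office_files/')  # every ppt/pptx/doc/docx underneath
single = read_local_office('slides.pptx')      # a single office file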
def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
def read_local_images(path: str, suffixes: list[str] = ['.png', '.jpg']) -> list[ImageDataset]:
"""Read images from path or directory.
Args:
path (str): image file path or directory that contains image files
suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
Returns:
list[ImageDataset]: each image file will be converted to an ImageDataset
......@@ -82,12 +129,12 @@ def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
if os.path.isdir(path):
imgs_bits = []
s_suffixes = set(suffixes)
reader = FileBasedDataReader(path)
reader = FileBasedDataReader()
for root, _, files in os.walk(path):
for file in files:
suffix = file.split('.')
if suffix[-1] in s_suffixes:
imgs_bits.append(reader.read(file))
suffix = Path(file).suffix
if suffix in s_suffixes:
imgs_bits.append(reader.read(os.path.join(root, file)))
return [ImageDataset(bits) for bits in imgs_bits]
else:
reader = FileBasedDataReader()
......
......@@ -65,31 +65,6 @@ class InferenceResultBase(ABC):
"""
pass
@abstractmethod
def pipe_auto_mode(
self,
imageWriter: DataWriter,
start_page_id=0,
end_page_id=None,
debug_mode=False,
lang=None,
) -> PipeResult:
"""Post-proc the model inference result.
step1: classify the dataset type
step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
Args:
imageWriter (DataWriter): the image writer handle
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
lang (str, optional): Defaults to None.
Returns:
PipeResult: the result
"""
pass
@abstractmethod
def pipe_txt_mode(
self,
......
......@@ -71,40 +71,6 @@ class InferenceResult(InferenceResultBase):
"""
return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
def pipe_auto_mode(
self,
imageWriter: DataWriter,
start_page_id=0,
end_page_id=None,
debug_mode=False,
lang=None,
) -> PipeResult:
"""Post-proc the model inference result.
step1: classify the dataset type
step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
Args:
imageWriter (DataWriter): the image writer handle
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
lang (str, optional): Defaults to None.
Returns:
PipeResult: the result
"""
pdf_proc_method = classify(self._dataset.data_bits())
if pdf_proc_method == SupportedPdfParseMethod.TXT:
return self.pipe_txt_mode(
imageWriter, start_page_id, end_page_id, debug_mode, lang
)
else:
return self.pipe_ocr_mode(
imageWriter, start_page_id, end_page_id, debug_mode, lang
)
def pipe_txt_mode(
self,
imageWriter: DataWriter,
......
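With pipe_auto_mode removed from both the base class and the implementation, callers now classify the dataset themselves, as the do_parse change further below shows. A minimal sketch of the replacement pattern, assuming ds is a PymuDocDataset, infer_result an InferenceResult, and the enum import path as in the repo:

from magic_pdf.config.enums import SupportedPdfParseMethod

if ds.classify() == SupportedPdfParseMethod.TXT:
    pipe_result = infer_result.pipe_txt_mode(image_writer, lang=lang)
else:
    pipe_result = infer_result.pipe_ocr_mode(image_writer, lang=lang)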
import os
from pathlib import Path
import shutil
import tempfile
import click
import fitz
from loguru import logger
from pathlib import Path
import magic_pdf.model as model_config
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import do_parse, parse_pdf_methods
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
pdf_suffixes = ['.pdf']
ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
image_suffixes = ['.png', '.jpg']
@click.command()
......@@ -21,7 +28,7 @@ from magic_pdf.tools.common import do_parse, parse_pdf_methods
'path',
type=click.Path(exists=True),
required=True,
help='local pdf filepath or directory',
help='local filepath or directory. Supports PDF, PPT, PPTX, DOC, DOCX, PNG, and JPG files',
)
@click.option(
'-o',
......@@ -83,12 +90,27 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
model_config.__use_inside_model__ = True
model_config.__model_mode__ = 'full'
os.makedirs(output_dir, exist_ok=True)
temp_dir = tempfile.mkdtemp()
def read_fn(path: Path):
if path.suffix in ms_office_suffixes:
convert_file_to_pdf(str(path), temp_dir)
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
elif path.suffix in image_suffixes:
with open(str(path), 'rb') as f:
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
with open(fn, 'wb') as f:
f.write(pdf_bytes)
elif path.suffix in pdf_suffixes:
fn = str(path)
else:
raise Exception(f"Unknown file suffix: {path.suffix}")
def read_fn(path):
disk_rw = FileBasedDataReader(os.path.dirname(path))
return disk_rw.read(os.path.basename(path))
disk_rw = FileBasedDataReader(os.path.dirname(fn))
return disk_rw.read(os.path.basename(fn))
def parse_doc(doc_path: str):
def parse_doc(doc_path: Path):
try:
file_name = str(Path(doc_path).stem)
pdf_data = read_fn(doc_path)
......@@ -108,10 +130,13 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
logger.exception(e)
if os.path.isdir(path):
for doc_path in Path(path).glob('*.pdf'):
for doc_path in Path(path).glob('*'):
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
parse_doc(doc_path)
else:
parse_doc(path)
parse_doc(Path(path))
shutil.rmtree(temp_dir)
if __name__ == '__main__':
......
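For reference, the image branch of read_fn above relies on PyMuPDF wrapping raw image bytes into a one-page PDF; the same conversion as a standalone sketch (file names hypothetical):

import fitz  # PyMuPDF

with open('page.png', 'rb') as f:  # hypothetical image file
    bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()  # image bytes -> one-page PDF bytes
with open('page.pdf', 'wb') as f:
    f.write(pdf_bytes)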
......@@ -170,6 +170,7 @@ def do_parse(
logger.error('need model list input')
exit(2)
else:
infer_result = InferenceResult(model_list, ds)
if parse_method == 'ocr':
pipe_result = infer_result.pipe_ocr_mode(
......@@ -180,10 +181,16 @@ def do_parse(
image_writer, debug_mode=True, lang=lang
)
else:
pipe_result = infer_result.pipe_auto_mode(
if ds.classify() == SupportedPdfParseMethod.TXT:
pipe_result = infer_result.pipe_txt_mode(
image_writer, debug_mode=True, lang=lang
)
else:
pipe_result = infer_result.pipe_ocr_mode(
image_writer, debug_mode=True, lang=lang
)
if f_draw_model_bbox:
infer_result.draw_model(
os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf')
......
import os
import subprocess
from pathlib import Path
class ConvertToPdfError(Exception):
def __init__(self, msg):
self.msg = msg
super().__init__(self.msg)
def convert_file_to_pdf(input_path, output_dir):
if not os.path.isfile(input_path):
raise FileNotFoundError(f"The input file {input_path} does not exist.")
os.makedirs(output_dir, exist_ok=True)
cmd = [
'soffice',
'--headless',
'--convert-to', 'pdf',
'--outdir', str(output_dir),
str(input_path)
]
process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if process.returncode != 0:
raise ConvertToPdfError(process.stderr.decode())
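A usage sketch for convert_file_to_pdf; it requires LibreOffice's soffice binary on PATH, and the file names are hypothetical:

try:
    convert_file_to_pdf('report.docx', '/tmp/pdf_out')  # writes /tmp/pdf_out/report.pdf
except ConvertToPdfError as e:
    print(f'conversion failed: {e.msg}')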
......@@ -4,8 +4,11 @@ Glossary
===========
1. jsonl
TODO: add description
Newline-delimited (\n): each line must be a valid, independent JSON object.
Currently, all functions shipped with **MinerU** assume that each JSON object contains a field named either **path** or **file_location**
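For illustration, a minimal two-line jsonl file satisfying these assumptions could be produced like this (paths are hypothetical):

.. code:: python

    lines = [
        '{"path": "s3://bucket/files/doc1.pdf"}',
        '{"file_location": "/tmp/doc2.pdf"}',
    ]
    with open('example.jsonl', 'w') as f:  # one JSON object per line
        f.write('\n'.join(lines) + '\n')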
2. magic-pdf.json
TODO: add description
TODO
......@@ -70,6 +70,12 @@ Key Features
- Supports both CPU and GPU environments.
- Compatible with Windows, Linux, and Mac platforms.
.. tip::
Get started with MinerU by trying the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ or :doc:`installing it locally <user_guide/install/install>`.
User Guide
-------------
.. toctree::
......
......@@ -4,7 +4,9 @@
:maxdepth: 2
user_guide/install
user_guide/usage
user_guide/quick_start
user_guide/tutorial
user_guide/data
user_guide/inference_result
user_guide/pipe_result
......@@ -87,7 +87,10 @@ Read Examples
.. code:: python
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import FileBasedDataReader, MultiBucketS3DataReader, S3DataReader
from magic_pdf.data.schemas import S3Config
# file based related
file_based_reader1 = FileBasedDataReader('')
......@@ -100,43 +103,54 @@ Read Examples
## will read /tmp/abc
file_based_reader2.read('abc')
## will read /var/logs/message.txt
file_based_reader2.read('/var/logs/message.txt')
## will read /tmp/logs/message.txt
file_based_reader2.read('/tmp/logs/message.txt')
# multi bucket s3 related
multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
bucket = "bucket" # replace with real bucket
ak = "ak" # replace with real access key
sk = "sk" # replace with real secret key
endpoint_url = "endpoint_url" # replace with real endpoint_url
bucket_2 = "bucket_2" # replace with real bucket
ak_2 = "ak_2" # replace with real access key
sk_2 = "sk_2" # replace with real secret key
endpoint_url_2 = "endpoint_url_2" # replace with real endpoint_url
test_prefix = 'test/unittest'
multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
## will read s3://test_bucket1/test_prefix/abc
## will read s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_reader1.read('abc')
## will read s3://test_bucket1/efg
multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
## will read s3://{bucket}/{test_prefix}/efg
multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')
## will read s3://test_bucket2/abc
multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
## will read s3://{bucket_2}/{test_prefix}/abc
multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')
# s3 related
s3_reader1 = S3DataReader(
default_prefix_without_bucket = "test_prefix"
bucket: "test_bucket",
ak: "ak",
sk: "sk",
endpoint_url: "localhost"
test_prefix,
bucket,
ak,
sk,
endpoint_url
)
## will read s3://test_bucket/test_prefix/abc
## will read s3://{bucket}/{test_prefix}/abc
s3_reader1.read('abc')
## will read s3://test_bucket/efg
s3_reader1.read('s3://test_bucket/efg')
## will read s3://{bucket}/efg
s3_reader1.read(f's3://{bucket}/efg')
Write Examples
......@@ -144,65 +158,79 @@ Write Examples
.. code:: python
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, MultiBucketS3DataWriter, S3DataWriter
from magic_pdf.data.schemas import S3Config
# file based related
file_based_writer1 = FileBasedDataWriter('')
file_based_writer1 = FileBasedDataWriter("")
## will write 123 to abc
file_based_writer1.write('abc', '123'.encode())
file_based_writer1.write("abc", "123".encode())
## will write 123 to abc
file_based_writer1.write_string('abc', '123')
file_based_writer1.write_string("abc", "123")
file_based_writer2 = FileBasedDataWriter('/tmp')
file_based_writer2 = FileBasedDataWriter("/tmp")
## will write 123 to /tmp/abc
file_based_writer2.write_string('abc', '123')
file_based_writer2.write_string("abc", "123")
## will write 123 to /var/logs/message.txt
file_based_writer2.write_string('/var/logs/message.txt', '123')
## will write 123 to /tmp/logs/message.txt
file_based_writer2.write_string("/tmp/logs/message.txt", "123")
# multi bucket s3 related
multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
bucket = "bucket" # replace with real bucket
ak = "ak" # replace with real access key
sk = "sk" # replace with real secret key
endpoint_url = "endpoint_url" # replace with real endpoint_url
bucket_2 = "bucket_2" # replace with real bucket
ak_2 = "ak_2" # replace with real access key
sk_2 = "sk_2" # replace with real secret key
endpoint_url_2 = "endpoint_url_2" # replace with real endpoint_url
test_prefix = "test/unittest"
multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
f"{bucket}/{test_prefix}",
[
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
),
],
)
## will write 123 to s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write_string('abc', '123')
## will write 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write_string("abc", "123")
## will write 123 to s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write('abc', '123'.encode())
## will write 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write("abc", "123".encode())
## will write 123 to s3://test_bucket1/efg
multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
## will write 123 to s3://{bucket}/{test_prefix}/efg
multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())
## will write 123 to s3://test_bucket2/abc
multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
## will write 123 to s3://{bucket_2}/{test_prefix}/abc
multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())
# s3 related
s3_writer1 = S3DataWriter(
default_prefix_without_bucket = "test_prefix"
bucket: "test_bucket",
ak: "ak",
sk: "sk",
endpoint_url: "localhost"
)
s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
## will write 123 to s3://{bucket}/{test_prefix}/abc
s3_writer1.write("abc", "123".encode())
## will write 123 to s3://test_bucket/test_prefix/abc
s3_writer1.write('abc', '123'.encode())
## will write 123 to s3://{bucket}/{test_prefix}/abc
s3_writer1.write_string("abc", "123")
## will write 123 to s3://test_bucket/test_prefix/abc
s3_writer1.write_string('abc', '123')
## will write 123 to s3://{bucket}/efg
s3_writer1.write(f"s3://{bucket}/efg", "123".encode())
## will write 123 to s3://test_bucket/efg
s3_writer1.write('s3://test_bucket/efg', '123'.encode())
Check :doc:`../../api/data_reader_writer` for more details
......@@ -18,24 +18,50 @@ Read the content from jsonl, which may be located on the local machine or on remote s3; to read from s3, pass an s3 reader.
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config
# read jsonl from local machine
datasets = read_jsonl("tt.jsonl", None)
datasets = read_jsonl("tt.jsonl", None) # replace with real jsonl file
# read jsonl from remote s3
datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
bucket = "bucket_1" # replace with real s3 bucket
ak = "access_key_1" # replace with real s3 access key
sk = "secret_key_1" # replace with real s3 secret key
endpoint_url = "endpoint_url_1" # replace with real s3 endpoint url
bucket_2 = "bucket_2" # replace with real s3 bucket
ak_2 = "access_key_2" # replace with real s3 access key
sk_2 = "secret_key_2" # replace with real s3 secret key
endpoint_url_2 = "endpoint_url_2" # replace with real s3 endpoint url
s3configs = [
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
]
s3_reader = MultiBucketS3DataReader(bucket, s3configs)
datasets = read_jsonl(f"s3://{bucket}/tt.jsonl", s3_reader) # replace with real s3 jsonl file
read_local_pdfs
^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^
Read pdf from path or directory.
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
# read pdf path
datasets = read_local_pdfs("tt.pdf")
......@@ -51,13 +77,13 @@ Read images from path or directory
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
# read from image path
datasets = read_local_images("tt.png")
datasets = read_local_images("tt.png") # replace with real file path
# read files from directory that endswith suffix in suffixes array
datasets = read_local_images("images/", suffixes=["png", "jpg"])
datasets = read_local_images("images/", suffixes=["png", "jpg"]) # replace with real directory
Check :doc:`../../api/read_api` for more details
\ No newline at end of file
Inference Result
==================
.. admonition:: Tip
:class: tip
Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
The **InferenceResult** class is a container for storing model inference results and implements a series of methods related to these results, such as draw_model, dump_model.
Check out :doc:`../api/model_operators` for more details about **InferenceResult**
Model Inference Result
-----------------------
Structure Definition
^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
from pydantic import BaseModel, Field
from enum import IntEnum
class CategoryType(IntEnum):
title = 0 # Title
plain_text = 1 # Text
abandon = 2 # Includes headers, footers, page numbers, and page annotations
figure = 3 # Image
figure_caption = 4 # Image description
table = 5 # Table
table_caption = 6 # Table description
table_footnote = 7 # Table footnote
isolate_formula = 8 # Block formula
formula_caption = 9 # Formula label
embedding = 13 # Inline formula
isolated = 14 # Block formula
text = 15 # OCR recognition result
class PageInfo(BaseModel):
page_no: int = Field(description="Page number, the first page is 0", ge=0)
height: int = Field(description="Page height", gt=0)
width: int = Field(description="Page width", ge=0)
class ObjectInferenceResult(BaseModel):
category_id: CategoryType = Field(description="Category", ge=0)
poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
score: float = Field(description="Confidence of the inference result")
latex: str | None = Field(description="LaTeX parsing result", default=None)
html: str | None = Field(description="HTML parsing result", default=None)
class PageInferenceResults(BaseModel):
layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results")
page_info: PageInfo = Field(description="Page metadata")
Example
^^^^^^^^^^^
.. code:: json
[
{
"layout_dets": [
{
"category_id": 2,
"poly": [
99.1906967163086,
100.3119125366211,
730.3707885742188,
100.3119125366211,
730.3707885742188,
245.81326293945312,
99.1906967163086,
245.81326293945312
],
"score": 0.9999997615814209
}
],
"page_info": {
"page_no": 0,
"height": 2339,
"width": 1654
}
},
{
"layout_dets": [
{
"category_id": 5,
"poly": [
99.13092803955078,
2210.680419921875,
497.3183898925781,
2210.680419921875,
497.3183898925781,
2264.78076171875,
99.13092803955078,
2264.78076171875
],
"score": 0.9999997019767761
}
],
"page_info": {
"page_no": 1,
"height": 2339,
"width": 1654
}
}
]
The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
representing the coordinates of the top-left, top-right, bottom-right,
and bottom-left points respectively. |Poly Coordinate Diagram|
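The four points describe an axis-aligned rectangle in practice, so a plain bbox can be recovered with a small helper; a sketch (not part of the MinerU API):

.. code:: python

    def poly_to_bbox(poly: list[float]) -> list[float]:
        """Collapse [x0, y0, x1, y1, x2, y2, x3, y3] into [xmin, ymin, xmax, ymax]."""
        xs, ys = poly[0::2], poly[1::2]
        return [min(xs), min(ys), max(xs), max(ys)]

    # using the first layout_det from the example above:
    poly_to_bbox([99.19, 100.31, 730.37, 100.31, 730.37, 245.81, 99.19, 245.81])
    # -> [99.19, 100.31, 730.37, 245.81]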
Inference Result
-------------------------
.. code:: python
from magic_pdf.model.operators import InferenceResult
from magic_pdf.data.dataset import Dataset
dataset: Dataset = some_data_set # not real dataset
# The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
model_inference_result: list[PageInferenceResults] = []
inference_result = InferenceResult(model_inference_result, dataset)
some_model.pdf
^^^^^^^^^^^^^^^^^^^^
.. figure:: ../_static/image/Inference_result.png
.. |Poly Coordinate Diagram| image:: ../_static/image/poly.png
......@@ -8,5 +8,5 @@ Installation
install/install
install/boost_with_cuda
install/download_model_weight_files
install/config
......@@ -9,25 +9,7 @@ appropriate guide based on your system:
- :ref:`ubuntu_22_04_lts_section`
- :ref:`windows_10_or_11_section`
- Quick Deployment with Docker
.. admonition:: Important
:class: tip
Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
.. code-block:: bash
docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
.. code:: sh
wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
docker build -t mineru:latest .
docker run --rm -it --gpus=all mineru:latest /bin/bash
magic-pdf --help
.. _ubuntu_22_04_lts_section:
......
Config
=========
The **magic-pdf.json** file is typically located in the **${HOME}** directory on Linux, or in the **C:\Users\{username}** directory on Windows.
magic-pdf.json
----------------
.. code:: json
{
"bucket_info":{
"bucket-name-1":["ak", "sk", "endpoint"],
"bucket-name-2":["ak", "sk", "endpoint"]
},
"models-dir":"/tmp/models",
"layoutreader-model-dir":"/tmp/layoutreader",
"device-mode":"cpu",
"layout-config": {
"model": "layoutlmv3"
},
"formula-config": {
"mfd_model": "yolo_v8_mfd",
"mfr_model": "unimernet_small",
"enable": true
},
"table-config": {
"model": "rapid_table",
"enable": false,
"max_time": 400
},
"config_version": "1.0.0"
}
bucket_info
^^^^^^^^^^^^^^
Stores the access_key, secret_key, and endpoint of an AWS S3-compatible storage config
Example:
.. code:: text
{
"image_bucket":[{access_key}, {secret_key}, {endpoint}],
"video_bucket":[{access_key}, {secret_key}, {endpoint}]
}
models-dir
^^^^^^^^^^^^
Stores the models downloaded from **huggingface** or **modelscope**. You do not need to modify this field if you downloaded the models using the scripts shipped with **MinerU**
layoutreader-model-dir
^^^^^^^^^^^^^^^^^^^^^^^
Stores the models downloaded from **huggingface** or **modelscope**. You do not need to modify this field if you downloaded the models using the scripts shipped with **MinerU**
device-mode
^^^^^^^^^^^^^^
This field has two options: **cpu** or **cuda**.
**cpu**: run inference on the CPU
**cuda**: use CUDA to accelerate inference
layout-config
^^^^^^^^^^^^^^^
.. code:: json
{
"model": "layoutlmv3"
}
The layout model cannot be disabled at present, and only one kind of layout model is currently available.
formula-config
^^^^^^^^^^^^^^^^
.. code:: json
{
"mfd_model": "yolo_v8_mfd",
"mfr_model": "unimernet_small",
"enable": true
}
mfd_model
""""""""""
Specify the formula detection model, options are ['yolo_v8_mfd']
mfr_model
""""""""""
Specify the formula recognition model, options are ['unimernet_small']
Check `UniMERNet <https://github.com/opendatalab/UniMERNet>`_ for more details
enable
""""""""
On/off flag; options are [true, false]. **true** enables formula inference, **false** disables it
table-config
^^^^^^^^^^^^^^^^
.. code:: json
{
"model": "rapid_table",
"enable": false,
"max_time": 400
}
model
""""""""
Specify the table inference model, options are ['rapid_table', 'tablemaster', 'struct_eqtable']
max_time
"""""""""
Since table recognition is a time-consuming process, we set a timeout period. If the process exceeds this time, the table recognition will be terminated.
enable
"""""""
On/off flag; options are [true, false]. **true** enables table inference, **false** disables it
config_version
^^^^^^^^^^^^^^^^
The version of the config schema.
.. admonition:: Tip
:class: tip
Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest details
......@@ -4,6 +4,7 @@ Install
If you encounter any installation issues, please first consult the :doc:`../../additional_notes/faq`.
If the parsing results are not as expected, refer to the :doc:`../../additional_notes/known_issues`.
You can also try the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ without installation.
.. admonition:: Warning
:class: tip
......@@ -88,7 +89,7 @@ If the parsing results are not as expected, refer to the :doc:`../../additional_
Create an environment
~~~~~~~~~~~~~~~~~~~~~
---------------------------
.. code-block:: shell
......@@ -98,7 +99,7 @@ Create an environment
Download model weight files
~~~~~~~~~~~~~~~~~~~~~~~~~~
------------------------------
.. code-block:: shell
......@@ -107,4 +108,32 @@ Download model weight files
python download_models_hf.py
MinerU is now installed. Check out :doc:`../quick_start` or read :doc:`boost_with_cuda` to accelerate inference
\ No newline at end of file
Install LibreOffice [Optional]
----------------------------------
This section is required for handling the **doc**, **docx**, **ppt**, and **pptx** file types. You can **skip** it if you do not need to process those file types.
Linux/macOS Platform
""""""""""""""""""""""
.. code::
apt-get/yum/brew install libreoffice
Windows Platform
""""""""""""""""""""
.. code::
Install LibreOffice
Append "install_dir\LibreOffice\program" to the PATH environment variable
.. tip::
MinerU is now installed. Check out :doc:`../usage/command_line` to convert your first PDF, **or** read the following sections for more installation details
Pipe Result
==============
.. admonition:: Tip
:class: tip
Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
The **PipeResult** class is a container for storing pipeline processing results and implements a series of methods related to these results, such as draw_layout, draw_span.
Check out :doc:`../api/pipe_operators` for more details about **PipeResult**
Structure Definitions
-------------------------------
**some_pdf_middle.json**
+----------------+--------------------------------------------------------------+
| Field Name | Description |
| | |
+================+==============================================================+
| pdf_info | list, each element is a dict representing the parsing result |
| | of each PDF page, see the table below for details |
+----------------+--------------------------------------------------------------+
| \_ | ocr \| txt, used to indicate the mode used in this |
| parse_type | intermediate parsing state |
| | |
+----------------+--------------------------------------------------------------+
| \_version_name | string, indicates the version of magic-pdf used in this |
| | parsing |
| | |
+----------------+--------------------------------------------------------------+
**pdf_info**
Field structure description
+-------------------------+------------------------------------------------------------+
| Field | Description |
| Name | |
+=========================+============================================================+
| preproc_blocks | Intermediate result after PDF preprocessing, not yet |
| | segmented |
+-------------------------+------------------------------------------------------------+
| layout_bboxes | Layout segmentation results, containing layout direction |
| | (vertical, horizontal), and bbox, sorted by reading order |
+-------------------------+------------------------------------------------------------+
| page_idx | Page number, starting from 0 |
| | |
+-------------------------+------------------------------------------------------------+
| page_size | Page width and height |
| | |
+-------------------------+------------------------------------------------------------+
| \_layout_tree | Layout tree structure |
| | |
+-------------------------+------------------------------------------------------------+
| images | list, each element is a dict representing an img_block |
+-------------------------+------------------------------------------------------------+
| tables | list, each element is a dict representing a table_block |
+-------------------------+------------------------------------------------------------+
| interline_equation | list, each element is a dict representing an |
| | interline_equation_block |
| | |
+-------------------------+------------------------------------------------------------+
| discarded_blocks | List, block information returned by the model that needs |
| | to be dropped |
| | |
+-------------------------+------------------------------------------------------------+
| para_blocks | Result after segmenting preproc_blocks |
| | |
+-------------------------+------------------------------------------------------------+
In the above table, ``para_blocks`` is an array of dicts, each dict
representing a block structure. A block can support up to one level of
nesting.
**block**
The outer block is referred to as a first-level block, and the fields in
the first-level block include:
+------------------------+-------------------------------------------------------------+
| Field | Description |
| Name | |
+========================+=============================================================+
| type | Block type (table|image) |
+------------------------+-------------------------------------------------------------+
| bbox | Block bounding box coordinates |
+------------------------+-------------------------------------------------------------+
| blocks | list, each element is a dict representing a second-level |
| | block |
+------------------------+-------------------------------------------------------------+
There are only two types of first-level blocks: “table” and “image”. All
other blocks are second-level blocks.
The fields in a second-level block include:
+----------------------+----------------------------------------------------------------+
| Field | Description |
| Name | |
+======================+================================================================+
| | Block type |
| type | |
+----------------------+----------------------------------------------------------------+
| | Block bounding box coordinates |
| bbox | |
+----------------------+----------------------------------------------------------------+
| | list, each element is a dict representing a line, used to |
| lines | describe the composition of a line of information |
+----------------------+----------------------------------------------------------------+
Detailed explanation of second-level block types
================== ======================
type Description
================== ======================
image_body Main body of the image
image_caption Image description text
table_body Main body of the table
table_caption Table description text
table_footnote Table footnote
text Text block
title Title block
interline_equation Block formula
================== ======================
**line**
The field format of a line is as follows:
+---------------------+----------------------------------------------------------------+
| Field | Description |
| Name | |
+=====================+================================================================+
| | Bounding box coordinates of the line |
| bbox | |
+---------------------+----------------------------------------------------------------+
| spans | list, each element is a dict representing a span, used to |
| | describe the composition of the smallest unit |
+---------------------+----------------------------------------------------------------+
**span**
+---------------------+-----------------------------------------------------------+
| Field | Description |
| Name | |
+=====================+===========================================================+
| bbox | Bounding box coordinates of the span |
+---------------------+-----------------------------------------------------------+
| type | Type of the span |
+---------------------+-----------------------------------------------------------+
| content | Text spans use content, chart spans use img_path to store |
| \| | the actual text or screenshot path information |
| img_path | |
+---------------------+-----------------------------------------------------------+
The types of spans are as follows:
================== ==============
type Description
================== ==============
image Image
table Table
text Text
inline_equation Inline formula
interline_equation Block formula
================== ==============
**Summary**
A span is the smallest storage unit for all elements.
The elements stored within para_blocks are block information.
The block structure is as follows:
First-level block (if any) -> Second-level block -> Line -> Span
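To make the nesting concrete, the following sketch walks one page of ``pdf_info`` down to its spans and collects the text content (assuming the middle-json structure described above; this helper is not part of the MinerU API):

.. code:: python

    def collect_text(page: dict) -> str:
        """Walk para_blocks -> (nested) blocks -> lines -> spans and join the text."""
        parts = []
        for block in page["para_blocks"]:
            # first-level table/image blocks nest second-level blocks under "blocks"
            for sub in block.get("blocks", [block]):
                for line in sub.get("lines", []):
                    for span in line["spans"]:
                        if "content" in span:
                            parts.append(span["content"])
        return " ".join(parts)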
.. _example-1:
example
^^^^^^^
.. code:: json
{
"pdf_info": [
{
"preproc_blocks": [
{
"type": "text",
"bbox": [
52,
61.956024169921875,
294,
82.99800872802734
],
"lines": [
{
"bbox": [
52,
61.956024169921875,
294,
72.0000228881836
],
"spans": [
{
"bbox": [
54.0,
61.956024169921875,
296.2261657714844,
72.0000228881836
],
"content": "dependent on the service headway and the reliability of the departure ",
"type": "text",
"score": 1.0
}
]
}
]
}
],
"layout_bboxes": [
{
"layout_bbox": [
52,
61,
294,
731
],
"layout_label": "V",
"sub_layout": []
}
],
"page_idx": 0,
"page_size": [
612.0,
792.0
],
"_layout_tree": [],
"images": [],
"tables": [],
"interline_equations": [],
"discarded_blocks": [],
"para_blocks": [
{
"type": "text",
"bbox": [
52,
61.956024169921875,
294,
82.99800872802734
],
"lines": [
{
"bbox": [
52,
61.956024169921875,
294,
72.0000228881836
],
"spans": [
{
"bbox": [
54.0,
61.956024169921875,
296.2261657714844,
72.0000228881836
],
"content": "dependent on the service headway and the reliability of the departure ",
"type": "text",
"score": 1.0
}
]
}
]
}
]
}
],
"_parse_type": "txt",
"_version_name": "0.6.1"
}
Pipeline Result
------------------
.. code:: python
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
from magic_pdf.pipe.operators import PipeResult
from magic_pdf.data.dataset import Dataset
res = pdf_parse_union(*args, **kwargs)  # placeholder arguments, not a runnable call
res['_parse_type'] = PARSE_TYPE_OCR  # PARSE_TYPE_OCR and __version__ are assumed to be imported elsewhere
res['_version_name'] = __version__
if 'lang' in kwargs and kwargs['lang'] is not None:
res['lang'] = kwargs['lang']
dataset : Dataset = some_dataset # not real dataset
pipeResult = PipeResult(res, dataset)
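Once constructed, the result object exposes the drawing helpers mentioned above; a usage sketch, assuming a writable output directory:

.. code:: python

    import os

    out_dir = "/tmp/mineru_out"  # hypothetical output directory
    os.makedirs(out_dir, exist_ok=True)
    pipeResult.draw_layout(os.path.join(out_dir, "layout.pdf"))  # layout boxes per page
    pipeResult.draw_span(os.path.join(out_dir, "spans.pdf"))  # span boxes per page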
some_pdf_layout.pdf
~~~~~~~~~~~~~~~~~~~
Each page layout consists of one or more boxes. The number at the top
left of each box indicates its sequence number. Additionally, in
``layout.pdf``, different content blocks are highlighted with different
background colors.
.. figure:: ../_static/image/layout_example.png
:alt: layout example
layout example
some_pdf_spans.pdf
~~~~~~~~~~~~~~~~~~
All spans on the page are drawn with different colored line frames
according to the span type. This file can be used for quality control,
allowing for quick identification of issues such as missing text or
unrecognized inline formulas.
.. figure:: ../_static/image/spans_example.png
:alt: spans example
spans example
\ No newline at end of file
......@@ -2,12 +2,14 @@
Quick Start
==============
Eager to get started? This page gives a good introduction to MinerU. Follow Installation to set up a project and install MinerU first.
Want to learn how to use MinerU in different scenarios? This page gives good examples covering multiple use cases to match your needs.
.. toctree::
:maxdepth: 1
quick_start/command_line
quick_start/to_markdown
quick_start/convert_pdf
quick_start/convert_image
quick_start/convert_ppt
quick_start/convert_pptx
quick_start/convert_doc
quick_start/convert_docx