Unverified Commit bdacf291 authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #1257 from icecraft/docs/refactor_en_docs

Docs/refactor en docs
parents 2df3e901 302a6950
@@ -55,7 +55,7 @@ class FileBasedDataWriter(DataWriter):
         if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
             fn_path = os.path.join(self._parent_dir, path)
-        if not os.path.exists(os.path.dirname(fn_path)):
+        if not os.path.exists(os.path.dirname(fn_path)) and os.path.dirname(fn_path) != "":
             os.makedirs(os.path.dirname(fn_path), exist_ok=True)
         with open(fn_path, 'wb') as f:
...
 import json
 import os
+import tempfile
+import shutil
 from pathlib import Path

 from magic_pdf.config.exceptions import EmptyData, InvalidParams
 from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
                                                MultiBucketS3DataReader)
 from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
+from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError

 def read_jsonl(
     s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
@@ -58,23 +60,68 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
         list[PymuDocDataset]: each pdf file will be converted to a PymuDocDataset
     """
     if os.path.isdir(path):
-        reader = FileBasedDataReader(path)
-        return [
-            PymuDocDataset(reader.read(doc_path.name))
-            for doc_path in Path(path).glob('*.pdf')
-        ]
+        reader = FileBasedDataReader()
+        ret = []
+        for root, _, files in os.walk(path):
+            for file in files:
+                suffix = file.split('.')
+                if suffix[-1] == 'pdf':
+                    ret.append(PymuDocDataset(reader.read(os.path.join(root, file))))
+        return ret
     else:
         reader = FileBasedDataReader()
        bits = reader.read(path)
         return [PymuDocDataset(bits)]
+
+
+def read_local_office(path: str) -> list[PymuDocDataset]:
+    """Read ms-office file (ppt, pptx, doc, docx) from path or directory.
+
+    Args:
+        path (str): ms-office file or directory that contains ms-office files
+
+    Returns:
+        list[PymuDocDataset]: each ms-office file will be converted to a PymuDocDataset
+
+    Raises:
+        ConvertToPdfError: failed to convert ms-office file to pdf via libreoffice
+        FileNotFoundError: file not found
+        Exception: unknown exception raised
+    """
+    suffixes = ['.ppt', '.pptx', '.doc', '.docx']
+    fns = []
+    ret = []
+    if os.path.isdir(path):
+        for root, _, files in os.walk(path):
+            for file in files:
+                suffix = Path(file).suffix
+                if suffix in suffixes:
+                    fns.append(os.path.join(root, file))
+    else:
+        fns.append(path)
+
+    reader = FileBasedDataReader()
+    temp_dir = tempfile.mkdtemp()
+    for fn in fns:
+        try:
+            convert_file_to_pdf(fn, temp_dir)
+        except ConvertToPdfError as e:
+            raise e
+        except FileNotFoundError as e:
+            raise e
+        except Exception as e:
+            raise e
+        fn_path = Path(fn)
+        pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
+        ret.append(PymuDocDataset(reader.read(pdf_fn)))
+    shutil.rmtree(temp_dir)
+    return ret
+

-def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
+def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
     """Read images from path or directory.

     Args:
         path (str): image file path or directory that contains image files
-        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
+        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']

     Returns:
         list[ImageDataset]: each image file will be converted to an ImageDataset
@@ -82,12 +129,12 @@ def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
     if os.path.isdir(path):
         imgs_bits = []
         s_suffixes = set(suffixes)
-        reader = FileBasedDataReader(path)
+        reader = FileBasedDataReader()
         for root, _, files in os.walk(path):
             for file in files:
-                suffix = file.split('.')
-                if suffix[-1] in s_suffixes:
-                    imgs_bits.append(reader.read(file))
+                suffix = Path(file).suffix
+                if suffix in s_suffixes:
+                    imgs_bits.append(reader.read(os.path.join(root, file)))
         return [ImageDataset(bits) for bits in imgs_bits]
     else:
         reader = FileBasedDataReader()
...
@@ -65,31 +65,6 @@ class InferenceResultBase(ABC):
         """
         pass

-    @abstractmethod
-    def pipe_auto_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        """Post-proc the model inference result.
-
-        step1: classify the dataset type
-        step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
-
-        Args:
-            imageWriter (DataWriter): the image writer handle
-            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
-            end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
-            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
-            lang (str, optional): Defaults to None.
-
-        Returns:
-            PipeResult: the result
-        """
-        pass
-
     @abstractmethod
     def pipe_txt_mode(
         self,
...
@@ -71,40 +71,6 @@ class InferenceResult(InferenceResultBase):
         """
         return proc(copy.deepcopy(self._infer_res), *args, **kwargs)

-    def pipe_auto_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        """Post-proc the model inference result.
-
-        step1: classify the dataset type
-        step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
-
-        Args:
-            imageWriter (DataWriter): the image writer handle
-            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
-            end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
-            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
-            lang (str, optional): Defaults to None.
-
-        Returns:
-            PipeResult: the result
-        """
-        pdf_proc_method = classify(self._dataset.data_bits())
-
-        if pdf_proc_method == SupportedPdfParseMethod.TXT:
-            return self.pipe_txt_mode(
-                imageWriter, start_page_id, end_page_id, debug_mode, lang
-            )
-        else:
-            return self.pipe_ocr_mode(
-                imageWriter, start_page_id, end_page_id, debug_mode, lang
-            )
-
     def pipe_txt_mode(
         self,
         imageWriter: DataWriter,
...
 import os
-from pathlib import Path
+import shutil
+import tempfile

 import click
+import fitz
 from loguru import logger
+from pathlib import Path

 import magic_pdf.model as model_config
 from magic_pdf.data.data_reader_writer import FileBasedDataReader
 from magic_pdf.libs.version import __version__
 from magic_pdf.tools.common import do_parse, parse_pdf_methods
+from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
+
+pdf_suffixes = ['.pdf']
+ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
+image_suffixes = ['.png', '.jpg']

 @click.command()
@@ -21,7 +28,7 @@ from magic_pdf.tools.common import do_parse, parse_pdf_methods
     'path',
     type=click.Path(exists=True),
     required=True,
-    help='local pdf filepath or directory',
+    help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
 )
 @click.option(
     '-o',
@@ -83,12 +90,27 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
     model_config.__use_inside_model__ = True
     model_config.__model_mode__ = 'full'
     os.makedirs(output_dir, exist_ok=True)
+    temp_dir = tempfile.mkdtemp()

-    def read_fn(path):
-        disk_rw = FileBasedDataReader(os.path.dirname(path))
-        return disk_rw.read(os.path.basename(path))
+    def read_fn(path: Path):
+        if path.suffix in ms_office_suffixes:
+            convert_file_to_pdf(str(path), temp_dir)
+            fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+        elif path.suffix in image_suffixes:
+            with open(str(path), 'rb') as f:
+                bits = f.read()
+            pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
+            fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+            with open(fn, 'wb') as f:
+                f.write(pdf_bytes)
+        elif path.suffix in pdf_suffixes:
+            fn = str(path)
+        else:
+            raise Exception(f"Unknown file suffix: {path.suffix}")
+
+        disk_rw = FileBasedDataReader(os.path.dirname(fn))
+        return disk_rw.read(os.path.basename(fn))

-    def parse_doc(doc_path: str):
+    def parse_doc(doc_path: Path):
         try:
             file_name = str(Path(doc_path).stem)
             pdf_data = read_fn(doc_path)
@@ -108,10 +130,13 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
             logger.exception(e)

     if os.path.isdir(path):
-        for doc_path in Path(path).glob('*.pdf'):
-            parse_doc(doc_path)
+        for doc_path in Path(path).glob('*'):
+            if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
+                parse_doc(doc_path)
     else:
-        parse_doc(path)
+        parse_doc(Path(path))
+
+    shutil.rmtree(temp_dir)

 if __name__ == '__main__':
...
@@ -170,6 +170,7 @@ def do_parse(
         logger.error('need model list input')
         exit(2)
     else:
+
         infer_result = InferenceResult(model_list, ds)
         if parse_method == 'ocr':
             pipe_result = infer_result.pipe_ocr_mode(
@@ -180,9 +181,15 @@ def do_parse(
                 image_writer, debug_mode=True, lang=lang
             )
         else:
-            pipe_result = infer_result.pipe_auto_mode(
-                image_writer, debug_mode=True, lang=lang
-            )
+            if ds.classify() == SupportedPdfParseMethod.TXT:
+                pipe_result = infer_result.pipe_txt_mode(
+                    image_writer, debug_mode=True, lang=lang
+                )
+            else:
+                pipe_result = infer_result.pipe_ocr_mode(
+                    image_writer, debug_mode=True, lang=lang
+                )

         if f_draw_model_bbox:
             infer_result.draw_model(
...
import os
import subprocess
from pathlib import Path


class ConvertToPdfError(Exception):
    def __init__(self, msg):
        self.msg = msg
        super().__init__(self.msg)


def convert_file_to_pdf(input_path, output_dir):
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The input file {input_path} does not exist.")

    os.makedirs(output_dir, exist_ok=True)

    cmd = [
        'soffice',
        '--headless',
        '--convert-to', 'pdf',
        '--outdir', str(output_dir),
        str(input_path)
    ]

    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if process.returncode != 0:
        raise ConvertToPdfError(process.stderr.decode())
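This new helper shells out to LibreOffice's ``soffice`` binary, so LibreOffice must be on the PATH. A minimal usage sketch, not part of the commit itself (the input file name is a placeholder):

.. code:: python

    import tempfile

    from magic_pdf.utils.office_to_pdf import ConvertToPdfError, convert_file_to_pdf

    out_dir = tempfile.mkdtemp()    # hypothetical output directory
    try:
        # writes <stem>.pdf into out_dir via `soffice --headless --convert-to pdf`
        convert_file_to_pdf('report.docx', out_dir)    # 'report.docx' is a placeholder
    except ConvertToPdfError as e:
        print(f'LibreOffice conversion failed: {e.msg}')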
@@ -4,8 +4,11 @@ Glossary
 ===========

 1. jsonl
-    TODO: add description
+    Newline-delimited (\n): each line must be a valid, self-contained JSON object.
+    Currently, all functions shipped with **MinerU** assume that the JSON object contains a field named either **path** or **file_location**.

 2. magic-pdf.json
-    TODO: add description
+    TODO
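For illustration of the jsonl shape described above, a hypothetical two-line file with one JSON object per line (the s3 keys are placeholders):

.. code:: text

    {"path": "s3://bucket-1/papers/demo1.pdf"}
    {"file_location": "s3://bucket-1/papers/demo2.pdf"}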
@@ -70,6 +70,12 @@ Key Features
 - Supports both CPU and GPU environments.
 - Compatible with Windows, Linux, and Mac platforms.

+.. tip::
+
+    Get started with MinerU by trying the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ or :doc:`installing it locally <user_guide/install/install>`.
+
 User Guide
 -------------

 .. toctree::
...
@@ -4,7 +4,9 @@
     :maxdepth: 2

     user_guide/install
+    user_guide/usage
     user_guide/quick_start
     user_guide/tutorial
     user_guide/data
+    user_guide/inference_result
+    user_guide/pipe_result
@@ -87,56 +87,70 @@ Read Examples

 .. code:: python

+    import os
     from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config

     # file based related
     file_based_reader1 = FileBasedDataReader('')

     ## will read file abc
     file_based_reader1.read('abc')

     file_based_reader2 = FileBasedDataReader('/tmp')

     ## will read /tmp/abc
     file_based_reader2.read('abc')

-    ## will read /var/logs/message.txt
-    file_based_reader2.read('/var/logs/message.txt')
+    ## will read /tmp/logs/message.txt
+    file_based_reader2.read('/tmp/logs/message.txt')

     # multi bucket s3 related
-    multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
-        bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+    bucket = "bucket"                    # replace with real bucket
+    ak = "ak"                            # replace with real access key
+    sk = "sk"                            # replace with real secret key
+    endpoint_url = "endpoint_url"        # replace with real endpoint_url
+
+    bucket_2 = "bucket_2"                # replace with real bucket
+    ak_2 = "ak_2"                        # replace with real access key
+    sk_2 = "sk_2"                        # replace with real secret key
+    endpoint_url_2 = "endpoint_url_2"    # replace with real endpoint_url
+
+    test_prefix = 'test/unittest'
+    multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
+        bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
     ),
     S3Config(
-        bucket_name=test_bucket_2,
+        bucket_name=bucket_2,
         access_key=ak_2,
         secret_key=sk_2,
         endpoint_url=endpoint_url_2,
     )])

-    ## will read s3://test_bucket1/test_prefix/abc
+    ## will read s3://{bucket}/{test_prefix}/abc
     multi_bucket_s3_reader1.read('abc')

-    ## will read s3://test_bucket1/efg
-    multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
+    ## will read s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')

-    ## will read s3://test_bucket2/abc
-    multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
+    ## will read s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')

     # s3 related
     s3_reader1 = S3DataReader(
-        default_prefix_without_bucket = "test_prefix"
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
+        test_prefix,
+        bucket,
+        ak,
+        sk,
+        endpoint_url
     )

-    ## will read s3://test_bucket/test_prefix/abc
+    ## will read s3://{bucket}/{test_prefix}/abc
     s3_reader1.read('abc')

-    ## will read s3://test_bucket/efg
-    s3_reader1.read('s3://test_bucket/efg')
+    ## will read s3://{bucket}/efg
+    s3_reader1.read(f's3://{bucket}/efg')
 Write Examples
@@ -144,65 +158,79 @@ Write Examples

 .. code:: python

+    import os
     from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
+    from magic_pdf.data.schemas import S3Config

     # file based related
-    file_based_writer1 = FileBasedDataWriter('')
+    file_based_writer1 = FileBasedDataWriter("")

     ## will write 123 to abc
-    file_based_writer1.write('abc', '123'.encode())
+    file_based_writer1.write("abc", "123".encode())

     ## will write 123 to abc
-    file_based_writer1.write_string('abc', '123')
+    file_based_writer1.write_string("abc", "123")

-    file_based_writer2 = FileBasedDataWriter('/tmp')
+    file_based_writer2 = FileBasedDataWriter("/tmp")

     ## will write 123 to /tmp/abc
-    file_based_writer2.write_string('abc', '123')
+    file_based_writer2.write_string("abc", "123")

-    ## will write 123 to /var/logs/message.txt
-    file_based_writer2.write_string('/var/logs/message.txt', '123')
+    ## will write 123 to /tmp/logs/message.txt
+    file_based_writer2.write_string("/tmp/logs/message.txt", "123")

     # multi bucket s3 related
-    multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
-        bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
-    ),
-    S3Config(
-        bucket_name=test_bucket_2,
-        access_key=ak_2,
-        secret_key=sk_2,
-        endpoint_url=endpoint_url_2,
-    )])
+    bucket = "bucket"                    # replace with real bucket
+    ak = "ak"                            # replace with real access key
+    sk = "sk"                            # replace with real secret key
+    endpoint_url = "endpoint_url"        # replace with real endpoint_url
+
+    bucket_2 = "bucket_2"                # replace with real bucket
+    ak_2 = "ak_2"                        # replace with real access key
+    sk_2 = "sk_2"                        # replace with real secret key
+    endpoint_url_2 = "endpoint_url_2"    # replace with real endpoint_url
+
+    test_prefix = "test/unittest"
+    multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
+        f"{bucket}/{test_prefix}",
+        [
+            S3Config(
+                bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+            ),
+            S3Config(
+                bucket_name=bucket_2,
+                access_key=ak_2,
+                secret_key=sk_2,
+                endpoint_url=endpoint_url_2,
+            ),
+        ],
+    )

-    ## will write 123 to s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write_string('abc', '123')
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write_string("abc", "123")

-    ## will write 123 to s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write('abc', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write("abc", "123".encode())

-    ## will write 123 to s3://test_bucket1/efg
-    multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())

-    ## will write 123 to s3://test_bucket2/abc
-    multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
+    ## will write 123 to s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())

     # s3 related
-    s3_writer1 = S3DataWriter(
-        default_prefix_without_bucket = "test_prefix"
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
-    )
+    s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)

-    ## will write 123 to s3://test_bucket/test_prefix/abc
-    s3_writer1.write('abc', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write("abc", "123".encode())

-    ## will write 123 to s3://test_bucket/test_prefix/abc
-    s3_writer1.write_string('abc', '123')
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write_string("abc", "123")

-    ## will write 123 to s3://test_bucket/efg
-    s3_writer1.write('s3://test_bucket/efg', '123'.encode())
+    ## will write 123 to s3://{bucket}/efg
+    s3_writer1.write(f"s3://{bucket}/efg", "123".encode())

 Check :doc:`../../api/data_reader_writer` for more details
@@ -18,24 +18,50 @@ Read the content from jsonl which may be located on a local machine or remote s3. If y
 .. code:: python

-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config

     # read jsonl from local machine
-    datasets = read_jsonl("tt.jsonl", None)
+    datasets = read_jsonl("tt.jsonl", None)    # replace with a real jsonl file

     # read jsonl from remote s3
-    datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
+    bucket = "bucket_1"                  # replace with real s3 bucket
+    ak = "access_key_1"                  # replace with real s3 access key
+    sk = "secret_key_1"                  # replace with real s3 secret key
+    endpoint_url = "endpoint_url_1"      # replace with real s3 endpoint url
+
+    bucket_2 = "bucket_2"                # replace with real s3 bucket
+    ak_2 = "access_key_2"                # replace with real s3 access key
+    sk_2 = "secret_key_2"                # replace with real s3 secret key
+    endpoint_url_2 = "endpoint_url_2"    # replace with real s3 endpoint url
+
+    s3configs = [
+        S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        ),
+    ]
+
+    s3_reader = MultiBucketS3DataReader(bucket, s3configs)
+
+    datasets = read_jsonl(f"s3://bucket_1/tt.jsonl", s3_reader)    # replace with a real s3 jsonl file

 read_local_pdfs
-^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^

 Read pdf from path or directory.

 .. code:: python

-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *

     # read pdf path
     datasets = read_local_pdfs("tt.pdf")
@@ -51,13 +77,13 @@ Read images from path or directory
 .. code:: python

-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *

     # read from image path
-    datasets = read_local_images("tt.png")
+    datasets = read_local_images("tt.png")    # replace with a real file path

     # read files from directory that end with a suffix in the suffixes array
-    datasets = read_local_images("images/", suffixes=["png", "jpg"])
+    datasets = read_local_images("images/", suffixes=[".png", ".jpg"])    # replace with a real directory

 Check :doc:`../../api/read_api` for more details
\ No newline at end of file
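The refactor also adds ``read_local_office`` to ``read_api``, which is not documented above; a minimal sketch, assuming LibreOffice is installed (paths are placeholders):

.. code:: python

    from magic_pdf.data.read_api import *

    # read a single ms-office file (requires libreoffice for the pdf conversion)
    datasets = read_local_office("tt.docx")    # replace with a real file path

    # read all ppt/pptx/doc/docx files under a directory
    datasets = read_local_office("office_files/")    # replace with a real directory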
Inference Result
==================

.. admonition:: Tip
    :class: tip

    Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.

The **InferenceResult** class is a container for storing model inference results and implements a series of methods related to these results, such as draw_model and dump_model.
Check out :doc:`../api/model_operators` for more details about **InferenceResult**.
Model Inference Result
-----------------------
Structure Definition
^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
    from pydantic import BaseModel, Field
    from enum import IntEnum

    class CategoryType(IntEnum):
        title = 0               # Title
        plain_text = 1          # Text
        abandon = 2             # Includes headers, footers, page numbers, and page annotations
        figure = 3              # Image
        figure_caption = 4      # Image description
        table = 5               # Table
        table_caption = 6       # Table description
        table_footnote = 7      # Table footnote
        isolate_formula = 8     # Block formula
        formula_caption = 9     # Formula label
        embedding = 13          # Inline formula
        isolated = 14           # Block formula
        text = 15               # OCR recognition result

    class PageInfo(BaseModel):
        page_no: int = Field(description="Page number, the first page is 0", ge=0)
        height: int = Field(description="Page height", gt=0)
        width: int = Field(description="Page width", ge=0)

    class ObjectInferenceResult(BaseModel):
        category_id: CategoryType = Field(description="Category", ge=0)
        poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
        score: float = Field(description="Confidence of the inference result")
        latex: str | None = Field(description="LaTeX parsing result", default=None)
        html: str | None = Field(description="HTML parsing result", default=None)

    class PageInferenceResults(BaseModel):
        layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results", ge=0)
        page_info: PageInfo = Field(description="Page metadata")
Example
^^^^^^^^^^^
.. code:: json
    [
        {
            "layout_dets": [
                {
                    "category_id": 2,
                    "poly": [
                        99.1906967163086,
                        100.3119125366211,
                        730.3707885742188,
                        100.3119125366211,
                        730.3707885742188,
                        245.81326293945312,
                        99.1906967163086,
                        245.81326293945312
                    ],
                    "score": 0.9999997615814209
                }
            ],
            "page_info": {
                "page_no": 0,
                "height": 2339,
                "width": 1654
            }
        },
        {
            "layout_dets": [
                {
                    "category_id": 5,
                    "poly": [
                        99.13092803955078,
                        2210.680419921875,
                        497.3183898925781,
                        2210.680419921875,
                        497.3183898925781,
                        2264.78076171875,
                        99.13092803955078,
                        2264.78076171875
                    ],
                    "score": 0.9999997019767761
                }
            ],
            "page_info": {
                "page_no": 1,
                "height": 2339,
                "width": 1654
            }
        }
    ]
The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
representing the coordinates of the top-left, top-right, bottom-right,
and bottom-left points respectively. |Poly Coordinate Diagram|
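Since each ``poly`` in practice describes an axis-aligned quadrilateral, a common operation is collapsing it to an ``[x0, y0, x1, y1]`` bounding box. A minimal sketch (the helper name is hypothetical, not part of the MinerU API):

.. code:: python

    def poly_to_bbox(poly: list[float]) -> list[float]:
        # poly is [x0, y0, x1, y1, x2, y2, x3, y3]: top-left, top-right,
        # bottom-right, bottom-left; take the min/max of each axis
        xs, ys = poly[0::2], poly[1::2]
        return [min(xs), min(ys), max(xs), max(ys)]

    # e.g. the first detection of page 0 in the example above
    bbox = poly_to_bbox([99.19, 100.31, 730.37, 100.31, 730.37, 245.81, 99.19, 245.81])
    # -> [99.19, 100.31, 730.37, 245.81]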
Inference Result
-------------------------
.. code:: python
    from magic_pdf.model.operators import InferenceResult
    from magic_pdf.data.dataset import Dataset

    dataset: Dataset = some_data_set    # not a real dataset

    # The inference results of all pages, ordered by page number, are stored in a list
    # as the inference result of MinerU
    model_inference_result: list[PageInferenceResults] = []

    inference_result = InferenceResult(model_inference_result, dataset)
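With an ``InferenceResult`` in hand you can use the methods mentioned above. A sketch, assuming the ``draw_model``/``dump_model`` signatures used in the MinerU demo scripts (output paths are placeholders):

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter

    writer = FileBasedDataWriter("output")    # "output" is a placeholder directory

    # visualize the model detections on top of the original pages
    inference_result.draw_model(os.path.join("output", "model.pdf"))

    # persist the raw inference result as json
    inference_result.dump_model(writer, "model.json")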
some_model.pdf
^^^^^^^^^^^^^^^^^^^^
.. figure:: ../_static/image/Inference_result.png
.. |Poly Coordinate Diagram| image:: ../_static/image/poly.png
@@ -8,5 +8,5 @@ Installation

     install/install
     install/boost_with_cuda
     install/download_model_weight_files
+    install/config
@@ -9,25 +9,7 @@ appropriate guide based on your system:

 - :ref:`ubuntu_22_04_lts_section`
 - :ref:`windows_10_or_11_section`
-- Quick Deployment with Docker
-
-.. admonition:: Important
-    :class: tip
-
-    Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
-
-    Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
-
-    .. code-block:: bash
-
-        docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
-
-.. code:: sh
-
-    wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
-    docker build -t mineru:latest .
-    docker run --rm -it --gpus=all mineru:latest /bin/bash
-    magic-pdf --help

 .. _ubuntu_22_04_lts_section:
...
Config
=========

File **magic-pdf.json** is typically located in the **${HOME}** directory under a Linux system or in the **C:\Users\{username}** directory under a Windows system.

magic-pdf.json
----------------

.. code:: json

    {
        "bucket_info":{
            "bucket-name-1":["ak", "sk", "endpoint"],
            "bucket-name-2":["ak", "sk", "endpoint"]
        },
        "models-dir":"/tmp/models",
        "layoutreader-model-dir":"/tmp/layoutreader",
        "device-mode":"cpu",
        "layout-config": {
            "model": "layoutlmv3"
        },
        "formula-config": {
            "mfd_model": "yolo_v8_mfd",
            "mfr_model": "unimernet_small",
            "enable": true
        },
        "table-config": {
            "model": "rapid_table",
            "enable": false,
            "max_time": 400
        },
        "config_version": "1.0.0"
    }
bucket_info
^^^^^^^^^^^^^^

Stores the access_key, secret_key and endpoint of an AWS-S3-compatible storage config.

Example:

.. code:: text

    {
        "image_bucket":[{access_key}, {secret_key}, {endpoint}],
        "video_bucket":[{access_key}, {secret_key}, {endpoint}]
    }

models-dir
^^^^^^^^^^^^

Stores the models downloaded from **huggingface** or **modelscope**. You do not need to modify this field if you download the models using the scripts shipped with **MinerU**.

layoutreader-model-dir
^^^^^^^^^^^^^^^^^^^^^^^

Stores the models downloaded from **huggingface** or **modelscope**. You do not need to modify this field if you download the models using the scripts shipped with **MinerU**.

device-mode
^^^^^^^^^^^^^^

This field has two options: **cpu** or **cuda**.

**cpu**: inference via cpu

**cuda**: use cuda to accelerate inference

layout-config
^^^^^^^^^^^^^^^

.. code:: json

    {
        "model": "layoutlmv3"
    }

The layout model cannot be disabled at the moment, and only one kind of layout model is currently available.
formula-config
^^^^^^^^^^^^^^^^

.. code:: json

    {
        "mfd_model": "yolo_v8_mfd",
        "mfr_model": "unimernet_small",
        "enable": true
    }

mfd_model
""""""""""

Specifies the formula detection model; options are ['yolo_v8_mfd'].

mfr_model
""""""""""

Specifies the formula recognition model; options are ['unimernet_small'].
Check `UniMERNet <https://github.com/opendatalab/UniMERNet>`_ for more details.

enable
""""""""

On-off flag; options are [true, false]. **true** means formula inference is enabled, **false** means it is disabled.

table-config
^^^^^^^^^^^^^^^^

.. code:: json

    {
        "model": "rapid_table",
        "enable": false,
        "max_time": 400
    }

model
""""""""

Specifies the table inference model; options are ['rapid_table', 'tablemaster', 'struct_eqtable'].

max_time
"""""""""

Since table recognition is a time-consuming process, a timeout period is set. If the process exceeds this time, table recognition is terminated.

enable
"""""""

On-off flag; options are [true, false]. **true** means table inference is enabled, **false** means it is disabled.

config_version
^^^^^^^^^^^^^^^^

The version of the config schema.

.. admonition:: Tip
    :class: tip

    Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest details.
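For illustration, a minimal sketch of reading this config from the home directory described above (this is not the loader MinerU uses internally):

.. code:: python

    import json
    import os

    # magic-pdf.json lives in ${HOME} on Linux, C:\Users\{username} on Windows
    config_path = os.path.join(os.path.expanduser('~'), 'magic-pdf.json')

    with open(config_path, encoding='utf-8') as f:
        config = json.load(f)

    print(config['device-mode'])               # e.g. "cpu" or "cuda"
    print(config['table-config']['enable'])    # e.g. false -> False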
@@ -4,6 +4,7 @@ Install

 If you encounter any installation issues, please first consult the :doc:`../../additional_notes/faq`.
 If the parsing results are not as expected, refer to the :doc:`../../additional_notes/known_issues`.
+You can also try the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ without installation.

 .. admonition:: Warning
     :class: tip
@@ -88,7 +89,7 @@ If the parsing results are not as expected, refer to the :doc:`../../additional_

 Create an environment
-~~~~~~~~~~~~~~~~~~~~~
+---------------------------

 .. code-block:: shell
@@ -98,7 +99,7 @@ Create an environment

 Download model weight files
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------

 .. code-block:: shell
@@ -107,4 +108,32 @@ Download model weight files

     python download_models_hf.py

-The MinerU is installed, Check out :doc:`../quick_start` or reading :doc:`boost_with_cuda` for accelerate inference
\ No newline at end of file
+Install LibreOffice [Optional]
+----------------------------------
+
+This section is required for handling the **doc**, **docx**, **ppt** and **pptx** file types. You can **skip** it if you do not need to process those file types.
+
+Linux/macOS Platform
+""""""""""""""""""""""
+
+.. code::
+
+    apt-get/yum/brew install libreoffice
+
+Windows Platform
+""""""""""""""""""""
+
+.. code::
+
+    install libreoffice
+    append "install_dir\LibreOffice\program" to ENVIRONMENT PATH
+
+.. tip::
+
+    MinerU is now installed. Check out :doc:`../usage/command_line` to convert your first pdf, **or** read the following sections for more details about installation.
Pipe Result
==============

.. admonition:: Tip
    :class: tip

    Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.

The **PipeResult** class is a container for storing pipeline processing results and implements a series of methods related to these results, such as draw_layout and draw_span.
Check out :doc:`../api/pipe_operators` for more details about **PipeResult**.
Structure Definitions
-------------------------------
**some_pdf_middle.json**
+----------------+--------------------------------------------------------------+
| Field Name | Description |
| | |
+================+==============================================================+
| pdf_info | list, each element is a dict representing the parsing result |
| | of each PDF page, see the table below for details |
+----------------+--------------------------------------------------------------+
| \_ | ocr \| txt, used to indicate the mode used in this |
| parse_type | intermediate parsing state |
| | |
+----------------+--------------------------------------------------------------+
| \_version_name | string, indicates the version of magic-pdf used in this |
| | parsing |
| | |
+----------------+--------------------------------------------------------------+
**pdf_info**
Field structure description
+-------------------------+------------------------------------------------------------+
| Field | Description |
| Name | |
+=========================+============================================================+
| preproc_blocks | Intermediate result after PDF preprocessing, not yet |
| | segmented |
+-------------------------+------------------------------------------------------------+
| layout_bboxes | Layout segmentation results, containing layout direction |
| | (vertical, horizontal), and bbox, sorted by reading order |
+-------------------------+------------------------------------------------------------+
| page_idx | Page number, starting from 0 |
| | |
+-------------------------+------------------------------------------------------------+
| page_size | Page width and height |
| | |
+-------------------------+------------------------------------------------------------+
| \_layout_tree | Layout tree structure |
| | |
+-------------------------+------------------------------------------------------------+
| images | list, each element is a dict representing an img_block |
+-------------------------+------------------------------------------------------------+
| tables | list, each element is a dict representing a table_block |
+-------------------------+------------------------------------------------------------+
| interline_equation | list, each element is a dict representing an |
| | interline_equation_block |
| | |
+-------------------------+------------------------------------------------------------+
| discarded_blocks | List, block information returned by the model that needs |
| | to be dropped |
| | |
+-------------------------+------------------------------------------------------------+
| para_blocks | Result after segmenting preproc_blocks |
| | |
+-------------------------+------------------------------------------------------------+
In the above table, ``para_blocks`` is an array of dicts, each dict
representing a block structure. A block can support up to one level of
nesting.
**block**
The outer block is referred to as a first-level block, and the fields in
the first-level block include:
+------------------------+-------------------------------------------------------------+
| Field | Description |
| Name | |
+========================+=============================================================+
| type | Block type (table|image) |
+------------------------+-------------------------------------------------------------+
| bbox | Block bounding box coordinates |
+------------------------+-------------------------------------------------------------+
| blocks | list, each element is a dict representing a second-level |
| | block |
+------------------------+-------------------------------------------------------------+
There are only two types of first-level blocks: “table” and “image”. All
other blocks are second-level blocks.
The fields in a second-level block include:
+----------------------+----------------------------------------------------------------+
| Field | Description |
| Name | |
+======================+================================================================+
| | Block type |
| type | |
+----------------------+----------------------------------------------------------------+
| | Block bounding box coordinates |
| bbox | |
+----------------------+----------------------------------------------------------------+
| | list, each element is a dict representing a line, used to |
| lines | describe the composition of a line of information |
+----------------------+----------------------------------------------------------------+
Detailed explanation of second-level block types
================== ======================
type Description
================== ======================
image_body Main body of the image
image_caption Image description text
table_body Main body of the table
table_caption Table description text
table_footnote Table footnote
text Text block
title Title block
interline_equation Block formula
================== ======================
**line**
The field format of a line is as follows:
+---------------------+----------------------------------------------------------------+
| Field | Description |
| Name | |
+=====================+================================================================+
| | Bounding box coordinates of the line |
| bbox | |
+---------------------+----------------------------------------------------------------+
| spans | list, each element is a dict representing a span, used to |
| | describe the composition of the smallest unit |
+---------------------+----------------------------------------------------------------+
**span**
+---------------------+-----------------------------------------------------------+
| Field | Description |
| Name | |
+=====================+===========================================================+
| bbox | Bounding box coordinates of the span |
+---------------------+-----------------------------------------------------------+
| type | Type of the span |
+---------------------+-----------------------------------------------------------+
| content | Text spans use content, chart spans use img_path to store |
| \| | the actual text or screenshot path information |
| img_path | |
+---------------------+-----------------------------------------------------------+
The types of spans are as follows:
================== ==============
type Description
================== ==============
image Image
table Table
text Text
inline_equation Inline formula
interline_equation Block formula
================== ==============
**Summary**
A span is the smallest storage unit for all elements.
The elements stored within para_blocks are block information.
The block structure is as follows:
First-level block (if any) -> Second-level block -> Line -> Span
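To make the nesting concrete, a minimal sketch that walks ``para_blocks`` down to the span level and collects text content (the input path is a placeholder, assumed to hold a parsed **some_pdf_middle.json**):

.. code:: python

    import json

    with open('some_pdf_middle.json', encoding='utf-8') as f:    # placeholder path
        middle_json = json.load(f)

    for page in middle_json['pdf_info']:
        for block in page['para_blocks']:
            # first-level blocks (table | image) nest a further 'blocks' list
            sub_blocks = block['blocks'] if block['type'] in ('table', 'image') else [block]
            for sub_block in sub_blocks:
                for line in sub_block.get('lines', []):
                    for span in line['spans']:
                        if span['type'] == 'text':
                            print(span['content'])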
.. _example-1:
example
^^^^^^^
.. code:: json
    {
        "pdf_info": [
            {
                "preproc_blocks": [
                    {
                        "type": "text",
                        "bbox": [
                            52,
                            61.956024169921875,
                            294,
                            82.99800872802734
                        ],
                        "lines": [
                            {
                                "bbox": [
                                    52,
                                    61.956024169921875,
                                    294,
                                    72.0000228881836
                                ],
                                "spans": [
                                    {
                                        "bbox": [
                                            54.0,
                                            61.956024169921875,
                                            296.2261657714844,
                                            72.0000228881836
                                        ],
                                        "content": "dependent on the service headway and the reliability of the departure ",
                                        "type": "text",
                                        "score": 1.0
                                    }
                                ]
                            }
                        ]
                    }
                ],
                "layout_bboxes": [
                    {
                        "layout_bbox": [
                            52,
                            61,
                            294,
                            731
                        ],
                        "layout_label": "V",
                        "sub_layout": []
                    }
                ],
                "page_idx": 0,
                "page_size": [
                    612.0,
                    792.0
                ],
                "_layout_tree": [],
                "images": [],
                "tables": [],
                "interline_equations": [],
                "discarded_blocks": [],
                "para_blocks": [
                    {
                        "type": "text",
                        "bbox": [
                            52,
                            61.956024169921875,
                            294,
                            82.99800872802734
                        ],
                        "lines": [
                            {
                                "bbox": [
                                    52,
                                    61.956024169921875,
                                    294,
                                    72.0000228881836
                                ],
                                "spans": [
                                    {
                                        "bbox": [
                                            54.0,
                                            61.956024169921875,
                                            296.2261657714844,
                                            72.0000228881836
                                        ],
                                        "content": "dependent on the service headway and the reliability of the departure ",
                                        "type": "text",
                                        "score": 1.0
                                    }
                                ]
                            }
                        ]
                    }
                ]
            }
        ],
        "_parse_type": "txt",
        "_version_name": "0.6.1"
    }
Pipeline Result
------------------
.. code:: python
    from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
    from magic_pdf.pipe.operators import PipeResult
    from magic_pdf.data.dataset import Dataset

    res = pdf_parse_union(*args, **kwargs)
    res['_parse_type'] = PARSE_TYPE_OCR
    res['_version_name'] = __version__
    if 'lang' in kwargs and kwargs['lang'] is not None:
        res['lang'] = kwargs['lang']

    dataset: Dataset = some_dataset    # not a real dataset
    pipeResult = PipeResult(res, dataset)
some_pdf_layout.pdf
~~~~~~~~~~~~~~~~~~~
Each page layout consists of one or more boxes. The number at the top
left of each box indicates its sequence number. Additionally, in
``layout.pdf``, different content blocks are highlighted with different
background colors.
.. figure:: ../_static/image/layout_example.png
:alt: layout example
layout example
some_pdf_spans.pdf
~~~~~~~~~~~~~~~~~~
All spans on the page are drawn with different colored line frames
according to the span type. This file can be used for quality control,
allowing for quick identification of issues such as missing text or
unrecognized inline formulas.
.. figure:: ../_static/image/spans_example.png
:alt: spans example
spans example
\ No newline at end of file
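As a usage sketch of the drawing helpers named above (output paths are placeholders; the single-path signatures follow the MinerU demo scripts, so treat them as an assumption):

.. code:: python

    import os

    output_dir = "output"    # placeholder directory
    os.makedirs(output_dir, exist_ok=True)

    # draw layout boxes with reading order onto the pages -> some_pdf_layout.pdf
    pipeResult.draw_layout(os.path.join(output_dir, "some_pdf_layout.pdf"))

    # draw every span with a type-colored frame -> some_pdf_spans.pdf
    pipeResult.draw_span(os.path.join(output_dir, "some_pdf_spans.pdf"))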
@@ -2,12 +2,14 @@

 Quick Start
 ==============

-Eager to get started? This page gives a good introduction to MinerU. Follow Installation to set up a project and install MinerU first.
+Want to learn how to use MinerU in different scenarios? This page gives examples of multiple usage cases to match your needs.

 .. toctree::
     :maxdepth: 1

-    quick_start/command_line
-    quick_start/to_markdown
+    quick_start/convert_pdf
+    quick_start/convert_image
+    quick_start/convert_ppt
+    quick_start/convert_pptx
+    quick_start/convert_doc
+    quick_start/convert_docx