Unverified Commit bdacf291 authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #1257 from icecraft/docs/refactor_en_docs

Docs/refactor en docs
parents 2df3e901 302a6950
......@@ -55,7 +55,7 @@ class FileBasedDataWriter(DataWriter):
if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
fn_path = os.path.join(self._parent_dir, path)
if not os.path.exists(os.path.dirname(fn_path)):
if not os.path.exists(os.path.dirname(fn_path)) and os.path.dirname(fn_path) != "":
os.makedirs(os.path.dirname(fn_path), exist_ok=True)
with open(fn_path, 'wb') as f:
......
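The new guard matters when a bare filename is written through a writer created with an empty parent directory: os.path.dirname then returns '', and os.makedirs('') raises FileNotFoundError. A minimal sketch of the failing case (hypothetical path):

import os

fn_path = 'output.md'                  # bare relative filename, no parent directory
assert os.path.dirname(fn_path) == ''  # dirname of a bare name is the empty string
# os.makedirs('', exist_ok=True)       # would raise FileNotFoundError, hence the added check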
import json
import os
import tempfile
import shutil
from pathlib import Path
from magic_pdf.config.exceptions import EmptyData, InvalidParams
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
MultiBucketS3DataReader)
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
def read_jsonl(
s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
......@@ -58,23 +60,68 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
list[PymuDocDataset]: each pdf file will be converted to a PymuDocDataset
"""
if os.path.isdir(path):
reader = FileBasedDataReader(path)
return [
PymuDocDataset(reader.read(doc_path.name))
for doc_path in Path(path).glob('*.pdf')
]
reader = FileBasedDataReader()
ret = []
for root, _, files in os.walk(path):
for file in files:
suffix = file.split('.')
if suffix[-1] == 'pdf':
ret.append(PymuDocDataset(reader.read(os.path.join(root, file))))
return ret
else:
reader = FileBasedDataReader()
bits = reader.read(path)
return [PymuDocDataset(bits)]
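Switching from Path.glob('*.pdf') to os.walk means PDFs in nested sub-directories are now picked up as well; a usage sketch, assuming a hypothetical layout:

# docs/a.pdf
# docs/reports/b.pdf   <- found only by the os.walk version
datasets = read_local_pdfs('docs/')
assert len(datasets) == 2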
def read_local_office(path: str) -> list[PymuDocDataset]:
"""Read ms-office file (ppt, pptx, doc, docx) from path or directory.
Args:
path (str): ms-office file or directory that contains ms-office files
Returns:
list[PymuDocDataset]: each ms-office file will be converted to a PymuDocDataset
Raises:
ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
FileNotFoundError: File not found
Exception: Unknown Exception raised
"""
suffixes = ['.ppt', '.pptx', '.doc', '.docx']
fns = []
ret = []
if os.path.isdir(path):
for root, _, files in os.walk(path):
for file in files:
suffix = Path(file).suffix
if suffix in suffixes:
fns.append((os.path.join(root, file)))
else:
fns.append(path)
reader = FileBasedDataReader()
temp_dir = tempfile.mkdtemp()
for fn in fns:
try:
convert_file_to_pdf(fn, temp_dir)
except ConvertToPdfError as e:
raise e
except FileNotFoundError as e:
raise e
except Exception as e:
raise e
fn_path = Path(fn)
pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
ret.append(PymuDocDataset(reader.read(pdf_fn)))
shutil.rmtree(temp_dir)
return ret
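A usage sketch for the new helper; conversion shells out to LibreOffice, so the soffice binary must be on PATH (paths hypothetical):

datasets = read_local_office('office_files/')  # every ppt/pptx/doc/docx underneath
single = read_local_office('slides.pptx')      # a single office file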
def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
def read_local_images(path: str, suffixes: list[str] = ['.png', '.jpg']) -> list[ImageDataset]:
"""Read images from path or directory.
Args:
path (str): image file path or directory that contains image files
suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
Returns:
list[ImageDataset]: each image file will be converted to an ImageDataset
......@@ -82,12 +129,12 @@ def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
if os.path.isdir(path):
imgs_bits = []
s_suffixes = set(suffixes)
reader = FileBasedDataReader(path)
reader = FileBasedDataReader()
for root, _, files in os.walk(path):
for file in files:
suffix = file.split('.')
if suffix[-1] in s_suffixes:
imgs_bits.append(reader.read(file))
suffix = Path(file).suffix
if suffix in s_suffixes:
imgs_bits.append(reader.read(os.path.join(root, file)))
return [ImageDataset(bits) for bits in imgs_bits]
else:
reader = FileBasedDataReader()
......
......@@ -65,31 +65,6 @@ class InferenceResultBase(ABC):
"""
pass
@abstractmethod
def pipe_auto_mode(
self,
imageWriter: DataWriter,
start_page_id=0,
end_page_id=None,
debug_mode=False,
lang=None,
) -> PipeResult:
"""Post-proc the model inference result.
step1: classify the dataset type
step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
Args:
imageWriter (DataWriter): the image writer handle
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
lang (str, optional): Defaults to None.
Returns:
PipeResult: the result
"""
pass
@abstractmethod
def pipe_txt_mode(
self,
......
......@@ -71,40 +71,6 @@ class InferenceResult(InferenceResultBase):
"""
return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
def pipe_auto_mode(
self,
imageWriter: DataWriter,
start_page_id=0,
end_page_id=None,
debug_mode=False,
lang=None,
) -> PipeResult:
"""Post-proc the model inference result.
step1: classify the dataset type
step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
Args:
imageWriter (DataWriter): the image writer handle
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
lang (str, optional): Defaults to None.
Returns:
PipeResult: the result
"""
pdf_proc_method = classify(self._dataset.data_bits())
if pdf_proc_method == SupportedPdfParseMethod.TXT:
return self.pipe_txt_mode(
imageWriter, start_page_id, end_page_id, debug_mode, lang
)
else:
return self.pipe_ocr_mode(
imageWriter, start_page_id, end_page_id, debug_mode, lang
)
def pipe_txt_mode(
self,
imageWriter: DataWriter,
......
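With pipe_auto_mode removed from both the base class and the implementation, callers now classify the dataset themselves, as the do_parse change further below shows. A minimal sketch of the replacement pattern, assuming ds is a PymuDocDataset, infer_result an InferenceResult, and the enum import path as in the repo:

from magic_pdf.config.enums import SupportedPdfParseMethod

if ds.classify() == SupportedPdfParseMethod.TXT:
    pipe_result = infer_result.pipe_txt_mode(image_writer, lang=lang)
else:
    pipe_result = infer_result.pipe_ocr_mode(image_writer, lang=lang)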
import os
from pathlib import Path
import shutil
import tempfile
import click
import fitz
from loguru import logger
from pathlib import Path
import magic_pdf.model as model_config
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import do_parse, parse_pdf_methods
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
pdf_suffixes = ['.pdf']
ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
image_suffixes = ['.png', '.jpg']
@click.command()
......@@ -21,7 +28,7 @@ from magic_pdf.tools.common import do_parse, parse_pdf_methods
'path',
type=click.Path(exists=True),
required=True,
help='local pdf filepath or directory',
help='local filepath or directory. Supports PDF, PPT, PPTX, DOC, DOCX, PNG, and JPG files',
)
@click.option(
'-o',
......@@ -83,12 +90,27 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
model_config.__use_inside_model__ = True
model_config.__model_mode__ = 'full'
os.makedirs(output_dir, exist_ok=True)
temp_dir = tempfile.mkdtemp()
def read_fn(path: Path):
if path.suffix in ms_office_suffixes:
convert_file_to_pdf(str(path), temp_dir)
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
elif path.suffix in image_suffixes:
with open(str(path), 'rb') as f:
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
with open(fn, 'wb') as f:
f.write(pdf_bytes)
elif path.suffix in pdf_suffixes:
fn = str(path)
else:
raise Exception(f"Unknown file suffix: {path.suffix}")
def read_fn(path):
disk_rw = FileBasedDataReader(os.path.dirname(path))
return disk_rw.read(os.path.basename(path))
disk_rw = FileBasedDataReader(os.path.dirname(fn))
return disk_rw.read(os.path.basename(fn))
def parse_doc(doc_path: str):
def parse_doc(doc_path: Path):
try:
file_name = str(Path(doc_path).stem)
pdf_data = read_fn(doc_path)
......@@ -108,10 +130,13 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
logger.exception(e)
if os.path.isdir(path):
for doc_path in Path(path).glob('*.pdf'):
for doc_path in Path(path).glob('*'):
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
parse_doc(doc_path)
else:
parse_doc(path)
parse_doc(Path(path))
shutil.rmtree(temp_dir)
if __name__ == '__main__':
......
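For reference, the image branch of read_fn above relies on PyMuPDF wrapping raw image bytes into a one-page PDF; the same conversion as a standalone sketch (file names hypothetical):

import fitz  # PyMuPDF

with open('page.png', 'rb') as f:  # hypothetical image file
    bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()  # image bytes -> one-page PDF bytes
with open('page.pdf', 'wb') as f:
    f.write(pdf_bytes)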
......@@ -170,6 +170,7 @@ def do_parse(
logger.error('need model list input')
exit(2)
else:
infer_result = InferenceResult(model_list, ds)
if parse_method == 'ocr':
pipe_result = infer_result.pipe_ocr_mode(
......@@ -180,10 +181,16 @@ def do_parse(
image_writer, debug_mode=True, lang=lang
)
else:
pipe_result = infer_result.pipe_auto_mode(
if ds.classify() == SupportedPdfParseMethod.TXT:
pipe_result = infer_result.pipe_txt_mode(
image_writer, debug_mode=True, lang=lang
)
else:
pipe_result = infer_result.pipe_ocr_mode(
image_writer, debug_mode=True, lang=lang
)
if f_draw_model_bbox:
infer_result.draw_model(
os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf')
......
import os
import subprocess
from pathlib import Path
class ConvertToPdfError(Exception):
def __init__(self, msg):
self.msg = msg
super().__init__(self.msg)
def convert_file_to_pdf(input_path, output_dir):
if not os.path.isfile(input_path):
raise FileNotFoundError(f"The input file {input_path} does not exist.")
os.makedirs(output_dir, exist_ok=True)
cmd = [
'soffice',
'--headless',
'--convert-to', 'pdf',
'--outdir', str(output_dir),
str(input_path)
]
process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if process.returncode != 0:
raise ConvertToPdfError(process.stderr.decode())
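A usage sketch for convert_file_to_pdf; it requires LibreOffice's soffice binary on PATH, and the file names are hypothetical:

try:
    convert_file_to_pdf('report.docx', '/tmp/pdf_out')  # writes /tmp/pdf_out/report.pdf
except ConvertToPdfError as e:
    print(f'conversion failed: {e.msg}')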
......@@ -4,8 +4,11 @@ Glossary
===========
1. jsonl
TODO: add description
Newline-delimited (\n): each line must be a valid, independent JSON object.
Currently, all functions shipped with **MinerU** assume that each JSON object contains a field named either **path** or **file_location**
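For illustration, a minimal two-line jsonl file satisfying these assumptions could be produced like this (paths are hypothetical):

.. code:: python

    lines = [
        '{"path": "s3://bucket/files/doc1.pdf"}',
        '{"file_location": "/tmp/doc2.pdf"}',
    ]
    with open('example.jsonl', 'w') as f:  # one JSON object per line
        f.write('\n'.join(lines) + '\n')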
2. magic-pdf.json
TODO: add description
TODO
......@@ -70,6 +70,12 @@ Key Features
- Supports both CPU and GPU environments.
- Compatible with Windows, Linux, and Mac platforms.
.. tip::
Get started with MinerU by trying the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ or :doc:`installing it locally <user_guide/install/install>`.
User Guide
-------------
.. toctree::
......
......@@ -4,7 +4,9 @@
:maxdepth: 2
user_guide/install
user_guide/usage
user_guide/quick_start
user_guide/tutorial
user_guide/data
user_guide/inference_result
user_guide/pipe_result
......@@ -87,7 +87,10 @@ Read Examples
.. code:: python
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import FileBasedDataReader, MultiBucketS3DataReader, S3DataReader
from magic_pdf.data.schemas import S3Config
# file based related
file_based_reader1 = FileBasedDataReader('')
......@@ -100,43 +103,54 @@ Read Examples
## will read /tmp/abc
file_based_reader2.read('abc')
## will read /var/logs/message.txt
file_based_reader2.read('/var/logs/message.txt')
## will read /tmp/logs/message.txt
file_based_reader2.read('/tmp/logs/message.txt')
# multi bucket s3 related
multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
bucket = "bucket" # replace with real bucket
ak = "ak" # replace with real access key
sk = "sk" # replace with real secret key
endpoint_url = "endpoint_url" # replace with real endpoint_url
bucket_2 = "bucket_2" # replace with real bucket
ak_2 = "ak_2" # replace with real access key
sk_2 = "sk_2" # replace with real secret key
endpoint_url_2 = "endpoint_url_2" # replace with real endpoint_url
test_prefix = 'test/unittest'
multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
## will read s3://test_bucket1/test_prefix/abc
## will read s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_reader1.read('abc')
## will read s3://test_bucket1/efg
multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
## will read s3://{bucket}/{test_prefix}/efg
multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')
## will read s3://test_bucket2/abc
multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
## will read s3://{bucket_2}/{test_prefix}/abc
multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')
# s3 related
s3_reader1 = S3DataReader(
default_prefix_without_bucket = "test_prefix"
bucket: "test_bucket",
ak: "ak",
sk: "sk",
endpoint_url: "localhost"
test_prefix,
bucket,
ak,
sk,
endpoint_url
)
## will read s3://test_bucket/test_prefix/abc
## will read s3://{bucket}/{test_prefix}/abc
s3_reader1.read('abc')
## will read s3://test_bucket/efg
s3_reader1.read('s3://test_bucket/efg')
## will read s3://{bucket}/efg
s3_reader1.read(f's3://{bucket}/efg')
Write Examples
......@@ -144,65 +158,79 @@ Write Examples
.. code:: python
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, MultiBucketS3DataWriter, S3DataWriter
from magic_pdf.data.schemas import S3Config
# file based related
file_based_writer1 = FileBasedDataWriter('')
file_based_writer1 = FileBasedDataWriter("")
## will write 123 to abc
file_based_writer1.write('abc', '123'.encode())
file_based_writer1.write("abc", "123".encode())
## will write 123 to abc
file_based_writer1.write_string('abc', '123')
file_based_writer1.write_string("abc", "123")
file_based_writer2 = FileBasedDataWriter('/tmp')
file_based_writer2 = FileBasedDataWriter("/tmp")
## will write 123 to /tmp/abc
file_based_writer2.write_string('abc', '123')
file_based_writer2.write_string("abc", "123")
## will write 123 to /var/logs/message.txt
file_based_writer2.write_string('/var/logs/message.txt', '123')
## will write 123 to /tmp/logs/message.txt
file_based_writer2.write_string("/tmp/logs/message.txt", "123")
# multi bucket s3 related
multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
bucket = "bucket" # replace with real bucket
ak = "ak" # replace with real access key
sk = "sk" # replace with real secret key
endpoint_url = "endpoint_url" # replace with real endpoint_url
bucket_2 = "bucket_2" # replace with real bucket
ak_2 = "ak_2" # replace with real access key
sk_2 = "sk_2" # replace with real secret key
endpoint_url_2 = "endpoint_url_2" # replace with real endpoint_url
test_prefix = "test/unittest"
multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
f"{bucket}/{test_prefix}",
[
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
),
],
)
## will write 123 to s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write_string('abc', '123')
## will write 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write_string("abc", "123")
## will write 123 to s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write('abc', '123'.encode())
## will write 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write("abc", "123".encode())
## will write 123 to s3://test_bucket1/efg
multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
## will write 123 to s3://{bucket}/{test_prefix}/efg
multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())
## will write 123 to s3://test_bucket2/abc
multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
## will write 123 to s3://{bucket_2}/{test_prefix}/abc
multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())
# s3 related
s3_writer1 = S3DataWriter(
default_prefix_without_bucket = "test_prefix"
bucket: "test_bucket",
ak: "ak",
sk: "sk",
endpoint_url: "localhost"
)
s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
## will write 123 to s3://{bucket}/{test_prefix}/abc
s3_writer1.write("abc", "123".encode())
## will write 123 to s3://test_bucket/test_prefix/abc
s3_writer1.write('abc', '123'.encode())
## will write 123 to s3://{bucket}/{test_prefix}/abc
s3_writer1.write_string("abc", "123")
## will write 123 to s3://test_bucket/test_prefix/abc
s3_writer1.write_string('abc', '123')
## will write 123 to s3://{bucket}/efg
s3_writer1.write(f"s3://{bucket}/efg", "123".encode())
## will write 123 to s3://test_bucket/efg
s3_writer1.write('s3://test_bucket/efg', '123'.encode())
Check :doc:`../../api/data_reader_writer` for more details
......@@ -18,24 +18,50 @@ Read the content from jsonl, which may be located on the local machine or on remote s3; to read from s3, pass an s3 reader.
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config
# read jsonl from local machine
datasets = read_jsonl("tt.jsonl", None)
datasets = read_jsonl("tt.jsonl", None) # replace with real jsonl file
# read jsonl from remote s3
datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
bucket = "bucket_1" # replace with real s3 bucket
ak = "access_key_1" # replace with real s3 access key
sk = "secret_key_1" # replace with real s3 secret key
endpoint_url = "endpoint_url_1" # replace with real s3 endpoint url
bucket_2 = "bucket_2" # replace with real s3 bucket
ak_2 = "access_key_2" # replace with real s3 access key
sk_2 = "secret_key_2" # replace with real s3 secret key
endpoint_url_2 = "endpoint_url_2" # replace with real s3 endpoint url
s3configs = [
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
]
s3_reader = MultiBucketS3DataReader(bucket, s3configs)
datasets = read_jsonl(f"s3://{bucket}/tt.jsonl", s3_reader) # replace with real s3 jsonl file
read_local_pdfs
^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^
Read pdf from path or directory.
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
# read pdf path
datasets = read_local_pdfs("tt.pdf")
......@@ -51,13 +77,13 @@ Read images from path or directory
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
# read from image path
datasets = read_local_images("tt.png")
datasets = read_local_images("tt.png") # replace with real file path
# read files from directory that endswith suffix in suffixes array
datasets = read_local_images("images/", suffixes=["png", "jpg"])
datasets = read_local_images("images/", suffixes=["png", "jpg"]) # replace with real directory
Check :doc:`../../api/read_api` for more details
\ No newline at end of file
Inference Result
==================
.. admonition:: Tip
:class: tip
Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
The **InferenceResult** class is a container for storing model inference results and implements a series of methods related to these results, such as draw_model, dump_model.
Check out :doc:`../api/model_operators` for more details about **InferenceResult**
Model Inference Result
-----------------------
Structure Definition
^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
from pydantic import BaseModel, Field
from enum import IntEnum
class CategoryType(IntEnum):
title = 0 # Title
plain_text = 1 # Text
abandon = 2 # Includes headers, footers, page numbers, and page annotations
figure = 3 # Image
figure_caption = 4 # Image description
table = 5 # Table
table_caption = 6 # Table description
table_footnote = 7 # Table footnote
isolate_formula = 8 # Block formula
formula_caption = 9 # Formula label
embedding = 13 # Inline formula
isolated = 14 # Block formula
text = 15 # OCR recognition result
class PageInfo(BaseModel):
page_no: int = Field(description="Page number, the first page is 0", ge=0)
height: int = Field(description="Page height", gt=0)
width: int = Field(description="Page width", ge=0)
class ObjectInferenceResult(BaseModel):
category_id: CategoryType = Field(description="Category", ge=0)
poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
score: float = Field(description="Confidence of the inference result")
latex: str | None = Field(description="LaTeX parsing result", default=None)
html: str | None = Field(description="HTML parsing result", default=None)
class PageInferenceResults(BaseModel):
layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results")
page_info: PageInfo = Field(description="Page metadata")
Example
^^^^^^^^^^^
.. code:: json
[
{
"layout_dets": [
{
"category_id": 2,
"poly": [
99.1906967163086,
100.3119125366211,
730.3707885742188,
100.3119125366211,
730.3707885742188,
245.81326293945312,
99.1906967163086,
245.81326293945312
],
"score": 0.9999997615814209
}
],
"page_info": {
"page_no": 0,
"height": 2339,
"width": 1654
}
},
{
"layout_dets": [
{
"category_id": 5,
"poly": [
99.13092803955078,
2210.680419921875,
497.3183898925781,
2210.680419921875,
497.3183898925781,
2264.78076171875,
99.13092803955078,
2264.78076171875
],
"score": 0.9999997019767761
}
],
"page_info": {
"page_no": 1,
"height": 2339,
"width": 1654
}
}
]
The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
representing the coordinates of the top-left, top-right, bottom-right,
and bottom-left points respectively. |Poly Coordinate Diagram|
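The four points describe an axis-aligned rectangle in practice, so a plain bbox can be recovered with a small helper; a sketch (not part of the MinerU API):

.. code:: python

    def poly_to_bbox(poly: list[float]) -> list[float]:
        """Collapse [x0, y0, x1, y1, x2, y2, x3, y3] into [xmin, ymin, xmax, ymax]."""
        xs, ys = poly[0::2], poly[1::2]
        return [min(xs), min(ys), max(xs), max(ys)]

    # using the first layout_det from the example above:
    poly_to_bbox([99.19, 100.31, 730.37, 100.31, 730.37, 245.81, 99.19, 245.81])
    # -> [99.19, 100.31, 730.37, 245.81]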
Inference Result
-------------------------
.. code:: python
from magic_pdf.model.operators import InferenceResult
from magic_pdf.data.dataset import Dataset
dataset: Dataset = some_data_set # not real dataset
# The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
model_inference_result: list[PageInferenceResults] = []
inference_result = InferenceResult(model_inference_result, dataset)
some_model.pdf
^^^^^^^^^^^^^^^^^^^^
.. figure:: ../_static/image/Inference_result.png
.. |Poly Coordinate Diagram| image:: ../_static/image/poly.png
......@@ -8,5 +8,5 @@ Installation
install/install
install/boost_with_cuda
install/download_model_weight_files
install/config
......@@ -9,25 +9,7 @@ appropriate guide based on your system:
- :ref:`ubuntu_22_04_lts_section`
- :ref:`windows_10_or_11_section`
- Quick Deployment with Docker
.. admonition:: Important
:class: tip
Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
.. code-block:: bash
docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
.. code:: sh
wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
docker build -t mineru:latest .
docker run --rm -it --gpus=all mineru:latest /bin/bash
magic-pdf --help
.. _ubuntu_22_04_lts_section:
......
Config
=========
The **magic-pdf.json** file is typically located in the **${HOME}** directory on Linux, or in the **C:\Users\{username}** directory on Windows.
magic-pdf.json
----------------
.. code:: json
{
"bucket_info":{
"bucket-name-1":["ak", "sk", "endpoint"],
"bucket-name-2":["ak", "sk", "endpoint"]
},
"models-dir":"/tmp/models",
"layoutreader-model-dir":"/tmp/layoutreader",
"device-mode":"cpu",
"layout-config": {
"model": "layoutlmv3"
},
"formula-config": {
"mfd_model": "yolo_v8_mfd",
"mfr_model": "unimernet_small",
"enable": true
},
"table-config": {
"model": "rapid_table",
"enable": false,
"max_time": 400
},
"config_version": "1.0.0"
}
bucket_info
^^^^^^^^^^^^^^
Stores the access_key, secret_key, and endpoint of an AWS S3-compatible storage config
Example:
.. code:: text
{
"image_bucket":[{access_key}, {secret_key}, {endpoint}],
"video_bucket":[{access_key}, {secret_key}, {endpoint}]
}
models-dir
^^^^^^^^^^^^
Stores the models downloaded from **huggingface** or **modelscope**. You do not need to modify this field if you downloaded the models using the scripts shipped with **MinerU**
layoutreader-model-dir
^^^^^^^^^^^^^^^^^^^^^^^
Stores the models downloaded from **huggingface** or **modelscope**. You do not need to modify this field if you downloaded the models using the scripts shipped with **MinerU**
device-mode
^^^^^^^^^^^^^^
This field has two options: **cpu** or **cuda**.
**cpu**: run inference on the CPU
**cuda**: use CUDA to accelerate inference
layout-config
^^^^^^^^^^^^^^^
.. code:: json
{
"model": "layoutlmv3"
}
The layout model cannot be disabled at present, and only one kind of layout model is currently available.
formula-config
^^^^^^^^^^^^^^^^
.. code:: json
{
"mfd_model": "yolo_v8_mfd",
"mfr_model": "unimernet_small",
"enable": true
}
mfd_model
""""""""""
Specify the formula detection model, options are ['yolo_v8_mfd']
mfr_model
""""""""""
Specify the formula recognition model, options are ['unimernet_small']
Check `UniMERNet <https://github.com/opendatalab/UniMERNet>`_ for more details
enable
""""""""
On/off flag; options are [true, false]. **true** enables formula inference, **false** disables it
table-config
^^^^^^^^^^^^^^^^
.. code:: json
{
"model": "rapid_table",
"enable": false,
"max_time": 400
}
model
""""""""
Specify the table inference model, options are ['rapid_table', 'tablemaster', 'struct_eqtable']
max_time
"""""""""
Since table recognition is a time-consuming process, we set a timeout period. If the process exceeds this time, the table recognition will be terminated.
enable
"""""""
On/off flag; options are [true, false]. **true** enables table inference, **false** disables it
config_version
^^^^^^^^^^^^^^^^
The version of the config schema.
.. admonition:: Tip
:class: tip
Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest details
......@@ -4,6 +4,7 @@ Install
If you encounter any installation issues, please first consult the :doc:`../../additional_notes/faq`.
If the parsing results are not as expected, refer to the :doc:`../../additional_notes/known_issues`.
You can also try the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ without installation.
.. admonition:: Warning
:class: tip
......@@ -88,7 +89,7 @@ If the parsing results are not as expected, refer to the :doc:`../../additional_
Create an environment
~~~~~~~~~~~~~~~~~~~~~
---------------------------
.. code-block:: shell
......@@ -98,7 +99,7 @@ Create an environment
Download model weight files
~~~~~~~~~~~~~~~~~~~~~~~~~~
------------------------------
.. code-block:: shell
......@@ -107,4 +108,32 @@ Download model weight files
python download_models_hf.py
MinerU is now installed. Check out :doc:`../quick_start` or read :doc:`boost_with_cuda` to accelerate inference
\ No newline at end of file
Install LibreOffice [Optional]
----------------------------------
This section is required for handling the **doc**, **docx**, **ppt**, and **pptx** file types. You can **skip** it if you do not need to process those file types.
Linux/macOS Platform
""""""""""""""""""""""
.. code::
apt-get/yum/brew install libreoffice
Windows Platform
""""""""""""""""""""
.. code::
Install LibreOffice
Append "install_dir\LibreOffice\program" to the PATH environment variable
.. tip::
MinerU is now installed. Check out :doc:`../usage/command_line` to convert your first PDF, **or** read the following sections for more installation details
Pipe Result
==============
.. admonition:: Tip
:class: tip
Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
The **PipeResult** class is a container for storing pipeline processing results and implements a series of methods related to these results, such as draw_layout, draw_span.
Check out :doc:`../api/pipe_operators` for more details about **PipeResult**
Structure Definitions
-------------------------------
**some_pdf_middle.json**
+----------------+--------------------------------------------------------------+
| Field Name | Description |
| | |
+================+==============================================================+
| pdf_info | list, each element is a dict representing the parsing result |
| | of each PDF page, see the table below for details |
+----------------+--------------------------------------------------------------+
| \_ | ocr \| txt, used to indicate the mode used in this |
| parse_type | intermediate parsing state |
| | |
+----------------+--------------------------------------------------------------+
| \_version_name | string, indicates the version of magic-pdf used in this |
| | parsing |
| | |
+----------------+--------------------------------------------------------------+
**pdf_info**
Field structure description
+-------------------------+------------------------------------------------------------+
| Field | Description |
| Name | |
+=========================+============================================================+
| preproc_blocks | Intermediate result after PDF preprocessing, not yet |
| | segmented |
+-------------------------+------------------------------------------------------------+
| layout_bboxes | Layout segmentation results, containing layout direction |
| | (vertical, horizontal), and bbox, sorted by reading order |
+-------------------------+------------------------------------------------------------+
| page_idx | Page number, starting from 0 |
| | |
+-------------------------+------------------------------------------------------------+
| page_size | Page width and height |
| | |
+-------------------------+------------------------------------------------------------+
| \_layout_tree | Layout tree structure |
| | |
+-------------------------+------------------------------------------------------------+
| images | list, each element is a dict representing an img_block |
+-------------------------+------------------------------------------------------------+
| tables | list, each element is a dict representing a table_block |
+-------------------------+------------------------------------------------------------+
| interline_equation | list, each element is a dict representing an |
| | interline_equation_block |
| | |
+-------------------------+------------------------------------------------------------+
| discarded_blocks | List, block information returned by the model that needs |
| | to be dropped |
| | |
+-------------------------+------------------------------------------------------------+
| para_blocks | Result after segmenting preproc_blocks |
| | |
+-------------------------+------------------------------------------------------------+
In the above table, ``para_blocks`` is an array of dicts, each dict
representing a block structure. A block can support up to one level of
nesting.
**block**
The outer block is referred to as a first-level block, and the fields in
the first-level block include:
+------------------------+-------------------------------------------------------------+
| Field | Description |
| Name | |
+========================+=============================================================+
| type | Block type (table|image) |
+------------------------+-------------------------------------------------------------+
| bbox | Block bounding box coordinates |
+------------------------+-------------------------------------------------------------+
| blocks | list, each element is a dict representing a second-level |
| | block |
+------------------------+-------------------------------------------------------------+
There are only two types of first-level blocks: “table” and “image”. All
other blocks are second-level blocks.
The fields in a second-level block include:
+----------------------+----------------------------------------------------------------+
| Field | Description |
| Name | |
+======================+================================================================+
| | Block type |
| type | |
+----------------------+----------------------------------------------------------------+
| | Block bounding box coordinates |
| bbox | |
+----------------------+----------------------------------------------------------------+
| | list, each element is a dict representing a line, used to |
| lines | describe the composition of a line of information |
+----------------------+----------------------------------------------------------------+
Detailed explanation of second-level block types
================== ======================
type Description
================== ======================
image_body Main body of the image
image_caption Image description text
table_body Main body of the table
table_caption Table description text
table_footnote Table footnote
text Text block
title Title block
interline_equation Block formula
================== ======================
**line**
The field format of a line is as follows:
+---------------------+----------------------------------------------------------------+
| Field | Description |
| Name | |
+=====================+================================================================+
| | Bounding box coordinates of the line |
| bbox | |
+---------------------+----------------------------------------------------------------+
| spans | list, each element is a dict representing a span, used to |
| | describe the composition of the smallest unit |
+---------------------+----------------------------------------------------------------+
**span**
+---------------------+-----------------------------------------------------------+
| Field | Description |
| Name | |
+=====================+===========================================================+
| bbox | Bounding box coordinates of the span |
+---------------------+-----------------------------------------------------------+
| type | Type of the span |
+---------------------+-----------------------------------------------------------+
| content | Text spans use content, chart spans use img_path to store |
| \| | the actual text or screenshot path information |
| img_path | |
+---------------------+-----------------------------------------------------------+
The types of spans are as follows:
================== ==============
type Description
================== ==============
image Image
table Table
text Text
inline_equation Inline formula
interline_equation Block formula
================== ==============
**Summary**
A span is the smallest storage unit for all elements.
The elements stored within para_blocks are block information.
The block structure is as follows:
First-level block (if any) -> Second-level block -> Line -> Span
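To make the nesting concrete, the following sketch walks one page of ``pdf_info`` down to its spans and collects the text content (assuming the middle-json structure described above; this helper is not part of the MinerU API):

.. code:: python

    def collect_text(page: dict) -> str:
        """Walk para_blocks -> (nested) blocks -> lines -> spans and join the text."""
        parts = []
        for block in page["para_blocks"]:
            # first-level table/image blocks nest second-level blocks under "blocks"
            for sub in block.get("blocks", [block]):
                for line in sub.get("lines", []):
                    for span in line["spans"]:
                        if "content" in span:
                            parts.append(span["content"])
        return " ".join(parts)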
.. _example-1:
example
^^^^^^^
.. code:: json
{
"pdf_info": [
{
"preproc_blocks": [
{
"type": "text",
"bbox": [
52,
61.956024169921875,
294,
82.99800872802734
],
"lines": [
{
"bbox": [
52,
61.956024169921875,
294,
72.0000228881836
],
"spans": [
{
"bbox": [
54.0,
61.956024169921875,
296.2261657714844,
72.0000228881836
],
"content": "dependent on the service headway and the reliability of the departure ",
"type": "text",
"score": 1.0
}
]
}
]
}
],
"layout_bboxes": [
{
"layout_bbox": [
52,
61,
294,
731
],
"layout_label": "V",
"sub_layout": []
}
],
"page_idx": 0,
"page_size": [
612.0,
792.0
],
"_layout_tree": [],
"images": [],
"tables": [],
"interline_equations": [],
"discarded_blocks": [],
"para_blocks": [
{
"type": "text",
"bbox": [
52,
61.956024169921875,
294,
82.99800872802734
],
"lines": [
{
"bbox": [
52,
61.956024169921875,
294,
72.0000228881836
],
"spans": [
{
"bbox": [
54.0,
61.956024169921875,
296.2261657714844,
72.0000228881836
],
"content": "dependent on the service headway and the reliability of the departure ",
"type": "text",
"score": 1.0
}
]
}
]
}
]
}
],
"_parse_type": "txt",
"_version_name": "0.6.1"
}
Pipeline Result
------------------
.. code:: python
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
from magic_pdf.pipe.operators import PipeResult
from magic_pdf.data.dataset import Dataset
res = pdf_parse_union(*args, **kwargs)  # placeholder arguments, not a runnable call
res['_parse_type'] = PARSE_TYPE_OCR  # PARSE_TYPE_OCR and __version__ are assumed to be imported elsewhere
res['_version_name'] = __version__
if 'lang' in kwargs and kwargs['lang'] is not None:
res['lang'] = kwargs['lang']
dataset : Dataset = some_dataset # not real dataset
pipeResult = PipeResult(res, dataset)
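Once constructed, the result object exposes the drawing helpers mentioned above; a usage sketch, assuming a writable output directory:

.. code:: python

    import os

    out_dir = "/tmp/mineru_out"  # hypothetical output directory
    os.makedirs(out_dir, exist_ok=True)
    pipeResult.draw_layout(os.path.join(out_dir, "layout.pdf"))  # layout boxes per page
    pipeResult.draw_span(os.path.join(out_dir, "spans.pdf"))  # span boxes per page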
some_pdf_layout.pdf
~~~~~~~~~~~~~~~~~~~
Each page layout consists of one or more boxes. The number at the top
left of each box indicates its sequence number. Additionally, in
``layout.pdf``, different content blocks are highlighted with different
background colors.
.. figure:: ../_static/image/layout_example.png
:alt: layout example
layout example
some_pdf_spans.pdf
~~~~~~~~~~~~~~~~~~
All spans on the page are drawn with different colored line frames
according to the span type. This file can be used for quality control,
allowing for quick identification of issues such as missing text or
unrecognized inline formulas.
.. figure:: ../_static/image/spans_example.png
:alt: spans example
spans example
\ No newline at end of file
......@@ -2,12 +2,14 @@
Quick Start
==============
Eager to get started? This page gives a good introduction to MinerU. Follow Installation to set up a project and install MinerU first.
Want to learn how to use MinerU in different scenarios? This page gives good examples covering multiple use cases to match your needs.
.. toctree::
:maxdepth: 1
quick_start/command_line
quick_start/to_markdown
quick_start/convert_pdf
quick_start/convert_image
quick_start/convert_ppt
quick_start/convert_pptx
quick_start/convert_doc
quick_start/convert_docx