docs: check links in doc

b04867f9 · xu rui · cece8f53 · b04867f9 · b04867f9 · b04867f9
Commit b04867f9 authored Dec 11, 2024 by xu rui
14 changed files
--- a/magic_pdf/data/read_api.py
+++ b/magic_pdf/data/read_api.py
@@ -87,14 +87,14 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
        FileNotFoundError: File not Found
        Exception: Unknown Exception raised
    """
-    suffixes = ['ppt', 'pptx', 'doc', 'docx']
+    suffixes = ['.ppt', '.pptx', '.doc', '.docx']
    fns = []
    ret = []
    if os.path.isdir(path):
        for root, _, files in os.walk(path):
            for file in files:
-                suffix = file.split('.')
+                suffix = Path(file).suffix
-                if suffix[-1] in suffixes:
+                if suffix in suffixes:
                    fns.append((os.path.join(root, file)))
    else:
        fns.append(path)
@@ -116,12 +116,12 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
    shutil.rmtree(temp_dir)
    return ret
-def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[ImageDataset]:
+def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
    """Read images from path or directory.
    Args:
        path (str): image file path or directory that contains image files
-        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
+        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
    Returns:
        list[ImageDataset]: each image file will be converted to an ImageDataset
@@ -132,8 +132,8 @@ def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[Ima
        reader = FileBasedDataReader()
        for root, _, files in os.walk(path):
            for file in files:
-                suffix = file.split('.')
+                suffix = Path(file).suffix
-                if suffix[-1] in s_suffixes:
+                if suffix in s_suffixes:
                    imgs_bits.append(reader.read(os.path.join(root, file)))
        return [ImageDataset(bits) for bits in imgs_bits]
    else:

--- a/magic_pdf/tools/cli.py
+++ b/magic_pdf/tools/cli.py
@@ -97,7 +97,7 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
            fn = os.path.join(temp_dir, f"{path.stem}.pdf")
        elif path.suffix in image_suffixes:
            with open(str(path), 'rb') as f:
-                bits = f.read(_)
+                bits = f.read()
            pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
            fn = os.path.join(temp_dir, f"{path.stem}.pdf")
            with open(fn, 'wb') as f:
@@ -134,7 +134,7 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
            if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
                parse_doc(doc_path)
    else:
-        parse_doc(path)
+        parse_doc(Path(path))
    shutil.rmtree(temp_dir)

--- a/next_docs/en/additional_notes/glossary.rst
+++ b/next_docs/en/additional_notes/glossary.rst
@@ -4,8 +4,11 @@ Glossary
 ===========
 1. jsonl 
-    TODO: add description
+    Newline-delimited JSON: records are separated by ``\n``, and each line must be a valid, independent JSON object.
+    Currently, all the functions shipped with **MinerU** assume that each JSON object contains a field named either **path** or **file_location**.
 2. magic-pdf.json 
-    TODO: add description
+    TODO
--- a/next_docs/en/user_guide/install/install.rst
+++ b/next_docs/en/user_guide/install/install.rst
@@ -134,6 +134,6 @@ Windows Platform
 .. tip::
-    The MinerU is installed, Check out :doc:`../quick_start/command_line` to convert your first pdf **or** reading the following sections for more details about install
+    MinerU is now installed. Check out :doc:`../usage/command_line` to convert your first PDF, **or** read the following sections for more details about installation.
--- a/next_docs/en/user_guide/quick_start/convert_directory.rst
+++ b/next_docs/en/user_guide/quick_start/convert_directory.rst
-Convert Files Under Directory 
-=================================
-.. code:: python 
--- a/next_docs/en/user_guide/quick_start/convert_doc.rst
+++ b/next_docs/en/user_guide/quick_start/convert_doc.rst
@@ -10,6 +10,19 @@ Convert Doc
    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+Command Line
+^^^^^^^^^^^^^
+.. code:: python 
+    # make sure the file has the correct suffix
+    magic-pdf -p a.doc -o output -m auto
+API 
+^^^^^^^^
 .. code:: python 
    import os

--- a/next_docs/en/user_guide/quick_start/convert_docx.rst
+++ b/next_docs/en/user_guide/quick_start/convert_docx.rst
@@ -10,6 +10,18 @@ Convert DocX
    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+Command Line
+^^^^^^^^^^^^^
+.. code:: python 
+    # make sure the file has the correct suffix
+    magic-pdf -p a.docx -o output -m auto
+API 
+^^^^^
 .. code:: python 
    import os

--- a/next_docs/en/user_guide/quick_start/convert_image.rst
+++ b/next_docs/en/user_guide/quick_start/convert_image.rst
@@ -3,6 +3,19 @@
 Convert Image
 ===============
+Command Line
+^^^^^^^^^^^^^
+.. code:: python 
+    # make sure the file has the correct suffix
+    magic-pdf -p a.png -o output -m auto
+API 
+^^^^^^
 .. code:: python
    import os

--- a/next_docs/en/user_guide/quick_start/convert_pdf.rst
+++ b/next_docs/en/user_guide/quick_start/convert_pdf.rst
@@ -3,6 +3,17 @@
 Convert PDF 
 ============
+Command Line
+^^^^^^^^^^^^^
+.. code:: python 
+    # make sure the file has the correct suffix
+    magic-pdf -p a.pdf -o output -m auto
+API
+^^^^^^
 .. code:: python
    import os

--- a/next_docs/en/user_guide/quick_start/convert_ppt.rst
+++ b/next_docs/en/user_guide/quick_start/convert_ppt.rst
@@ -10,6 +10,17 @@ Convert PPT
    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+Command Line
+^^^^^^^^^^^^^
+.. code:: python 
+    # make sure the file has the correct suffix
+    magic-pdf -p a.ppt -o output -m auto
+API 
+^^^^^
 .. code:: python 

--- a/next_docs/en/user_guide/quick_start/convert_pptx.rst
+++ b/next_docs/en/user_guide/quick_start/convert_pptx.rst
@@ -11,6 +11,19 @@ Convert PPTX
    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+Command Line
+^^^^^^^^^^^^^
+.. code:: python 
+    # make sure the file has the correct suffix
+    magic-pdf -p a.pptx -o output -m auto
+API 
+^^^^^^
 .. code:: python 

--- a/next_docs/en/user_guide/tutorial.rst
+++ b/next_docs/en/user_guide/tutorial.rst
@@ -7,6 +7,5 @@ From the beginning to the end, Show how to using mineru via a minimal project
 .. toctree::
    :maxdepth: 1
-    tutorial/output_file_description
    tutorial/pipeline
--- a/next_docs/en/user_guide/usage/api.rst
+++ b/next_docs/en/user_guide/usage/api.rst
@@ -2,6 +2,10 @@
 Api Usage 
 ===========
+PDF
+----
 Local File Example
 ^^^^^^^^^^^^^^^^^^
@@ -111,4 +115,112 @@ S3 File Example
    pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")    # dump to remote s3
+MS-Office 
+----------
+.. code:: python 
+    import os
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+    os.makedirs(local_image_dir, exist_ok=True)
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_ppt.ppt"     # replace with real ms-office file
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.
+Image
+---------
+Single Image File 
+^^^^^^^^^^^^^^^^^^^
+.. code:: python
+    import os
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_images
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+    os.makedirs(local_image_dir, exist_ok=True)
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_image.jpg"       # replace with real image file
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_images(input_file)[0]
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+Directory That Contains Images 
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: python
+    import os
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_images
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+    os.makedirs(local_image_dir, exist_ok=True)
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+    # proc
+    ## Create Dataset Instance
+    input_directory = "some_image_dir/"       # replace with real directory that contains images
+    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])
+    count = 0
+    for ds in dss:
+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+            md_writer, f"{count}.md", image_dir
+        )
+        count += 1
 Check :doc:`../data/data_reader_writer` for more [reader | writer] examples and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details
--- a/next_docs/en/user_guide/usage/command_line.rst
+++ b/next_docs/en/user_guide/usage/command_line.rst
@@ -10,7 +10,8 @@ Command Line
   Options:
     -v, --version                display the version and exit
-     -p, --path PATH              local pdf filepath or directory  [required]
+     -p, --path PATH              local filepath or directory. support PDF, PPT,
+                                  PPTX, DOC, DOCX, PNG, JPG files  [required]
     -o, --output-dir PATH        output local directory  [required]
     -m, --method [ocr|txt|auto]  the method for parsing pdf. ocr: using ocr
                                  technique to extract information from pdf. txt:
@@ -40,6 +41,20 @@ Command Line
   ## command line example
   magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
+.. admonition:: Important
+    :class: tip
+    The file name must end with one of the following suffixes:
+       .pdf 
+       .png
+       .jpg
+       .ppt
+       .pptx
+       .doc
+       .docx
 ``{some_pdf}`` can be a single PDF file or a directory containing
 multiple PDFs. The results will be saved in the ``{some_output_dir}``
 directory. The output file list is as follows:
@@ -59,4 +74,4 @@ directory. The output file list is as follows:
   :class: tip
-   For more information about the output files, please refer to the :doc:`TODO: modify link <../tutorial/output_file_description>`
+   For more information about the output files, please refer to the :doc:`../inference_result` or :doc:`../pipe_result`