feat: support ms-office and images file in command line tools

cece8f53 · xu rui · 7dc3b0a9 · cece8f53 · cece8f53 · cece8f53
Commit cece8f53 authored Dec 11, 2024 by xu rui
9 changed files
--- a/magic_pdf/data/read_api.py
+++ b/magic_pdf/data/read_api.py
@@ -81,6 +81,11 @@ def read_local_office(path: str) -> list[PymuDocDataset]:

    Returns:
        list[PymuDocDataset]: each ms-office file will be converted to a PymuDocDataset
+        
+    Raises:
+        ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
+        FileNotFoundError: File not Found
+        Exception: Unknown Exception raised
    """
    suffixes = ['ppt', 'pptx', 'doc', 'docx']
    fns = []
@@ -97,7 +102,14 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
    reader = FileBasedDataReader()
    temp_dir = tempfile.mkdtemp()
    for fn in fns:
-        convert_file_to_pdf(fn, temp_dir)
+        try:
+            convert_file_to_pdf(fn, temp_dir)
+        except ConvertToPdfError as e:
+            raise e
+        except FileNotFoundError as e:
+            raise e
+        except Exception as e:
+            raise e
        fn_path = Path(fn)
        pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
        ret.append(PymuDocDataset(reader.read(pdf_fn)))

--- a/magic_pdf/tools/cli.py
+++ b/magic_pdf/tools/cli.py
 import os
-from pathlib import Path
-
+import shutil
+import tempfile
 import click
+import fitz
 from loguru import logger
+from pathlib import Path

 import magic_pdf.model as model_config
 from magic_pdf.data.data_reader_writer import FileBasedDataReader
 from magic_pdf.libs.version import __version__
 from magic_pdf.tools.common import do_parse, parse_pdf_methods
+from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
+
+pdf_suffixes = ['.pdf']
+ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
+image_suffixes = ['.png', '.jpg']


 @click.command()
@@ -21,7 +28,7 @@ from magic_pdf.tools.common import do_parse, parse_pdf_methods
    'path',
    type=click.Path(exists=True),
    required=True,
-    help='local pdf filepath or directory',
+    help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
 )
 @click.option(
    '-o',
@@ -83,12 +90,27 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'
    os.makedirs(output_dir, exist_ok=True)
+    temp_dir = tempfile.mkdtemp()
+    def read_fn(path: Path):
+        """Load ``path`` as PDF data, converting office/image inputs to PDF first.
+
+        MS-Office files are converted with ``convert_file_to_pdf`` and images
+        with PyMuPDF (``fitz``); both results are written into ``temp_dir`` as
+        ``<stem>.pdf``. PDF inputs are read from where they are.
+
+        Args:
+            path: file to load. A plain ``str`` is accepted as well, because
+                the single-file branch of ``cli`` forwards the raw CLI
+                argument instead of a ``Path``.
+
+        Returns:
+            Whatever ``FileBasedDataReader.read`` returns for the resulting
+            PDF file (presumably its raw bytes -- confirm against the reader).
+
+        Raises:
+            Exception: if the file suffix is not one of the supported ones.
+        """
+        # Normalise first so ``.suffix`` / ``.stem`` work for ``str`` input too.
+        path = Path(path)
+        if path.suffix in ms_office_suffixes:
+            convert_file_to_pdf(str(path), temp_dir)
+            fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+        elif path.suffix in image_suffixes:
+            with open(str(path), 'rb') as f:
+                # Read the whole image; the previous ``f.read(_)`` referenced
+                # an undefined name and raised NameError for every image.
+                bits = f.read()
+            pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
+            fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+            with open(fn, 'wb') as f:
+                f.write(pdf_bytes)
+        elif path.suffix in pdf_suffixes:
+            fn = str(path)
+        else:
+            raise Exception(f"Unknown file suffix: {path.suffix}")
+
+        disk_rw = FileBasedDataReader(os.path.dirname(fn))
+        return disk_rw.read(os.path.basename(fn))

-    def read_fn(path):
-        disk_rw = FileBasedDataReader(os.path.dirname(path))
-        return disk_rw.read(os.path.basename(path))
-
-    def parse_doc(doc_path: str):
+    def parse_doc(doc_path: Path):
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
@@ -108,11 +130,14 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
            logger.exception(e)

    if os.path.isdir(path):
-        for doc_path in Path(path).glob('*.pdf'):
-            parse_doc(doc_path)
+        for doc_path in Path(path).glob('*'):
+            if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
+                parse_doc(doc_path)
    else:
        parse_doc(path)

+    shutil.rmtree(temp_dir)
+

 if __name__ == '__main__':
    cli()
--- a/next_docs/en/_static/image/inference_result.png
+++ b/next_docs/en/_static/image/inference_result.png
--- a/next_docs/en/user_guide/inference_result.rst
+++ b/next_docs/en/user_guide/inference_result.rst

-
 Inference Result 
 ==================

+.. admonition:: Tip
+    :class: tip
+
+    Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
+
+The **InferenceResult** class is a container for storing model inference results and implements a series of methods related to these results, such as draw_model, dump_model. 
+Check out :doc:`../api/model_operators` for more details about **InferenceResult**.
+
+
+Model Inference Result
+-----------------------
+
+Structure Definition
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    from pydantic import BaseModel, Field
+    from enum import IntEnum
+
+    class CategoryType(IntEnum):
+            title = 0               # Title
+            plain_text = 1          # Text
+            abandon = 2             # Includes headers, footers, page numbers, and page annotations
+            figure = 3              # Image
+            figure_caption = 4      # Image description
+            table = 5               # Table
+            table_caption = 6       # Table description
+            table_footnote = 7      # Table footnote
+            isolate_formula = 8     # Block formula
+            formula_caption = 9     # Formula label
+
+            embedding = 13          # Inline formula
+            isolated = 14           # Block formula
+            text = 15               # OCR recognition result
+
+
+    class PageInfo(BaseModel):
+        page_no: int = Field(description="Page number, the first page is 0", ge=0)
+        height: int = Field(description="Page height", gt=0)
+        width: int = Field(description="Page width", ge=0)
+
+    class ObjectInferenceResult(BaseModel):
+        category_id: CategoryType = Field(description="Category", ge=0)
+        poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
+        score: float = Field(description="Confidence of the inference result")
+        latex: str | None = Field(description="LaTeX parsing result", default=None)
+        html: str | None = Field(description="HTML parsing result", default=None)
+
+    class PageInferenceResults(BaseModel):
+            layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results")
+            page_info: PageInfo = Field(description="Page metadata")
+
+
+Example 
+^^^^^^^^^^^
+
+.. code:: json
+
+    [
+        {
+            "layout_dets": [
+                {
+                    "category_id": 2,
+                    "poly": [
+                        99.1906967163086,
+                        100.3119125366211,
+                        730.3707885742188,
+                        100.3119125366211,
+                        730.3707885742188,
+                        245.81326293945312,
+                        99.1906967163086,
+                        245.81326293945312
+                    ],
+                    "score": 0.9999997615814209
+                }
+            ],
+            "page_info": {
+                "page_no": 0,
+                "height": 2339,
+                "width": 1654
+            }
+        },
+        {
+            "layout_dets": [
+                {
+                    "category_id": 5,
+                    "poly": [
+                        99.13092803955078,
+                        2210.680419921875,
+                        497.3183898925781,
+                        2210.680419921875,
+                        497.3183898925781,
+                        2264.78076171875,
+                        99.13092803955078,
+                        2264.78076171875
+                    ],
+                    "score": 0.9999997019767761
+                }
+            ],
+            "page_info": {
+                "page_no": 1,
+                "height": 2339,
+                "width": 1654
+            }
+        }
+    ]
+
+The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
+representing the coordinates of the top-left, top-right, bottom-right,
+and bottom-left points respectively. |Poly Coordinate Diagram|
+
+
+
+Inference Result 
+-------------------------
+
+
+.. code:: python
+
+    from magic_pdf.model.operators import InferenceResult
+    from magic_pdf.data.dataset import Dataset 
+    
+    dataset : Dataset = some_data_set    # not real dataset
+
+    # The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
+    model_inference_result: list[PageInferenceResults] = []
+
+    Inference_result = InferenceResult(model_inference_result, dataset)
+
+
+
+some_model.pdf
+^^^^^^^^^^^^^^^^^^^^
+
+.. figure:: ../_static/image/inference_result.png
+
+

+.. |Poly Coordinate Diagram| image:: ../_static/image/poly.png

--- a/next_docs/en/user_guide/install/config.rst
+++ b/next_docs/en/user_guide/install/config.rst
@@ -156,5 +156,5 @@ The version of config schema.
 .. admonition:: Tip
    :class: tip
    
-    Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest config schema.
+    Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest details.

--- a/next_docs/en/user_guide/pipe_result.rst
+++ b/next_docs/en/user_guide/pipe_result.rst
@@ -3,3 +3,333 @@
 Pipe Result 
 ==============

+.. admonition:: Tip
+    :class: tip
+
+    Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
+
+
+The **PipeResult** class is a container for storing pipeline processing results and implements a series of methods related to these results, such as draw_layout, draw_span. 
+Check out :doc:`../api/pipe_operators` for more details about **PipeResult**.
+
+
+
+Structure Definitions
+-------------------------------
+
+**some_pdf_middle.json**
+
++----------------+--------------------------------------------------------------+
+| Field Name     | Description                                                  |
+|                |                                                              |
++================+==============================================================+
+| pdf_info       | list, each element is a dict representing the parsing result |
+|                | of each PDF page, see the table below for details            |
++----------------+--------------------------------------------------------------+
+| \_parse_type   | ocr \| txt, used to indicate the mode used in this           |
+|                | intermediate parsing state                                   |
+|                |                                                              |
++----------------+--------------------------------------------------------------+
+| \_version_name | string, indicates the version of magic-pdf used in this      |
+|                | parsing                                                      |
+|                |                                                              |
++----------------+--------------------------------------------------------------+
+
+**pdf_info**
+
+Field structure description
+
++-------------------------+------------------------------------------------------------+
+| Field                   | Description                                                |
+| Name                    |                                                            |
++=========================+============================================================+
+| preproc_blocks          | Intermediate result after PDF preprocessing, not yet       |
+|                         | segmented                                                  |
++-------------------------+------------------------------------------------------------+
+| layout_bboxes           | Layout segmentation results, containing layout direction   |
+|                         | (vertical, horizontal), and bbox, sorted by reading order  |
++-------------------------+------------------------------------------------------------+
+| page_idx                | Page number, starting from 0                               |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| page_size               | Page width and height                                      |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| \_layout_tree           | Layout tree structure                                      |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| images                  | list, each element is a dict representing an img_block     |
++-------------------------+------------------------------------------------------------+
+| tables                  | list, each element is a dict representing a table_block    |
++-------------------------+------------------------------------------------------------+
+| interline_equations     | list, each element is a dict representing an               |
+|                         | interline_equation_block                                   |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| discarded_blocks        | List, block information returned by the model that needs   |
+|                         | to be dropped                                              |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| para_blocks             | Result after segmenting preproc_blocks                     |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+
+In the above table, ``para_blocks`` is an array of dicts, each dict
+representing a block structure. A block can support up to one level of
+nesting.
+
+**block**
+
+The outer block is referred to as a first-level block, and the fields in
+the first-level block include:
+
++------------------------+-------------------------------------------------------------+
+| Field                  | Description                                                 |
+| Name                   |                                                             |
++========================+=============================================================+
+| type                   | Block type (table|image)                                    |
++------------------------+-------------------------------------------------------------+
+| bbox                   | Block bounding box coordinates                              |
++------------------------+-------------------------------------------------------------+
+| blocks                 | list, each element is a dict representing a second-level    |
+|                        | block                                                       |
++------------------------+-------------------------------------------------------------+
+
+There are only two types of first-level blocks: “table” and “image”. All
+other blocks are second-level blocks.
+
+The fields in a second-level block include:
+
++----------------------+----------------------------------------------------------------+
+| Field                | Description                                                    |
+| Name                 |                                                                |
++======================+================================================================+
+|                      | Block type                                                     |
+| type                 |                                                                |
++----------------------+----------------------------------------------------------------+
+|                      | Block bounding box coordinates                                 |
+| bbox                 |                                                                |
++----------------------+----------------------------------------------------------------+
+|                      | list, each element is a dict representing a line, used to      |
+| lines                | describe the composition of a line of information              |
++----------------------+----------------------------------------------------------------+
+
+Detailed explanation of second-level block types
+
+================== ======================
+type               Description
+================== ======================
+image_body         Main body of the image
+image_caption      Image description text
+table_body         Main body of the table
+table_caption      Table description text
+table_footnote     Table footnote
+text               Text block
+title              Title block
+interline_equation Block formula
+================== ======================
+
+**line**
+
+The field format of a line is as follows:
+
++---------------------+----------------------------------------------------------------+
+| Field               | Description                                                    |
+| Name                |                                                                |
++=====================+================================================================+
+|                     | Bounding box coordinates of the line                           |
+| bbox                |                                                                |
++---------------------+----------------------------------------------------------------+
+| spans               | list, each element is a dict representing a span, used to      |
+|                     | describe the composition of the smallest unit                  |
++---------------------+----------------------------------------------------------------+
+
+**span**
+
++---------------------+-----------------------------------------------------------+
+| Field               | Description                                               |
+| Name                |                                                           |
++=====================+===========================================================+
+| bbox                | Bounding box coordinates of the span                      |
++---------------------+-----------------------------------------------------------+
+| type                | Type of the span                                          |
++---------------------+-----------------------------------------------------------+
+| content             | Text spans use content, chart spans use img_path to store |
+| \|                  | the actual text or screenshot path information            |
+| img_path            |                                                           |
++---------------------+-----------------------------------------------------------+
+
+The types of spans are as follows:
+
+================== ==============
+type               Description
+================== ==============
+image              Image
+table              Table
+text               Text
+inline_equation    Inline formula
+interline_equation Block formula
+================== ==============
+
+**Summary**
+
+A span is the smallest storage unit for all elements.
+
+The elements stored within para_blocks are block information.
+
+The block structure is as follows:
+
+First-level block (if any) -> Second-level block -> Line -> Span
+
+.. _example-1:
+
+example
+^^^^^^^
+
+.. code:: json
+
+   {
+       "pdf_info": [
+           {
+               "preproc_blocks": [
+                   {
+                       "type": "text",
+                       "bbox": [
+                           52,
+                           61.956024169921875,
+                           294,
+                           82.99800872802734
+                       ],
+                       "lines": [
+                           {
+                               "bbox": [
+                                   52,
+                                   61.956024169921875,
+                                   294,
+                                   72.0000228881836
+                               ],
+                               "spans": [
+                                   {
+                                       "bbox": [
+                                           54.0,
+                                           61.956024169921875,
+                                           296.2261657714844,
+                                           72.0000228881836
+                                       ],
+                                       "content": "dependent on the service headway and the reliability of the departure ",
+                                       "type": "text",
+                                       "score": 1.0
+                                   }
+                               ]
+                           }
+                       ]
+                   }
+               ],
+               "layout_bboxes": [
+                   {
+                       "layout_bbox": [
+                           52,
+                           61,
+                           294,
+                           731
+                       ],
+                       "layout_label": "V",
+                       "sub_layout": []
+                   }
+               ],
+               "page_idx": 0,
+               "page_size": [
+                   612.0,
+                   792.0
+               ],
+               "_layout_tree": [],
+               "images": [],
+               "tables": [],
+               "interline_equations": [],
+               "discarded_blocks": [],
+               "para_blocks": [
+                   {
+                       "type": "text",
+                       "bbox": [
+                           52,
+                           61.956024169921875,
+                           294,
+                           82.99800872802734
+                       ],
+                       "lines": [
+                           {
+                               "bbox": [
+                                   52,
+                                   61.956024169921875,
+                                   294,
+                                   72.0000228881836
+                               ],
+                               "spans": [
+                                   {
+                                       "bbox": [
+                                           54.0,
+                                           61.956024169921875,
+                                           296.2261657714844,
+                                           72.0000228881836
+                                       ],
+                                       "content": "dependent on the service headway and the reliability of the departure ",
+                                       "type": "text",
+                                       "score": 1.0
+                                   }
+                               ]
+                           }
+                       ]
+                   }
+               ]
+           }
+       ],
+       "_parse_type": "txt",
+       "_version_name": "0.6.1"
+   }
+
+
+Pipeline Result 
+------------------
+
+.. code:: python 
+
+    from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
+    from magic_pdf.pipe.operators import PipeResult
+    from magic_pdf.data.dataset import Dataset 
+
+    res = pdf_parse_union(*args, **kwargs)
+    res['_parse_type'] = PARSE_TYPE_OCR
+    res['_version_name'] = __version__
+    if 'lang' in kwargs and kwargs['lang'] is not None:
+        res['lang'] = kwargs['lang']
+
+    dataset : Dataset = some_dataset   # not real dataset
+    pipeResult = PipeResult(res, dataset)
+
+
+
+some_pdf_layout.pdf
+~~~~~~~~~~~~~~~~~~~
+
+Each page layout consists of one or more boxes. The number at the top
+left of each box indicates its sequence number. Additionally, in
+``layout.pdf``, different content blocks are highlighted with different
+background colors.
+
+.. figure:: ../_static/image/layout_example.png
+   :alt: layout example
+
+   layout example
+
+some_pdf_spans.pdf
+~~~~~~~~~~~~~~~~~~
+
+All spans on the page are drawn with different colored line frames
+according to the span type. This file can be used for quality control,
+allowing for quick identification of issues such as missing text or
+unrecognized inline formulas.
+
+.. figure:: ../_static/image/spans_example.png
+   :alt: spans example
+
+   spans example
\ No newline at end of file
--- a/next_docs/en/user_guide/quick_start/convert_doc.rst
+++ b/next_docs/en/user_guide/quick_start/convert_doc.rst


-Convert Word 
+Convert Doc
 =============

 .. admonition:: Warning

--- a/next_docs/en/user_guide/tutorial/pipeline.rst
+++ b/next_docs/en/user_guide/tutorial/pipeline.rst
@@ -84,8 +84,6 @@ These stages are linked together through methods like ``apply``, ``doc_analyze``
 .. admonition:: Tip
    :class: tip

-    For more examples on how to use ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../quick_start/to_markdown`
-
    For more detailed information about ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../../api/dataset`, :doc:`../../api/model_operators`, :doc:`../../api/pipe_operators`



--- a/next_docs/zh_cn/_static/image/inference_result.png
+++ b/next_docs/zh_cn/_static/image/inference_result.png