Commit cece8f53 authored by xu rui's avatar xu rui
Browse files

feat: support ms-office and images file in command line tools

parent 7dc3b0a9
......@@ -81,6 +81,11 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
Returns:
list[PymuDocDataset]: each ms-office file will be converted to a PymuDocDataset
Raises:
ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
FileNotFoundError: File not Found
Exception: Unknown Exception raised
"""
suffixes = ['ppt', 'pptx', 'doc', 'docx']
fns = []
......@@ -97,7 +102,14 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
reader = FileBasedDataReader()
temp_dir = tempfile.mkdtemp()
for fn in fns:
convert_file_to_pdf(fn, temp_dir)
try:
convert_file_to_pdf(fn, temp_dir)
except ConvertToPdfError as e:
raise e
except FileNotFoundError as e:
raise e
except Exception as e:
raise e
fn_path = Path(fn)
pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
ret.append(PymuDocDataset(reader.read(pdf_fn)))
......
import os
from pathlib import Path
import shutil
import tempfile
import click
import fitz
from loguru import logger
from pathlib import Path
import magic_pdf.model as model_config
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import do_parse, parse_pdf_methods
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
pdf_suffixes = ['.pdf']
ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
image_suffixes = ['.png', '.jpg']
@click.command()
......@@ -21,7 +28,7 @@ from magic_pdf.tools.common import do_parse, parse_pdf_methods
'path',
type=click.Path(exists=True),
required=True,
help='local pdf filepath or directory',
help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
)
@click.option(
'-o',
......@@ -83,12 +90,27 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
model_config.__use_inside_model__ = True
model_config.__model_mode__ = 'full'
os.makedirs(output_dir, exist_ok=True)
temp_dir = tempfile.mkdtemp()
def read_fn(path: Path):
    """Read one supported input file and return its data as read from a PDF file.

    The handling depends on the file suffix:

    * MS-Office files (``ms_office_suffixes``) are converted with
      ``convert_file_to_pdf``; the result is expected at
      ``<temp_dir>/<stem>.pdf``.
    * Image files (``image_suffixes``) are wrapped into a PDF with PyMuPDF
      and written to ``<temp_dir>/<stem>.pdf``.
    * PDF files (``pdf_suffixes``) are read as they are.

    Args:
        path: Location of the input file. A plain ``str`` is accepted as
            well and is converted to ``Path`` first.

    Returns:
        Whatever ``FileBasedDataReader.read`` returns for the resulting PDF
        file (presumably the raw PDF bytes -- confirm against the reader).

    Raises:
        Exception: If the file suffix is not one of the supported suffixes.
    """
    # click hands over plain strings unless told otherwise; normalise so that
    # ``.suffix`` / ``.stem`` are always available.
    path = Path(path)

    if path.suffix in ms_office_suffixes:
        convert_file_to_pdf(str(path), temp_dir)
        fn = os.path.join(temp_dir, f"{path.stem}.pdf")
    elif path.suffix in image_suffixes:
        with open(str(path), 'rb') as f:
            # Read the whole image; the previous ``f.read(_)`` referenced an
            # undefined name and raised NameError.
            bits = f.read()
        # Close the PyMuPDF document as soon as the PDF bytes are produced.
        with fitz.open(stream=bits) as image_doc:
            pdf_bytes = image_doc.convert_to_pdf()
        fn = os.path.join(temp_dir, f"{path.stem}.pdf")
        with open(fn, 'wb') as f:
            f.write(pdf_bytes)
    elif path.suffix in pdf_suffixes:
        fn = str(path)
    else:
        raise Exception(f"Unknown file suffix: {path.suffix}")

    disk_rw = FileBasedDataReader(os.path.dirname(fn))
    return disk_rw.read(os.path.basename(fn))
def read_fn(path):
disk_rw = FileBasedDataReader(os.path.dirname(path))
return disk_rw.read(os.path.basename(path))
def parse_doc(doc_path: str):
def parse_doc(doc_path: Path):
try:
file_name = str(Path(doc_path).stem)
pdf_data = read_fn(doc_path)
......@@ -108,11 +130,14 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
logger.exception(e)
if os.path.isdir(path):
for doc_path in Path(path).glob('*.pdf'):
parse_doc(doc_path)
for doc_path in Path(path).glob('*'):
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
parse_doc(doc_path)
else:
parse_doc(path)
shutil.rmtree(temp_dir)
if __name__ == '__main__':
cli()
Inference Result
==================
.. admonition:: Tip
:class: tip
Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
The **InferenceResult** class is a container for storing model inference results and implements a series of methods related to these results, such as draw_model, dump_model.
Check out :doc:`../api/model_operators` for more details about **InferenceResult**
Model Inference Result
-----------------------
Structure Definition
^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
from pydantic import BaseModel, Field
from enum import IntEnum
class CategoryType(IntEnum):
title = 0 # Title
plain_text = 1 # Text
abandon = 2 # Includes headers, footers, page numbers, and page annotations
figure = 3 # Image
figure_caption = 4 # Image description
table = 5 # Table
table_caption = 6 # Table description
table_footnote = 7 # Table footnote
isolate_formula = 8 # Block formula
formula_caption = 9 # Formula label
embedding = 13 # Inline formula
isolated = 14 # Block formula
text = 15 # OCR recognition result
class PageInfo(BaseModel):
page_no: int = Field(description="Page number, the first page is 0", ge=0)
height: int = Field(description="Page height", gt=0)
width: int = Field(description="Page width", ge=0)
class ObjectInferenceResult(BaseModel):
category_id: CategoryType = Field(description="Category", ge=0)
poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
score: float = Field(description="Confidence of the inference result")
latex: str | None = Field(description="LaTeX parsing result", default=None)
html: str | None = Field(description="HTML parsing result", default=None)
class PageInferenceResults(BaseModel):
    """Inference results of one page: the detected objects and the page metadata."""

    # No ``ge`` bound here: ``ge`` is a numeric constraint and cannot be
    # applied to a list-typed field.
    layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results")
    page_info: PageInfo = Field(description="Page metadata")
Example
^^^^^^^^^^^
.. code:: json
[
{
"layout_dets": [
{
"category_id": 2,
"poly": [
99.1906967163086,
100.3119125366211,
730.3707885742188,
100.3119125366211,
730.3707885742188,
245.81326293945312,
99.1906967163086,
245.81326293945312
],
"score": 0.9999997615814209
}
],
"page_info": {
"page_no": 0,
"height": 2339,
"width": 1654
}
},
{
"layout_dets": [
{
"category_id": 5,
"poly": [
99.13092803955078,
2210.680419921875,
497.3183898925781,
2210.680419921875,
497.3183898925781,
2264.78076171875,
99.13092803955078,
2264.78076171875
],
"score": 0.9999997019767761
}
],
"page_info": {
"page_no": 1,
"height": 2339,
"width": 1654
}
}
]
The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
representing the coordinates of the top-left, top-right, bottom-right,
and bottom-left points respectively. |Poly Coordinate Diagram|
Inference Result
-------------------------
.. code:: python
from magic_pdf.model.operators import InferenceResult
from magic_pdf.data.dataset import Dataset
dataset : Dataset = some_data_set # not real dataset
# The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
model_inference_result: list[PageInferenceResults] = []
Inference_result = InferenceResult(model_inference_result, dataset)
some_model.pdf
^^^^^^^^^^^^^^^^^^^^
.. figure:: ../_static/image/Inference_result.png
.. |Poly Coordinate Diagram| image:: ../_static/image/poly.png
......@@ -156,5 +156,5 @@ The version of config schema.
.. admonition:: Tip
:class: tip
Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest config schema.
Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest details.
......@@ -3,3 +3,333 @@
Pipe Result
==============
.. admonition:: Tip
:class: tip
Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
The **PipeResult** class is a container for storing pipeline processing results and implements a series of methods related to these results, such as draw_layout, draw_span.
Check out :doc:`../api/pipe_operators` for more details about **PipeResult**
Structure Definitions
-------------------------------
**some_pdf_middle.json**
+----------------+--------------------------------------------------------------+
| Field Name | Description |
| | |
+================+==============================================================+
| pdf_info | list, each element is a dict representing the parsing result |
| | of each PDF page, see the table below for details |
+----------------+--------------------------------------------------------------+
| \_parse_type | ocr \| txt, used to indicate the mode used in this |
| | intermediate parsing state |
| | |
+----------------+--------------------------------------------------------------+
| \_version_name | string, indicates the version of magic-pdf used in this |
| | parsing |
| | |
+----------------+--------------------------------------------------------------+
**pdf_info**
Field structure description
+-------------------------+------------------------------------------------------------+
| Field | Description |
| Name | |
+=========================+============================================================+
| preproc_blocks | Intermediate result after PDF preprocessing, not yet |
| | segmented |
+-------------------------+------------------------------------------------------------+
| layout_bboxes | Layout segmentation results, containing layout direction |
| | (vertical, horizontal), and bbox, sorted by reading order |
+-------------------------+------------------------------------------------------------+
| page_idx | Page number, starting from 0 |
| | |
+-------------------------+------------------------------------------------------------+
| page_size | Page width and height |
| | |
+-------------------------+------------------------------------------------------------+
| \_layout_tree | Layout tree structure |
| | |
+-------------------------+------------------------------------------------------------+
| images | list, each element is a dict representing an img_block |
+-------------------------+------------------------------------------------------------+
| tables | list, each element is a dict representing a table_block |
+-------------------------+------------------------------------------------------------+
| interline_equations | list, each element is a dict representing an |
| | interline_equation_block |
| | |
+-------------------------+------------------------------------------------------------+
| discarded_blocks | List, block information returned by the model that needs |
| | to be dropped |
| | |
+-------------------------+------------------------------------------------------------+
| para_blocks | Result after segmenting preproc_blocks |
| | |
+-------------------------+------------------------------------------------------------+
In the above table, ``para_blocks`` is an array of dicts, each dict
representing a block structure. A block can support up to one level of
nesting.
**block**
The outer block is referred to as a first-level block, and the fields in
the first-level block include:
+------------------------+-------------------------------------------------------------+
| Field | Description |
| Name | |
+========================+=============================================================+
| type | Block type (table|image) |
+------------------------+-------------------------------------------------------------+
| bbox | Block bounding box coordinates |
+------------------------+-------------------------------------------------------------+
| blocks | list, each element is a dict representing a second-level |
| | block |
+------------------------+-------------------------------------------------------------+
There are only two types of first-level blocks: “table” and “image”. All
other blocks are second-level blocks.
The fields in a second-level block include:
+----------------------+----------------------------------------------------------------+
| Field | Description |
| Name | |
+======================+================================================================+
| | Block type |
| type | |
+----------------------+----------------------------------------------------------------+
| | Block bounding box coordinates |
| bbox | |
+----------------------+----------------------------------------------------------------+
| | list, each element is a dict representing a line, used to |
| lines | describe the composition of a line of information |
+----------------------+----------------------------------------------------------------+
Detailed explanation of second-level block types
================== ======================
type Description
================== ======================
image_body Main body of the image
image_caption Image description text
table_body Main body of the table
table_caption Table description text
table_footnote Table footnote
text Text block
title Title block
interline_equation Block formula
================== ======================
**line**
The field format of a line is as follows:
+---------------------+----------------------------------------------------------------+
| Field | Description |
| Name | |
+=====================+================================================================+
| | Bounding box coordinates of the line |
| bbox | |
+---------------------+----------------------------------------------------------------+
| spans | list, each element is a dict representing a span, used to |
| | describe the composition of the smallest unit |
+---------------------+----------------------------------------------------------------+
**span**
+---------------------+-----------------------------------------------------------+
| Field | Description |
| Name | |
+=====================+===========================================================+
| bbox | Bounding box coordinates of the span |
+---------------------+-----------------------------------------------------------+
| type | Type of the span |
+---------------------+-----------------------------------------------------------+
| content | Text spans use content, chart spans use img_path to store |
| \| | the actual text or screenshot path information |
| img_path | |
+---------------------+-----------------------------------------------------------+
The types of spans are as follows:
================== ==============
type Description
================== ==============
image Image
table Table
text Text
inline_equation Inline formula
interline_equation Block formula
================== ==============
**Summary**
A span is the smallest storage unit for all elements.
The elements stored within para_blocks are block information.
The block structure is as follows:
First-level block (if any) -> Second-level block -> Line -> Span
.. _example-1:
example
^^^^^^^
.. code:: json
{
"pdf_info": [
{
"preproc_blocks": [
{
"type": "text",
"bbox": [
52,
61.956024169921875,
294,
82.99800872802734
],
"lines": [
{
"bbox": [
52,
61.956024169921875,
294,
72.0000228881836
],
"spans": [
{
"bbox": [
54.0,
61.956024169921875,
296.2261657714844,
72.0000228881836
],
"content": "dependent on the service headway and the reliability of the departure ",
"type": "text",
"score": 1.0
}
]
}
]
}
],
"layout_bboxes": [
{
"layout_bbox": [
52,
61,
294,
731
],
"layout_label": "V",
"sub_layout": []
}
],
"page_idx": 0,
"page_size": [
612.0,
792.0
],
"_layout_tree": [],
"images": [],
"tables": [],
"interline_equations": [],
"discarded_blocks": [],
"para_blocks": [
{
"type": "text",
"bbox": [
52,
61.956024169921875,
294,
82.99800872802734
],
"lines": [
{
"bbox": [
52,
61.956024169921875,
294,
72.0000228881836
],
"spans": [
{
"bbox": [
54.0,
61.956024169921875,
296.2261657714844,
72.0000228881836
],
"content": "dependent on the service headway and the reliability of the departure ",
"type": "text",
"score": 1.0
}
]
}
]
}
]
}
],
"_parse_type": "txt",
"_version_name": "0.6.1"
}
Pipeline Result
------------------
.. code:: python
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
from magic_pdf.pipe.operators import PipeResult
from magic_pdf.data.dataset import Dataset
res = pdf_parse_union(*args, **kwargs)
res['_parse_type'] = PARSE_TYPE_OCR
res['_version_name'] = __version__
if 'lang' in kwargs and kwargs['lang'] is not None:
res['lang'] = kwargs['lang']
dataset : Dataset = some_dataset # not real dataset
pipeResult = PipeResult(res, dataset)
some_pdf_layout.pdf
~~~~~~~~~~~~~~~~~~~
Each page layout consists of one or more boxes. The number at the top
left of each box indicates its sequence number. Additionally, in
``layout.pdf``, different content blocks are highlighted with different
background colors.
.. figure:: ../_static/image/layout_example.png
:alt: layout example
layout example
some_pdf_spans.pdf
~~~~~~~~~~~~~~~~~~
All spans on the page are drawn with different colored line frames
according to the span type. This file can be used for quality control,
allowing for quick identification of issues such as missing text or
unrecognized inline formulas.
.. figure:: ../_static/image/spans_example.png
:alt: spans example
spans example
\ No newline at end of file
Convert Word
Convert Doc
=============
.. admonition:: Warning
......
......@@ -84,8 +84,6 @@ These stages are linked together through methods like ``apply``, ``doc_analyze``
.. admonition:: Tip
:class: tip
For more examples on how to use ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../quick_start/to_markdown`
For more detailed information about ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../../api/dataset`, :doc:`../../api/model_operators`, :doc:`../../api/pipe_operators`
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment