Commit b04867f9 authored by xu rui's avatar xu rui
Browse files

docs: check links in doc

parent cece8f53
...@@ -87,14 +87,14 @@ def read_local_office(path: str) -> list[PymuDocDataset]: ...@@ -87,14 +87,14 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
FileNotFoundError: File not Found FileNotFoundError: File not Found
Exception: Unknown Exception raised Exception: Unknown Exception raised
""" """
suffixes = ['ppt', 'pptx', 'doc', 'docx'] suffixes = ['.ppt', '.pptx', '.doc', '.docx']
fns = [] fns = []
ret = [] ret = []
if os.path.isdir(path): if os.path.isdir(path):
for root, _, files in os.walk(path): for root, _, files in os.walk(path):
for file in files: for file in files:
suffix = file.split('.') suffix = Path(file).suffix
if suffix[-1] in suffixes: if suffix in suffixes:
fns.append((os.path.join(root, file))) fns.append((os.path.join(root, file)))
else: else:
fns.append(path) fns.append(path)
...@@ -116,12 +116,12 @@ def read_local_office(path: str) -> list[PymuDocDataset]: ...@@ -116,12 +116,12 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
shutil.rmtree(temp_dir) shutil.rmtree(temp_dir)
return ret return ret
def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[ImageDataset]: def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
"""Read images from path or directory. """Read images from path or directory.
Args: Args:
path (str): image file path or directory that contains image files path (str): image file path or directory that contains image files
suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png'] suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
Returns: Returns:
list[ImageDataset]: each image file will converted to a ImageDataset list[ImageDataset]: each image file will converted to a ImageDataset
...@@ -132,8 +132,8 @@ def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[Ima ...@@ -132,8 +132,8 @@ def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[Ima
reader = FileBasedDataReader() reader = FileBasedDataReader()
for root, _, files in os.walk(path): for root, _, files in os.walk(path):
for file in files: for file in files:
suffix = file.split('.') suffix = Path(file).suffix
if suffix[-1] in s_suffixes: if suffix in s_suffixes:
imgs_bits.append(reader.read(os.path.join(root, file))) imgs_bits.append(reader.read(os.path.join(root, file)))
return [ImageDataset(bits) for bits in imgs_bits] return [ImageDataset(bits) for bits in imgs_bits]
else: else:
......
...@@ -97,7 +97,7 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id): ...@@ -97,7 +97,7 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
fn = os.path.join(temp_dir, f"{path.stem}.pdf") fn = os.path.join(temp_dir, f"{path.stem}.pdf")
elif path.suffix in image_suffixes: elif path.suffix in image_suffixes:
with open(str(path), 'rb') as f: with open(str(path), 'rb') as f:
bits = f.read(_) bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf() pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f"{path.stem}.pdf") fn = os.path.join(temp_dir, f"{path.stem}.pdf")
with open(fn, 'wb') as f: with open(fn, 'wb') as f:
...@@ -134,7 +134,7 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id): ...@@ -134,7 +134,7 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes: if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
parse_doc(doc_path) parse_doc(doc_path)
else: else:
parse_doc(path) parse_doc(Path(path))
shutil.rmtree(temp_dir) shutil.rmtree(temp_dir)
......
...@@ -4,8 +4,11 @@ Glossary ...@@ -4,8 +4,11 @@ Glossary
=========== ===========
1. jsonl 1. jsonl
TODO: add description Newline-delimited (\n), and each line must be a valid, independent JSON object.
Currently, all the functions shipped with **MinerU** assume that each JSON object contains one field named either **path** or **file_location**.
2. magic-pdf.json 2. magic-pdf.json
TODO: add description TODO
...@@ -134,6 +134,6 @@ Windows Platform ...@@ -134,6 +134,6 @@ Windows Platform
.. tip:: .. tip::
The MinerU is installed, Check out :doc:`../quick_start/command_line` to convert your first pdf **or** reading the following sections for more details about install The MinerU is installed, Check out :doc:`../usage/command_line` to convert your first pdf **or** reading the following sections for more details about install
Convert Files Under Directory
=================================
.. code:: python
...@@ -10,6 +10,19 @@ Convert Doc ...@@ -10,6 +10,19 @@ Convert Doc
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output. For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file has the correct suffix
magic-pdf -p a.doc -o output -m auto
API
^^^^^^^^
.. code:: python .. code:: python
import os import os
......
...@@ -10,6 +10,18 @@ Convert DocX ...@@ -10,6 +10,18 @@ Convert DocX
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output. For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file has the correct suffix
magic-pdf -p a.docx -o output -m auto
API
^^^^^
.. code:: python .. code:: python
import os import os
......
...@@ -3,6 +3,19 @@ ...@@ -3,6 +3,19 @@
Convert Image Convert Image
=============== ===============
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file has the correct suffix
magic-pdf -p a.png -o output -m auto
API
^^^^^^
.. code:: python .. code:: python
import os import os
......
...@@ -3,6 +3,17 @@ ...@@ -3,6 +3,17 @@
Convert PDF Convert PDF
============ ============
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file has the correct suffix
magic-pdf -p a.pdf -o output -m auto
API
^^^^^^
.. code:: python .. code:: python
import os import os
......
...@@ -10,6 +10,17 @@ Convert PPT ...@@ -10,6 +10,17 @@ Convert PPT
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output. For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file has the correct suffix
magic-pdf -p a.ppt -o output -m auto
API
^^^^^
.. code:: python .. code:: python
......
...@@ -11,6 +11,19 @@ Convert PPTX ...@@ -11,6 +11,19 @@ Convert PPTX
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output. For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file has the correct suffix
magic-pdf -p a.pptx -o output -m auto
API
^^^^^^
.. code:: python .. code:: python
......
...@@ -7,6 +7,5 @@ From the beginning to the end, Show how to using mineru via a minimal project ...@@ -7,6 +7,5 @@ From the beginning to the end, Show how to using mineru via a minimal project
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
tutorial/output_file_description
tutorial/pipeline tutorial/pipeline
...@@ -2,6 +2,10 @@ ...@@ -2,6 +2,10 @@
Api Usage Api Usage
=========== ===========
PDF
----
Local File Example Local File Example
^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^
...@@ -111,4 +115,112 @@ S3 File Example ...@@ -111,4 +115,112 @@ S3 File Example
pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3 pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
MS-Office
----------
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_file = "some_ppt.ppt" # replace with real ms-office file
input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0]
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.
Image
---------
Single Image File
^^^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_images
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_file = "some_image.jpg" # replace with real image file
input_file_name = input_file.split(".")[0]
ds = read_local_images(input_file)[0]
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
Directory That Contains Images
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_images
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_directory = "some_image_dir/" # replace with real directory that contains images
dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])
count = 0
for ds in dss:
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{count}.md", image_dir
)
count += 1
Check :doc:`../data/data_reader_writer` for more [reader | writer] examples and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details Check :doc:`../data/data_reader_writer` for more [reader | writer] examples and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details
...@@ -10,7 +10,8 @@ Command Line ...@@ -10,7 +10,8 @@ Command Line
Options: Options:
-v, --version display the version and exit -v, --version display the version and exit
-p, --path PATH local pdf filepath or directory [required] -p, --path PATH local filepath or directory. support PDF, PPT,
PPTX, DOC, DOCX, PNG, JPG files [required]
-o, --output-dir PATH output local directory [required] -o, --output-dir PATH output local directory [required]
-m, --method [ocr|txt|auto] the method for parsing pdf. ocr: using ocr -m, --method [ocr|txt|auto] the method for parsing pdf. ocr: using ocr
technique to extract information from pdf. txt: technique to extract information from pdf. txt:
...@@ -40,6 +41,20 @@ Command Line ...@@ -40,6 +41,20 @@ Command Line
## command line example ## command line example
magic-pdf -p {some_pdf} -o {some_output_dir} -m auto magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
.. admonition:: Important
:class: tip
The file name must end with one of the following suffixes:
.pdf
.png
.jpg
.ppt
.pptx
.doc
.docx
``{some_pdf}`` can be a single PDF file or a directory containing ``{some_pdf}`` can be a single PDF file or a directory containing
multiple PDFs. The results will be saved in the ``{some_output_dir}`` multiple PDFs. The results will be saved in the ``{some_output_dir}``
directory. The output file list is as follows: directory. The output file list is as follows:
...@@ -59,4 +74,4 @@ directory. The output file list is as follows: ...@@ -59,4 +74,4 @@ directory. The output file list is as follows:
:class: tip :class: tip
For more information about the output files, please refer to the :doc:`TODO: modify link <../tutorial/output_file_description>` For more information about the output files, please refer to the :doc:`../inference_result` or :doc:`../pipe_result`
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment