Commit b04867f9 authored by xu rui's avatar xu rui
Browse files

docs: check links in doc

parent cece8f53
......@@ -87,14 +87,14 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
FileNotFoundError: File not Found
Exception: Unknown Exception raised
"""
suffixes = ['ppt', 'pptx', 'doc', 'docx']
suffixes = ['.ppt', '.pptx', '.doc', '.docx']
fns = []
ret = []
if os.path.isdir(path):
for root, _, files in os.walk(path):
for file in files:
suffix = file.split('.')
if suffix[-1] in suffixes:
suffix = Path(file).suffix
if suffix in suffixes:
fns.append((os.path.join(root, file)))
else:
fns.append(path)
......@@ -116,12 +116,12 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
shutil.rmtree(temp_dir)
return ret
def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[ImageDataset]:
def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
"""Read images from path or directory.
Args:
path (str): image file path or directory that contains image files
suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
Returns:
list[ImageDataset]: each image file will converted to a ImageDataset
......@@ -132,8 +132,8 @@ def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[Ima
reader = FileBasedDataReader()
for root, _, files in os.walk(path):
for file in files:
suffix = file.split('.')
if suffix[-1] in s_suffixes:
suffix = Path(file).suffix
if suffix in s_suffixes:
imgs_bits.append(reader.read(os.path.join(root, file)))
return [ImageDataset(bits) for bits in imgs_bits]
else:
......
......@@ -97,7 +97,7 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
elif path.suffix in image_suffixes:
with open(str(path), 'rb') as f:
bits = f.read(_)
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
with open(fn, 'wb') as f:
......@@ -134,7 +134,7 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
parse_doc(doc_path)
else:
parse_doc(path)
parse_doc(Path(path))
shutil.rmtree(temp_dir)
......
......@@ -4,8 +4,11 @@ Glossary
===========
1. jsonl
TODO: add description
A newline-delimited (``\n``) format in which each line must be a valid, independent JSON object.
Currently, all functions shipped with **MinerU** assume that each JSON object contains a field named either **path** or **file_location**.
2. magic-pdf.json
TODO: add description
TODO
......@@ -134,6 +134,6 @@ Windows Platform
.. tip::
The MinerU is installed, Check out :doc:`../quick_start/command_line` to convert your first pdf **or** reading the following sections for more details about install
MinerU is now installed. Check out :doc:`../usage/command_line` to convert your first PDF, **or** read the following sections for more details about installation.
Convert Files Under Directory
=================================
.. code:: python
......@@ -10,6 +10,19 @@ Convert Doc
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file has the correct suffix
magic-pdf -p a.doc -o output -m auto
API
^^^^^^^^
.. code:: python
import os
......
......@@ -10,6 +10,18 @@ Convert DocX
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file has the correct suffix
magic-pdf -p a.docx -o output -m auto
API
^^^^^
.. code:: python
import os
......
......@@ -3,6 +3,19 @@
Convert Image
===============
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file has the correct suffix
magic-pdf -p a.png -o output -m auto
API
^^^^^^
.. code:: python
import os
......
......@@ -3,6 +3,17 @@
Convert PDF
============
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file has the correct suffix
magic-pdf -p a.pdf -o output -m auto
API
^^^^^^
.. code:: python
import os
......
......@@ -10,6 +10,17 @@ Convert PPT
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file has the correct suffix
magic-pdf -p a.ppt -o output -m auto
API
^^^^^
.. code:: python
......
......@@ -11,6 +11,19 @@ Convert PPTX
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file has the correct suffix
magic-pdf -p a.pptx -o output -m auto
API
^^^^^^
.. code:: python
......
......@@ -7,6 +7,5 @@ From the beginning to the end, Show how to using mineru via a minimal project
.. toctree::
:maxdepth: 1
tutorial/output_file_description
tutorial/pipeline
......@@ -2,6 +2,10 @@
Api Usage
===========
PDF
----
Local File Example
^^^^^^^^^^^^^^^^^^
......@@ -111,4 +115,112 @@ S3 File Example
pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
MS-Office
----------
.. code:: python
import os

from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office

# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))

os.makedirs(local_image_dir, exist_ok=True)

image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
    local_md_dir
)

# proc
## Create Dataset Instance
input_file = "some_ppt.ppt"  # replace with real ms-office file

# Drop the directory part and only the final extension, so that names with
# extra dots ("report.v2.ppt") or relative paths ("./some_ppt.ppt") still
# produce a usable output file name.
input_file_name = os.path.splitext(os.path.basename(input_file))[0]

# read_local_office returns a list of datasets; use the first one here.
ds = read_local_office(input_file)[0]

# Analyze with OCR enabled, run the OCR pipeline, then dump the markdown file.
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
    md_writer, f"{input_file_name}.md", image_dir
)
This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.
Image
---------
Single Image File
^^^^^^^^^^^^^^^^^^^
.. code:: python
import os

from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_images

# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))

os.makedirs(local_image_dir, exist_ok=True)

image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
    local_md_dir
)

# proc
## Create Dataset Instance
input_file = "some_image.jpg"  # replace with real image file

# Drop the directory part and only the final extension, so that names with
# extra dots ("scan.page1.jpg") or relative paths ("./some_image.jpg") still
# produce a usable output file name.
input_file_name = os.path.splitext(os.path.basename(input_file))[0]

# read_local_images returns a list of datasets; use the first one here.
ds = read_local_images(input_file)[0]

# Analyze with OCR enabled, run the OCR pipeline, then dump the markdown file.
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
    md_writer, f"{input_file_name}.md", image_dir
)
Directory That Contains Images
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import os

from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_images

# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))

os.makedirs(local_image_dir, exist_ok=True)

image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
    local_md_dir
)

# proc
## Create Dataset Instance
input_directory = "some_image_dir/"  # replace with real directory that contains images

# read_local_images returns a list of datasets. Keep the whole list:
# indexing it with [0] would select a single dataset, which must not be
# iterated as if it were the list.
dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])

# Write one markdown file per dataset, named by its position in the list.
for count, ds in enumerate(dss):
    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{count}.md", image_dir
    )
Check :doc:`../data/data_reader_writer` for more [reader | writer] examples, and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for API details.
......@@ -10,7 +10,8 @@ Command Line
Options:
-v, --version display the version and exit
-p, --path PATH local pdf filepath or directory [required]
-p, --path PATH local filepath or directory. support PDF, PPT,
PPTX, DOC, DOCX, PNG, JPG files [required]
-o, --output-dir PATH output local directory [required]
-m, --method [ocr|txt|auto] the method for parsing pdf. ocr: using ocr
technique to extract information from pdf. txt:
......@@ -40,6 +41,20 @@ Command Line
## command line example
magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
.. admonition:: Important
:class: tip
The file name must end with one of the following suffixes:
.pdf
.png
.jpg
.ppt
.pptx
.doc
.docx
``{some_pdf}`` can be a single PDF file or a directory containing
multiple PDFs. The results will be saved in the ``{some_output_dir}``
directory. The output file list is as follows:
......@@ -59,4 +74,4 @@ directory. The output file list is as follows:
:class: tip
For more information about the output files, please refer to the :doc:`TODO: modify link <../tutorial/output_file_description>`
For more information about the output files, please refer to the :doc:`../inference_result` or :doc:`../pipe_result`
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment