Unverified Commit 4bf148dd authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1450 from icecraft/docs/update_docs

docs/update_docs
parents 27c0b150 87a6c51c
Convert DocX
=============
.. admonition:: Warning
:class: tip
When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file have correct suffix
magic-pdf -p a.docx -o output -m auto
API
^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_file = "some_docx.docx" # replace with real ms-office file
input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
...@@ -45,8 +45,3 @@ API ...@@ -45,8 +45,3 @@ API
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md( ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir md_writer, f"{input_file_name}.md", image_dir
) )
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
...@@ -17,7 +17,7 @@ Command Line ...@@ -17,7 +17,7 @@ Command Line
.. code:: python .. code:: python
# make sure the file have correct suffix # replace with real ms-office file, we support MS-DOC, MS-DOCX, MS-PPT, MS-PPTX now
magic-pdf -p a.doc -o output -m auto magic-pdf -p a.doc -o output -m auto
...@@ -30,6 +30,8 @@ API ...@@ -30,6 +30,8 @@ API
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office from magic_pdf.data.read_api import read_local_office
from magic_pdf.config.enums import SupportedPdfParseMethod
# prepare env # prepare env
local_image_dir, local_md_dir = "output/images", "output" local_image_dir, local_md_dir = "output/images", "output"
...@@ -43,17 +45,16 @@ API ...@@ -43,17 +45,16 @@ API
# proc # proc
## Create Dataset Instance ## Create Dataset Instance
input_file = "some_doc.doc" # replace with real ms-office file input_file = "some_doc.doc" # replace with real ms-office file, we support MS-DOC, MS-DOCX, MS-PPT, MS-PPTX now
input_file_name = input_file.split(".")[0] input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0] ds = read_local_office(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
# txt mode ## inference
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md( if ds.classify() == SupportedPdfParseMethod.OCR:
md_writer, f"{input_file_name}.md", image_dir ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
) md_writer, f"{input_file_name}.md", image_dir)
else:
ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir)
...@@ -44,12 +44,13 @@ API ...@@ -44,12 +44,13 @@ API
## Create Dataset Instance ## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes) ds = PymuDocDataset(pdf_bytes)
# ocr mode ## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md( ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{name_without_suff}.md", image_dir md_writer, f"{name_without_suff}.md", image_dir
) )
# txt mode else:
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md( ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{name_without_suff}.md", image_dir md_writer, f"{name_without_suff}.md", image_dir
) )
Convert PPT
============
.. admonition:: Warning
:class: tip
When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file have correct suffix
magic-pdf -p a.ppt -o output -m auto
API
^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_file = "some_ppt.ppt" # replace with real ms-office file
input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
Convert PPTX
=================
.. admonition:: Warning
:class: tip
When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: python
# make sure the file have correct suffix
magic-pdf -p a.pptx -o output -m auto
API
^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_file = "some_pptx.pptx" # replace with real ms-office file
input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment