Unverified Commit 4bf148dd authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1450 from icecraft/docs/update_docs

docs/update_docs
parents 27c0b150 87a6c51c
Convert DocX
=============
.. admonition:: Warning
:class: tip
When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: bash
# make sure the file has the correct suffix
magic-pdf -p a.docx -o output -m auto
API
^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_file = "some_docx.docx" # replace with a real MS-Office file
input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
......@@ -45,8 +45,3 @@ API
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
......@@ -17,7 +17,7 @@ Command Line
.. code:: bash
# make sure the file has the correct suffix
# replace with a real MS-Office file; MS-DOC, MS-DOCX, MS-PPT, and MS-PPTX are currently supported
magic-pdf -p a.doc -o output -m auto
......@@ -30,6 +30,8 @@ API
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office
from magic_pdf.config.enums import SupportedPdfParseMethod
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
......@@ -43,17 +45,16 @@ API
# proc
## Create Dataset Instance
input_file = "some_doc.doc" # replace with real ms-office file
input_file = "some_doc.doc" # replace with a real MS-Office file; MS-DOC, MS-DOCX, MS-PPT, and MS-PPTX are currently supported
input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir)
else:
ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir)
......@@ -44,12 +44,13 @@ API
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{name_without_suff}.md", image_dir
)
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
else:
ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{name_without_suff}.md", image_dir
)
Convert PPT
============
.. admonition:: Warning
:class: tip
When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: bash
# make sure the file has the correct suffix
magic-pdf -p a.ppt -o output -m auto
API
^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_file = "some_ppt.ppt" # replace with a real MS-Office file
input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
Convert PPTX
=================
.. admonition:: Warning
:class: tip
When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line
^^^^^^^^^^^^^
.. code:: bash
# make sure the file has the correct suffix
magic-pdf -p a.pptx -o output -m auto
API
^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
# proc
## Create Dataset Instance
input_file = "some_pptx.pptx" # replace with a real MS-Office file
input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment