Merge pull request #1450 from icecraft/docs/update_docs

docs/update_docs

Merge pull request #1450 from icecraft/docs/update_docs
docs/update_docs
4bf148dd · Xiaomeng Zhao · GitHub · 27c0b150 · 87a6c51c · 27c0b150
Unverified Commit 4bf148dd authored Jan 08, 2025 by Xiaomeng Zhao Committed by GitHub Jan 08, 2025
6 changed files
--- a/next_docs/en/user_guide/quick_start/convert_docx.rst
+++ b/next_docs/en/user_guide/quick_start/convert_docx.rst
-Convert DocX
-=============
-.. admonition:: Warning
-    :class: tip
-    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
-    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
-Command Line
-^^^^^^^^^^^^^
-.. code:: python
-    # make sure the file have correct suffix
-    magic-pdf -p a.docx -o output -m auto
-API
-^^^^^
-.. code:: python
-    import os
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.data.read_api import read_local_office
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-    os.makedirs(local_image_dir, exist_ok=True)
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-    # proc
-    ## Create Dataset Instance
-    input_file = "some_docx.docx"     # replace with real ms-office file
-    input_file_name = input_file.split(".")[0]
-    ds = read_local_office(input_file)[0]
-    # ocr mode
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
-    # txt mode
-    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
--- a/next_docs/en/user_guide/quick_start/convert_image.rst
+++ b/next_docs/en/user_guide/quick_start/convert_image.rst
@@ -45,8 +45,3 @@ API
    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )
-    # txt mode
-    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
--- a/next_docs/en/user_guide/quick_start/convert_doc.rst
+++ b/next_docs/en/user_guide/quick_start/convert_doc.rst
@@ -17,7 +17,7 @@ Command Line
 .. code:: python
-    # make sure the file have correct suffix
+    # replace with a real MS-Office file; MS-DOC, MS-DOCX, MS-PPT, and MS-PPTX are currently supported
    magic-pdf -p a.doc -o output -m auto
@@ -30,6 +30,8 @@ API
    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office
+    from magic_pdf.config.enums import SupportedPdfParseMethod
    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
@@ -43,17 +45,16 @@ API
    # proc
    ## Create Dataset Instance
-    input_file = "some_doc.doc"     # replace with real ms-office file
+    input_file = "some_doc.doc"     # replace with a real MS-Office file; MS-DOC, MS-DOCX, MS-PPT, and MS-PPTX are currently supported
    input_file_name = input_file.split(".")[0]
    ds = read_local_office(input_file)[0]
-    # ocr mode
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
-    # txt mode
+    ## inference
-    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+    if ds.classify() == SupportedPdfParseMethod.OCR:
-        md_writer, f"{input_file_name}.md", image_dir
+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-    )
+        md_writer, f"{input_file_name}.md", image_dir)
+    else:
+        ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir)
--- a/next_docs/en/user_guide/quick_start/convert_pdf.rst
+++ b/next_docs/en/user_guide/quick_start/convert_pdf.rst
@@ -44,12 +44,13 @@ API
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)
-    # ocr mode
+    ## inference
+    if ds.classify() == SupportedPdfParseMethod.OCR:
        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{name_without_suff}.md", image_dir
    )
-    # txt mode
+    else:
-    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+        ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
        md_writer, f"{name_without_suff}.md", image_dir
    )
--- a/next_docs/en/user_guide/quick_start/convert_ppt.rst
+++ b/next_docs/en/user_guide/quick_start/convert_ppt.rst
-Convert PPT
-============
-.. admonition:: Warning
-    :class: tip
-    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
-    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
-Command Line
-^^^^^^^^^^^^^
-.. code:: python
-    # make sure the file have correct suffix
-    magic-pdf -p a.ppt -o output -m auto
-API
-^^^^^
-.. code:: python
-    import os
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.data.read_api import read_local_office
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-    os.makedirs(local_image_dir, exist_ok=True)
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-    # proc
-    ## Create Dataset Instance
-    input_file = "some_ppt.ppt"     # replace with real ms-office file
-    input_file_name = input_file.split(".")[0]
-    ds = read_local_office(input_file)[0]
-    # ocr mode
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
-    # txt mode
-    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
--- a/next_docs/en/user_guide/quick_start/convert_pptx.rst
+++ b/next_docs/en/user_guide/quick_start/convert_pptx.rst
-Convert PPTX
-=================
-.. admonition:: Warning
-    :class: tip
-    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
-    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
-Command Line
-^^^^^^^^^^^^^
-.. code:: python
-    # make sure the file have correct suffix
-    magic-pdf -p a.pptx -o output -m auto
-API
-^^^^^^
-.. code:: python
-    import os
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.data.read_api import read_local_office
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-    os.makedirs(local_image_dir, exist_ok=True)
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-    # proc
-    ## Create Dataset Instance
-    input_file = "some_pptx.pptx"     # replace with real ms-office file
-    input_file_name = input_file.split(".")[0]
-    ds = read_local_office(input_file)[0]
-    # ocr mode
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
-    # txt mode
-    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )