feat: support convert ppt/pptx/doc/docx

f6af67eb · xu rui · f3ceebc4 · f6af67eb · f6af67eb · f6af67eb
Commit f6af67eb authored Dec 10, 2024 by xu rui
4 changed files
--- a/magic_pdf/data/read_api.py
+++ b/magic_pdf/data/read_api.py
 import json
 import os
+import tempfile
+import shutil
 from pathlib import Path
 from magic_pdf.config.exceptions import EmptyData, InvalidParams
 from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
                                               MultiBucketS3DataReader)
 from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
+from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
 def read_jsonl(
    s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
@@ -71,6 +73,36 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
        bits = reader.read(path)
        return [PymuDocDataset(bits)]
+def read_local_office(path: str) -> list[PymuDocDataset]:
+    """Read ms-office file (ppt, pptx, doc, docx) from path or directory.
+    Args:
+        path (str): ms-office file or directory that contains ms-office files
+    Returns:
+        list[PymuDocDataset]: each ms-office file will converted to a PymuDocDataset
+    """
+    suffixes = ['ppt', 'pptx', 'doc', 'docx']
+    fns = []
+    ret = []
+    if os.path.isdir(path):
+        for root, _, files in os.walk(path):
+            for file in files:
+                suffix = file.split('.')
+                if suffix[-1] in suffixes:
+                    fns.append((os.path.join(root, file)))
+    else:
+        fns.append(path)
+    reader = FileBasedDataReader()
+    temp_dir = tempfile.mkdtemp()
+    for fn in fns:
+        convert_file_to_pdf(fn, temp_dir)
+        fn_path = Path(fn)
+        pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
+        ret.append(PymuDocDataset(reader.read(pdf_fn)))
+    shutil.rmtree(temp_dir)
+    return ret
 def read_local_images(path: str, suffixes: list[str]=[]) -> list[ImageDataset]:
    """Read images from path or directory.

--- a/magic_pdf/utils/office_to_pdf.py
+++ b/magic_pdf/utils/office_to_pdf.py
+import os
+import subprocess
+from pathlib import Path
+class ConvertToPdfError(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+        super().__init__(self.msg)
+def convert_file_to_pdf(input_path, output_dir):
+    if not os.path.isfile(input_path):
+        raise FileNotFoundError(f"The input file {input_path} does not exist.")
+    os.makedirs(output_dir, exist_ok=True)
+    cmd = [
+        'soffice',
+        '--headless',
+        '--convert-to', 'pdf',
+        '--outdir', str(output_dir),
+        str(input_path)
+    ]
+    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    if process.returncode != 0:
+        raise ConvertToPdfError(process.stderr.decode())
--- a/next_docs/en/user_guide/install/config.rst
+++ b/next_docs/en/user_guide/install/config.rst
@@ -153,5 +153,8 @@ config_version
 The version of config schema.
+.. admonition:: Tip
+    :class: tip
+    Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest config schema.
-Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest config schema.
\ No newline at end of file
--- a/next_docs/en/user_guide/install/install.rst
+++ b/next_docs/en/user_guide/install/install.rst
@@ -89,7 +89,7 @@ Also you can try `online demo <https://www.modelscope.cn/studios/OpenDataLab/Min
 Create an environment
-~~~~~~~~~~~~~~~~~~~~~
+---------------------------
 .. code-block:: shell
@@ -99,7 +99,7 @@ Create an environment
 Download model weight files
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------
 .. code-block:: shell
@@ -108,6 +108,32 @@ Download model weight files
    python download_models_hf.py    
+Install LibreOffice[Optional]
+----------------------------------
+This section is required to handle **doc**, **docx**, **ppt**, and **pptx** files. You can skip this section if you do not need to process those file types.
+Linux/macOS Platform
+""""""""""""""""""""""
+.. code::
+    apt-get/yum/brew install libreoffice
+Windows Platform 
+""""""""""""""""""""
+.. code::
+    install libreoffice 
+    append "install_dir\LibreOffice\program" to ENVIRONMENT PATH
 .. tip::
    MinerU is now installed. Check out :doc:`../quick_start/command_line` to convert your first PDF, **or** read the following sections for more details about installation.