Commit f6af67eb authored by xu rui's avatar xu rui
Browse files

feat: support convert ppt/pptx/doc/docx

parent f3ceebc4
import json import json
import os import os
import tempfile
import shutil
from pathlib import Path from pathlib import Path
from magic_pdf.config.exceptions import EmptyData, InvalidParams from magic_pdf.config.exceptions import EmptyData, InvalidParams
from magic_pdf.data.data_reader_writer import (FileBasedDataReader, from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
MultiBucketS3DataReader) MultiBucketS3DataReader)
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
def read_jsonl( def read_jsonl(
s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
...@@ -71,6 +73,36 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]: ...@@ -71,6 +73,36 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
bits = reader.read(path) bits = reader.read(path)
return [PymuDocDataset(bits)] return [PymuDocDataset(bits)]
def read_local_office(path: str) -> list[PymuDocDataset]:
    """Read ms-office file (ppt, pptx, doc, docx) from path or directory.

    Each office file is converted to PDF with ``convert_file_to_pdf`` and the
    resulting PDF bytes are wrapped in a ``PymuDocDataset``.

    Args:
        path (str): ms-office file or directory that contains ms-office files.
            A directory is walked recursively and only files whose extension
            is ppt/pptx/doc/docx (matched case-insensitively) are picked up;
            a non-directory path is used as-is without an extension check.

    Returns:
        list[PymuDocDataset]: each ms-office file will be converted to a
            PymuDocDataset

    Raises:
        Whatever ``convert_file_to_pdf`` raises for a file is propagated
        unchanged.
    """
    suffixes = {'.ppt', '.pptx', '.doc', '.docx'}
    if os.path.isdir(path):
        fns = [
            os.path.join(root, file)
            for root, _, files in os.walk(path)
            for file in files
            if Path(file).suffix.lower() in suffixes
        ]
    else:
        fns = [path]

    reader = FileBasedDataReader()
    ret = []
    for fn in fns:
        # A fresh scratch directory per file: it is removed even when the
        # conversion or the read fails, and a PDF left over from an earlier
        # file with the same stem can never be read by mistake.
        with tempfile.TemporaryDirectory() as temp_dir:
            convert_file_to_pdf(fn, temp_dir)
            pdf_fn = os.path.join(temp_dir, f'{Path(fn).stem}.pdf')
            ret.append(PymuDocDataset(reader.read(pdf_fn)))
    return ret
def read_local_images(path: str, suffixes: list[str]=[]) -> list[ImageDataset]: def read_local_images(path: str, suffixes: list[str]=[]) -> list[ImageDataset]:
"""Read images from path or directory. """Read images from path or directory.
......
import os
import subprocess
from pathlib import Path
class ConvertToPdfError(Exception):
    """Error raised when converting a file to PDF fails.

    Attributes are documented inline; the message is also passed to
    ``Exception`` so ``str(err)`` yields it.
    """

    def __init__(self, msg):
        super().__init__(msg)
        # Keep the message reachable as ``err.msg`` as well as via ``args``.
        self.msg = msg
def convert_file_to_pdf(input_path, output_dir):
    """Convert a single file to PDF by running headless ``soffice``.

    Args:
        input_path: path of the file to convert; must be an existing file.
        output_dir: directory passed to ``soffice --outdir``; created if it
            does not exist yet.

    Raises:
        FileNotFoundError: if ``input_path`` is not an existing file.
        ConvertToPdfError: if ``soffice`` exits with a non-zero return code;
            the message is the captured stderr output.
    """
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The input file {input_path} does not exist.")

    os.makedirs(output_dir, exist_ok=True)

    cmd = [
        'soffice',
        '--headless',
        '--convert-to', 'pdf',
        '--outdir', str(output_dir),
        str(input_path)
    ]

    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if process.returncode != 0:
        # stderr may not be valid UTF-8 (e.g. localized consoles); replace
        # undecodable bytes so the real failure is reported instead of a
        # UnicodeDecodeError.
        raise ConvertToPdfError(process.stderr.decode(errors='replace'))
...@@ -153,5 +153,8 @@ config_version ...@@ -153,5 +153,8 @@ config_version
The version of config schema. The version of config schema.
.. admonition:: Tip
:class: tip
Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest config schema.
Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest config schema.
\ No newline at end of file
...@@ -89,7 +89,7 @@ Also you can try `online demo <https://www.modelscope.cn/studios/OpenDataLab/Min ...@@ -89,7 +89,7 @@ Also you can try `online demo <https://www.modelscope.cn/studios/OpenDataLab/Min
Create an environment Create an environment
~~~~~~~~~~~~~~~~~~~~~ ---------------------------
.. code-block:: shell .. code-block:: shell
...@@ -99,7 +99,7 @@ Create an environment ...@@ -99,7 +99,7 @@ Create an environment
Download model weight files Download model weight files
~~~~~~~~~~~~~~~~~~~~~~~~~~ ------------------------------
.. code-block:: shell .. code-block:: shell
...@@ -108,6 +108,32 @@ Download model weight files ...@@ -108,6 +108,32 @@ Download model weight files
python download_models_hf.py python download_models_hf.py
Install LibreOffice[Optional]
----------------------------------
This section is required to handle **doc**, **docx**, **ppt**, and **pptx** files. You can skip this section if you do not need to process those file types.
Linux/macOS Platform
""""""""""""""""""""""
.. code::
apt-get/yum/brew install libreoffice
Windows Platform
""""""""""""""""""""
.. code::
install libreoffice
append "install_dir\LibreOffice\program" to ENVIRONMENT PATH
.. tip:: .. tip::
The MinerU is installed, Check out :doc:`../quick_start/command_line` to convert your first pdf **or** reading the following sections for more details about install The MinerU is installed, Check out :doc:`../quick_start/command_line` to convert your first pdf **or** reading the following sections for more details about install
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment