fix: convert image with pymupdf

3e8ee23e · icecraft · 3379f3b3 · 3e8ee23e
Commit 3e8ee23e authored Apr 03, 2025 by icecraft
Hide whitespace changes
Inline Side-by-side

Showing with 10 additions and 3 deletions

magic_pdf/tools/cli.py magic_pdf/tools/cli.py +10 -3

No files found.
--- a/magic_pdf/tools/cli.py
+++ b/magic_pdf/tools/cli.py
@@ -137,10 +137,17 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
        doc_paths = []
        for doc_path in Path(path).glob('*'):
            if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
-                if doc_path.suffix not in ms_office_suffixes:
+                if doc_path.suffix in ms_office_suffixes:
-                    basename = Path(doc_path).stem
                    convert_file_to_pdf(str(doc_path), temp_dir)
-                    doc_path = Path(os.path.join(temp_dir, f'{basename}.pdf'))
+                    doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
+                elif doc_path.suffix in image_suffixes:
+                    with open(str(doc_path), 'rb') as f:
+                        bits = f.read()
+                        pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
+                    fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
+                    with open(fn, 'wb') as f:
+                        f.write(pdf_bytes)
+                    doc_path = Path(fn)
                doc_paths.append(doc_path)
        datasets = batch_build_dataset(doc_paths, 4, lang)
        batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)