wangsen / MinerU

Commit b50f742f
Authored Mar 13, 2025 by icecraft

feat: add parallel evaluation

Parent: 3a2f86a1
Showing 7 changed files with 141 additions and 131 deletions.
magic_pdf/data/batch_build_dataset.py            +33 -32
magic_pdf/data/dataset.py                        +21 -23
magic_pdf/data/utils.py                          +26 -28
magic_pdf/model/doc_analyze_by_custom_model.py   +22 -20
magic_pdf/model/sub_modules/model_init.py        +28 -17
magic_pdf/tools/cli.py                           +8 -8
magic_pdf/tools/common.py                        +3 -3
magic_pdf/data/batch_build_dataset.py

-import os
-import concurrent.futures
+import glob
+import os
+import threading
+import concurrent.futures
 import fitz
-from magic_pdf.data.utils import fitz_doc_to_image  # PyMuPDF
 from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.data.utils import fitz_doc_to_image  # PyMuPDF


 def partition_array_greedy(arr, k):
-    """
-    Partition an array into k parts using a simple greedy approach.
+    """Partition an array into k parts using a simple greedy approach.

     Parameters:
     -----------
...

@@ -24,7 +26,7 @@ def partition_array_greedy(arr, k):
     """
     # Handle edge cases
     if k <= 0:
-        raise ValueError("k must be a positive integer")
+        raise ValueError('k must be a positive integer')
     if k > len(arr):
         k = len(arr)  # Adjust k if it's too large
     if k == 1:
...

@@ -52,8 +54,7 @@ def partition_array_greedy(arr, k):
 def process_pdf_batch(pdf_jobs, idx):
-    """
-    Process a batch of PDF pages using multiple threads.
+    """Process a batch of PDF pages using multiple threads.

     Parameters:
     -----------
...

@@ -83,8 +84,8 @@ def process_pdf_batch(pdf_jobs, idx):
     return (idx, images)


 def batch_build_dataset(pdf_paths, k, lang=None):
     """
     Process multiple PDFs by partitioning them into k balanced parts and
     processing each part in parallel.

     Parameters:
     -----------
...

@@ -116,13 +117,13 @@ def batch_build_dataset(pdf_paths, k, lang=None):
             total_pages += num_pages
             doc.close()
         except Exception as e:
-            print(f"Error opening {pdf_path}: {e}")
+            print(f'Error opening {pdf_path}: {e}')

     # Partition the jobs based on page count. Each job has 1 page
     partitions = partition_array_greedy(pdf_info, k)

     for i, partition in enumerate(partitions):
-        print(f"Partition {i+1}: {len(partition)} pdfs")
+        print(f'Partition {i+1}: {len(partition)} pdfs')

     # Process each partition in parallel
     all_images_h = {}
...

@@ -145,15 +146,15 @@ def batch_build_dataset(pdf_paths, k, lang=None):
        for i, future in enumerate(concurrent.futures.as_completed(futures)):
            try:
                idx, images = future.result()
-               print(f"Partition {i+1} completed: processed {len(images)} images")
+               print(f'Partition {i+1} completed: processed {len(images)} images')
                all_images_h[idx] = images
            except Exception as e:
-               print(f"Error processing partition: {e}")
+               print(f'Error processing partition: {e}')

    results = [None] * len(pdf_paths)
    for i in range(len(partitions)):
        partition = partitions[i]
        for j in range(len(partition)):
-           with open(pdf_info[partition[j]][0], "rb") as f:
+           with open(pdf_info[partition[j]][0], 'rb') as f:
                pdf_bytes = f.read()
            dataset = PymuDocDataset(pdf_bytes, lang=lang)
            dataset.set_images(all_images_h[i][j])
...
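
Taken together, partition_array_greedy and batch_build_dataset implement a standard greedy load-balancing scheme: every PDF becomes a job weighted by its page count, each job is placed into whichever of the k partitions currently holds the fewest pages, and the partitions are then processed in parallel, one worker per partition. The sketch below only illustrates that idea; the job tuples, the process_bucket worker, and the largest-first ordering are illustrative stand-ins, not MinerU's actual code.

import concurrent.futures


def partition_greedy(jobs, k):
    """Split (name, weight) jobs into k buckets with roughly equal total weight."""
    k = max(1, min(k, len(jobs)))
    buckets = [[] for _ in range(k)]
    loads = [0] * k
    # Place the heaviest remaining job into the currently lightest bucket.
    for name, weight in sorted(jobs, key=lambda j: j[1], reverse=True):
        i = loads.index(min(loads))
        buckets[i].append(name)
        loads[i] += weight
    return buckets


def process_bucket(idx, names):
    # Stand-in for process_pdf_batch: do the per-PDF work here.
    return idx, [f'processed {n}' for n in names]


if __name__ == '__main__':
    jobs = [('a.pdf', 120), ('b.pdf', 10), ('c.pdf', 95), ('d.pdf', 30)]
    buckets = partition_greedy(jobs, k=2)
    with concurrent.futures.ProcessPoolExecutor(max_workers=len(buckets)) as pool:
        futures = [pool.submit(process_bucket, i, b) for i, b in enumerate(buckets)]
        for fut in concurrent.futures.as_completed(futures):
            idx, results = fut.result()
            print(idx, results)

Sorting largest-first tightens the balance between buckets, but the greedy placement works without it.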
magic_pdf/data/dataset.py

...

@@ -97,7 +97,7 @@ class Dataset(ABC):
     @abstractmethod
     def dump_to_file(self, file_path: str):
-        """Dump the file
+        """Dump the file.

         Args:
             file_path (str): the file path
...

@@ -119,7 +119,7 @@ class Dataset(ABC):
     @abstractmethod
     def classify(self) -> SupportedPdfParseMethod:
-        """classify the dataset
+        """classify the dataset.

         Returns:
             SupportedPdfParseMethod: _description_
...

@@ -128,8 +128,7 @@ class Dataset(ABC):
     @abstractmethod
     def clone(self):
-        """clone this dataset
-        """
+        """clone this dataset."""
         pass
...

@@ -148,12 +147,13 @@ class PymuDocDataset(Dataset):
         if lang == '':
             self._lang = None
         elif lang == 'auto':
-            from magic_pdf.model.sub_modules.language_detection.utils import auto_detect_lang
+            from magic_pdf.model.sub_modules.language_detection.utils import \
+                auto_detect_lang
             self._lang = auto_detect_lang(bits)
-            logger.info(f"lang: {lang}, detect_lang: {self._lang}")
+            logger.info(f'lang: {lang}, detect_lang: {self._lang}')
         else:
             self._lang = lang
-            logger.info(f"lang: {lang}")
+            logger.info(f'lang: {lang}')

     def __len__(self) -> int:
         """The page number of the pdf."""
...

@@ -187,7 +187,7 @@ class PymuDocDataset(Dataset):
         return self._records[page_id]

     def dump_to_file(self, file_path: str):
-        """Dump the file
+        """Dump the file.

         Args:
             file_path (str): the file path
...

@@ -213,7 +213,7 @@ class PymuDocDataset(Dataset):
         return proc(self, *args, **kwargs)

     def classify(self) -> SupportedPdfParseMethod:
-        """classify the dataset
+        """classify the dataset.

         Returns:
             SupportedPdfParseMethod: _description_
...

@@ -221,8 +221,7 @@ class PymuDocDataset(Dataset):
         return classify(self._data_bits)

     def clone(self):
-        """clone this dataset
-        """
+        """clone this dataset."""
         return PymuDocDataset(self._raw_data)

     def set_images(self, images):
...

@@ -274,7 +273,7 @@ class ImageDataset(Dataset):
         return self._records[page_id]

     def dump_to_file(self, file_path: str):
-        """Dump the file
+        """Dump the file.

         Args:
             file_path (str): the file path
...

@@ -297,7 +296,7 @@ class ImageDataset(Dataset):
         return proc(self, *args, **kwargs)

     def classify(self) -> SupportedPdfParseMethod:
-        """classify the dataset
+        """classify the dataset.

         Returns:
             SupportedPdfParseMethod: _description_
...

@@ -305,8 +304,7 @@ class ImageDataset(Dataset):
         return SupportedPdfParseMethod.OCR

     def clone(self):
-        """clone this dataset
-        """
+        """clone this dataset."""
         return ImageDataset(self._raw_data)

     def set_images(self, images):
...
magic_pdf/data/utils.py

 import multiprocessing as mp
+import threading
+from concurrent.futures import (ProcessPoolExecutor, ThreadPoolExecutor,
+                                as_completed)

 import fitz
 import numpy as np
 from loguru import logger

 from magic_pdf.utils.annotations import ImportPIL
-from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed


 @ImportPIL
...

@@ -76,7 +78,7 @@ def convert_page(bytes_page):
     return fitz_doc_to_image(page)


 def parallel_process_pdf_safe(pages, num_workers=None, **kwargs):
-    """Process PDF pages in parallel with serialization-safe approach"""
+    """Process PDF pages in parallel with serialization-safe approach."""
     if num_workers is None:
         num_workers = mp.cpu_count()
...

@@ -92,8 +94,7 @@ def parallel_process_pdf_safe(pages, num_workers=None, **kwargs):
 def threaded_process_pdf(pdf_path, num_threads=4, **kwargs):
-    """
-    Process all pages of a PDF using multiple threads
+    """Process all pages of a PDF using multiple threads.

     Parameters:
     -----------
...

@@ -130,13 +131,13 @@ def threaded_process_pdf(pdf_path, num_threads=4, **kwargs):
             try:
                 results[page_num] = future.result()
             except Exception as e:
-                print(f"Error processing page {page_num}: {e}")
+                print(f'Error processing page {page_num}: {e}')
                 results[page_num] = None

     # Close the document
     doc.close()


-if __name__ == "__main__":
+if __name__ == '__main__':
     pdf = fitz.open('/tmp/[MS-DOC].pdf')
...

@@ -167,6 +168,3 @@ if __name__ == "__main__":
     8 7.900 sec
     16 7.984 sec
     """
magic_pdf/model/doc_analyze_by_custom_model.py

+import concurrent.futures as fut
+import multiprocessing as mp
 import os
 import time
-import torch

 import numpy as np
-import multiprocessing as mp
-import concurrent.futures as fut
+import torch

 os.environ['FLAGS_npu_jit_compile'] = '0'  # disable Paddle's JIT compilation
 os.environ['FLAGS_use_stride_kernel'] = '0'
 os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # allow MPS to fall back to CPU
...

@@ -29,6 +31,7 @@ from magic_pdf.libs.config_reader import (get_device, get_formula_config,
                                           get_local_models_dir,
                                           get_table_recog_config)
 from magic_pdf.model.model_list import MODEL
 # from magic_pdf.operators.models import InferenceResult

 MIN_BATCH_INFERENCE_SIZE = 100
...

@@ -310,14 +313,14 @@ def may_batch_image_analyze(
     device = get_device()

     npu_support = False
-    if str(device).startswith("npu"):
+    if str(device).startswith('npu'):
         import torch_npu
         if torch_npu.npu.is_available():
             npu_support = True
             torch.npu.set_compile_mode(jit_compile=False)

     if torch.cuda.is_available() and device != 'cpu' or npu_support:
-        gpu_memory = int(os.getenv("VIRTUAL_VRAM_SIZE", round(get_vram(device))))
+        gpu_memory = int(os.getenv('VIRTUAL_VRAM_SIZE', round(get_vram(device))))
         if gpu_memory is not None and gpu_memory >= 8:
             if gpu_memory >= 20:
                 batch_ratio = 16
...

@@ -398,4 +401,3 @@ def may_batch_image_analyze(
         f' speed: {doc_analyze_speed} pages/second')

     return (idx, results)
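
The hunk in may_batch_image_analyze shows how the batch size is scaled to the accelerator: the available VRAM is read, with a VIRTUAL_VRAM_SIZE environment-variable override, and mapped to a batch_ratio. A condensed sketch of that selection logic, where detect_vram_gb is a hypothetical stand-in for magic_pdf's get_vram and the lower tiers are only illustrative (the diff shows just the >= 20 GB tier):

import os


def detect_vram_gb():
    # Hypothetical stand-in for magic_pdf's get_vram(device).
    return 8


def choose_batch_ratio():
    # VIRTUAL_VRAM_SIZE overrides the detected VRAM, as in the diff.
    gpu_memory = int(os.getenv('VIRTUAL_VRAM_SIZE', round(detect_vram_gb())))
    if gpu_memory >= 20:
        return 16  # tier visible in the diff
    if gpu_memory >= 8:
        return 4   # illustrative lower tier; the real table is elided above
    return 1       # fallback when there is too little VRAM


print(choose_batch_ratio())  # -> 4 with the stand-in value above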
magic_pdf/model/sub_modules/model_init.py

 import os

 import torch
 from loguru import logger

 from magic_pdf.config.constants import MODEL_NAME
 from magic_pdf.model.model_list import AtomicModel
-from magic_pdf.model.sub_modules.language_detection.yolov11.YOLOv11 import YOLOv11LangDetModel
-from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import DocLayoutYOLOModel
-from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import Layoutlmv3_Predictor
+from magic_pdf.model.sub_modules.language_detection.yolov11.YOLOv11 import \
+    YOLOv11LangDetModel
+from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import \
+    DocLayoutYOLOModel
+from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import \
+    Layoutlmv3_Predictor
 from magic_pdf.model.sub_modules.mfd.yolov8.YOLOv8 import YOLOv8MFDModel
 from magic_pdf.model.sub_modules.mfr.unimernet.Unimernet import UnimernetModel

 try:
-    from magic_pdf_ascend_plugin.libs.license_verifier import load_license, LicenseFormatError, LicenseSignatureError, LicenseExpiredError
-    from magic_pdf_ascend_plugin.model_plugin.ocr.paddleocr.ppocr_273_npu import ModifiedPaddleOCR
-    from magic_pdf_ascend_plugin.model_plugin.table.rapidtable.rapid_table_npu import RapidTableModel
+    from magic_pdf_ascend_plugin.libs.license_verifier import (
+        LicenseExpiredError, LicenseFormatError, LicenseSignatureError,
+        load_license)
+    from magic_pdf_ascend_plugin.model_plugin.ocr.paddleocr.ppocr_273_npu import \
+        ModifiedPaddleOCR
+    from magic_pdf_ascend_plugin.model_plugin.table.rapidtable.rapid_table_npu import \
+        RapidTableModel
     license_key = load_license()
     logger.info(f'Using Ascend Plugin Success, License id is {license_key["payload"]["id"]},'
                 f' License expired at {license_key["payload"]["date"]["end_date"]}')
...

@@ -21,21 +29,24 @@ except Exception as e:
     if isinstance(e, ImportError):
         pass
     elif isinstance(e, LicenseFormatError):
-        logger.error("Ascend Plugin: Invalid license format. Please check the license file.")
+        logger.error('Ascend Plugin: Invalid license format. Please check the license file.')
     elif isinstance(e, LicenseSignatureError):
-        logger.error("Ascend Plugin: Invalid signature. The license may be tampered with.")
+        logger.error('Ascend Plugin: Invalid signature. The license may be tampered with.')
     elif isinstance(e, LicenseExpiredError):
-        logger.error("Ascend Plugin: License has expired. Please renew your license.")
+        logger.error('Ascend Plugin: License has expired. Please renew your license.')
     elif isinstance(e, FileNotFoundError):
-        logger.error("Ascend Plugin: Not found License file.")
+        logger.error('Ascend Plugin: Not found License file.')
     else:
-        logger.error(f"Ascend Plugin: {e}")
+        logger.error(f'Ascend Plugin: {e}')
     from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import ModifiedPaddleOCR
     # from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR
     from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel

-from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import StructTableModel
-from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import TableMasterPaddleModel
+from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import \
+    StructTableModel
+from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import \
+    TableMasterPaddleModel


 def table_model_init(table_model_type, model_path, max_time, _device_='cpu', ocr_engine=None, table_sub_model_name=None):
     if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
...

@@ -56,7 +67,7 @@ def table_model_init(table_model_type, model_path, max_time, _device_='cpu', ocr
 def mfd_model_init(weight, device='cpu'):
-    if str(device).startswith("npu"):
+    if str(device).startswith('npu'):
         device = torch.device(device)
     mfd_model = YOLOv8MFDModel(weight, device)
     return mfd_model
...

@@ -73,14 +84,14 @@ def layout_model_init(weight, config_file, device):
 def doclayout_yolo_model_init(weight, device='cpu'):
-    if str(device).startswith("npu"):
+    if str(device).startswith('npu'):
         device = torch.device(device)
     model = DocLayoutYOLOModel(weight, device)
     return model


 def langdetect_model_init(langdetect_model_weight, device='cpu'):
-    if str(device).startswith("npu"):
+    if str(device).startswith('npu'):
         device = torch.device(device)
     model = YOLOv11LangDetModel(langdetect_model_weight, device)
     return model
...
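
Several of the *_model_init functions repeat the same guard: when the configured device string names an Ascend NPU, it is wrapped in torch.device before being handed to the model. A tiny helper capturing that pattern (hypothetical; MinerU keeps the check inline in each function):

import torch


def normalize_device(device='cpu'):
    """Wrap 'npu*' device strings in torch.device; pass everything else through."""
    if str(device).startswith('npu'):
        # Actually using the device requires torch_npu to be installed.
        return torch.device(device)
    return device


print(normalize_device('cpu'))  # 'cpu' passes through unchanged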
magic_pdf/tools/cli.py

 import os
 import shutil
 import tempfile
+from pathlib import Path

 import click
 import fitz
 from loguru import logger
-from pathlib import Path

 import magic_pdf.model as model_config
-from magic_pdf.data.data_reader_writer import FileBasedDataReader
+from magic_pdf.data.batch_build_dataset import batch_build_dataset
+from magic_pdf.data.data_reader_writer import FileBasedDataReader
+from magic_pdf.data.dataset import Dataset
 from magic_pdf.libs.version import __version__
-from magic_pdf.tools.common import do_parse, parse_pdf_methods, batch_do_parse
+from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
 from magic_pdf.utils.office_to_pdf import convert_file_to_pdf

 pdf_suffixes = ['.pdf']
 ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
 image_suffixes = ['.png', '.jpeg', '.jpg']
...

@@ -97,18 +97,18 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
 def read_fn(path: Path):
     if path.suffix in ms_office_suffixes:
         convert_file_to_pdf(str(path), temp_dir)
-        fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+        fn = os.path.join(temp_dir, f'{path.stem}.pdf')
     elif path.suffix in image_suffixes:
         with open(str(path), 'rb') as f:
             bits = f.read()
         pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
-        fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+        fn = os.path.join(temp_dir, f'{path.stem}.pdf')
         with open(fn, 'wb') as f:
             f.write(pdf_bytes)
     elif path.suffix in pdf_suffixes:
         fn = str(path)
     else:
-        raise Exception(f"Unknown file suffix: {path.suffix}")
+        raise Exception(f'Unknown file suffix: {path.suffix}')

     disk_rw = FileBasedDataReader(os.path.dirname(fn))
     return disk_rw.read(os.path.basename(fn))
...
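
read_fn's job in the CLI is to normalize any supported input into PDF bytes: Office files go through convert_file_to_pdf, images are wrapped into a one-page PDF via PyMuPDF's convert_to_pdf, and PDFs are read as-is. A stripped-down sketch of the image branch only, mirroring the calls visible in the diff (the paths in the usage comment are illustrative):

import fitz  # PyMuPDF


def image_to_pdf_bytes(image_path: str) -> bytes:
    """Wrap a PNG/JPEG into single-page PDF bytes, as read_fn does for images."""
    with open(image_path, 'rb') as f:
        bits = f.read()
    # fitz opens the raw image bytes and converts the one-page document to PDF.
    return fitz.open(stream=bits).convert_to_pdf()


# Hypothetical usage:
# pdf_bytes = image_to_pdf_bytes('/tmp/page.png')
# with open('/tmp/page.pdf', 'wb') as out:
#     out.write(pdf_bytes)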
magic_pdf/tools/common.py

...

@@ -8,10 +8,10 @@ import magic_pdf.model as model_config
 from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
-from magic_pdf.data.dataset import PymuDocDataset, Dataset
+from magic_pdf.data.dataset import Dataset, PymuDocDataset
 from magic_pdf.libs.draw_bbox import draw_char_bbox
-from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze, batch_doc_analyze
+from magic_pdf.model.doc_analyze_by_custom_model import (batch_doc_analyze,
+                                                         doc_analyze)

 # from io import BytesIO
 # from pypdf import PdfReader, PdfWriter
...