utils.py 5.21 KB
Newer Older
1

icecraft's avatar
icecraft committed
2
3
import multiprocessing as mp
import threading
icecraft's avatar
icecraft committed
4
5
6
from concurrent.futures import (ProcessPoolExecutor, ThreadPoolExecutor,
                                as_completed)

7
8
import fitz
import numpy as np
9
from loguru import logger
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25



def fitz_doc_to_image(doc, dpi=200) -> dict:
    """Render a PyMuPDF page to a pixmap and expose it as a numpy array.

    Args:
        doc: despite the name, this is a single page object (anything with a
            PyMuPDF-style ``get_pixmap`` method), not a whole document.
        dpi (int, optional): target resolution; the page is rendered with a
            zoom factor of ``dpi / 72`` on both axes. Defaults to 200.

    Returns:
        dict: {'img': uint8 numpy array reshaped to (height, width, 3),
               'width': pixmap width, 'height': pixmap height}
    """
    zoom = dpi / 72
    pixmap = doc.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

    # When either side of the scaled render exceeds 4500 px, render again
    # with the identity matrix (no scaling) instead.
    if max(pixmap.width, pixmap.height) > 4500:
        pixmap = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

    height, width = pixmap.height, pixmap.width

    # alpha=False is requested above, so the sample buffer is reshaped as
    # 3 channels per pixel.
    pixels = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(height, width, 3)

    return {'img': pixels, 'width': width, 'height': height}
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59

def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
    """Render a range of pages of an in-memory PDF to numpy images.

    Args:
        pdf_bytes (bytes): raw content of the PDF file.
        dpi (int, optional): resolution passed to ``fitz_doc_to_image``.
            Defaults to 200.
        start_page_id (int, optional): first page index (0-based, inclusive)
            to render. Defaults to 0.
        end_page_id (int | None, optional): last page index (inclusive).
            ``None`` or a negative value means "up to the last page"; a value
            past the last page is clamped with a warning.

    Returns:
        list: one dict per page of the PDF, in page order. Pages inside
        ``[start_page_id, end_page_id]`` hold the result of
        ``fitz_doc_to_image``; pages outside the range hold the placeholder
        ``{'img': [], 'width': 0, 'height': 0}`` so that list indices always
        match page indices.
    """
    with fitz.open('pdf', pdf_bytes) as doc:
        last_page_id = doc.page_count - 1

        if end_page_id is None or end_page_id < 0:
            end_page_id = last_page_id
        elif end_page_id > last_page_id:
            logger.warning('end_page_id is out of range, use images length')
            end_page_id = last_page_id

        images = []
        for index in range(doc.page_count):
            if start_page_id <= index <= end_page_id:
                # Reuse the shared single-page renderer instead of duplicating
                # the scale / size-fallback / numpy conversion logic here.
                images.append(fitz_doc_to_image(doc[index], dpi=dpi))
            else:
                images.append({'img': [], 'width': 0, 'height': 0})
    return images
icecraft's avatar
icecraft committed
69

icecraft's avatar
icecraft committed
70

icecraft's avatar
icecraft committed
71
72
73
74
def convert_page(bytes_page):
    """Render the first page of a PDF given as bytes.

    Args:
        bytes_page (bytes): serialized PDF; only its first page is rendered.

    Returns:
        dict: the result of ``fitz_doc_to_image`` for that page, using that
        function's default dpi.
    """
    # Close the document when done so that repeated calls (one per page in
    # the process pool) do not accumulate open documents.
    with fitz.open('pdf', bytes_page) as pdfs:
        return fitz_doc_to_image(pdfs[0])
icecraft's avatar
icecraft committed
75

icecraft's avatar
icecraft committed
76
def parallel_process_pdf_safe(pages, num_workers=None, **kwargs):
    """Convert serialized PDF pages to images using a pool of processes.

    Args:
        pages: iterable of items accepted by ``convert_page`` (the script
            section of this file passes single-page PDFs serialized to bytes).
        num_workers (int | None, optional): number of worker processes;
            ``None`` means ``mp.cpu_count()``.
        **kwargs: accepted but currently not used.

    Returns:
        list: the ``convert_page`` result for every item of ``pages``, in
        input order.
    """
    worker_count = mp.cpu_count() if num_workers is None else num_workers

    with ProcessPoolExecutor(max_workers=worker_count) as pool:
        # executor.map yields results in input order; collect them all
        # before the pool is shut down.
        return list(pool.map(convert_page, pages))


def threaded_process_pdf(pdf_path, num_threads=4, **kwargs):
    """Process all pages of a PDF using multiple threads.

    Parameters:
    -----------
    pdf_path : str
        Path to the PDF file
    num_threads : int
        Number of threads to use
    **kwargs :
        Additional arguments for fitz_doc_to_image

    Returns:
    --------
    images : list
        List of processed images, in page order. The entry for a page whose
        conversion raised is None (the error is logged, not re-raised).
    """
    # NOTE(review): all worker threads share pages of one fitz document;
    # confirm against the PyMuPDF documentation that this is supported.
    # The context manager guarantees the document is closed even if
    # submitting or collecting tasks raises.
    with fitz.open(pdf_path) as doc:
        num_pages = len(doc)

        # Pre-sized so results can be stored by page index as tasks finish
        # out of order.
        results = [None] * num_pages

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Map each future back to the page it renders.
            futures = {
                executor.submit(fitz_doc_to_image, doc[page_num], **kwargs): page_num
                for page_num in range(num_pages)
            }

            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    results[page_num] = future.result()
                except Exception as e:
                    # Best effort: one failing page must not abort the others.
                    logger.error(f'Error processing page {page_num}: {e}')
                    results[page_num] = None

    return results

icecraft's avatar
icecraft committed
136
if __name__ == '__main__':
    # Split the source PDF into single-page PDFs serialized to bytes, so each
    # page can be handed to a worker process on its own.
    pdf_page = []
    with fitz.open('/tmp/[MS-DOC].pdf') as pdf:
        for i in range(pdf.page_count):
            with fitz.open() as single_page:
                single_page.insert_pdf(pdf, from_page=i, to_page=i)
                pdf_page.append(single_page.tobytes())

    results = parallel_process_pdf_safe(pdf_page, num_workers=16)

    # threaded_process_pdf('/tmp/[MS-DOC].pdf', num_threads=16)

    """ benchmark results of multi-threaded processing (fitz page to image)
    total page nums: 578
    thread nums,    time cost
    1               7.351 sec
    2               6.334 sec
    4               5.968 sec
    8               6.728 sec
    16              8.085 sec
    """

    """ benchmark results of multi-processor processing (fitz page to image)
    total page nums: 578
    processor nums,    time cost
    1                  17.170 sec
    2                  10.170 sec
    4                  7.841 sec
    8                  7.900 sec
    16                 7.984 sec
    """