Merge pull request #1166 from myhloli/dev

fix(pre_proc): prevent errors when imageWriter is None

Merge pull request #1166 from myhloli/dev
fix(pre_proc): prevent errors when imageWriter is None
a7296f78 · Xiaomeng Zhao · GitHub · 384e0379 · b0529b6f · a7296f78
Unverified commit a7296f78, authored Dec 02, 2024 by Xiaomeng Zhao; committed by GitHub on Dec 02, 2024.
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 4 deletions

magic_pdf/data/utils.py (+2 -2)

magic_pdf/pre_proc/cut_image.py (+2 -2)

No files found.
--- a/magic_pdf/data/utils.py
+++ b/magic_pdf/data/utils.py
@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
    mat = fitz.Matrix(dpi / 72, dpi / 72)
    pm = doc.get_pixmap(matrix=mat, alpha=False)
-    # If the width or height exceeds 9000 after scaling, do not scale further.
+    # If the width or height exceeds 4500 after scaling, do not scale further.
-    if pm.width > 9000 or pm.height > 9000:
+    if pm.width > 4500 or pm.height > 4500:
        pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
    img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)

--- a/magic_pdf/pre_proc/cut_image.py
+++ b/magic_pdf/pre_proc/cut_image.py
@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
    for span in spans:
        span_type = span['type']
        if span_type == ContentType.Image:
-            if not check_img_bbox(span['bbox']):
+            if not check_img_bbox(span['bbox']) or not imageWriter:
                continue
            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
                                           imageWriter=imageWriter)
        elif span_type == ContentType.Table:
-            if not check_img_bbox(span['bbox']):
+            if not check_img_bbox(span['bbox']) or not imageWriter:
                continue
            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
                                           imageWriter=imageWriter)