"docs/developer_guide/setup_github_runner.md" did not exist on "e8e18dcdcca0e6d4eacccd074bea9da2ad6a3e18"
Unverified Commit a7296f78 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1166 from myhloli/dev

fix(pre_proc): prevent errors when imageWriter is None
parents 384e0379 b0529b6f
...@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict: ...@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
mat = fitz.Matrix(dpi / 72, dpi / 72) mat = fitz.Matrix(dpi / 72, dpi / 72)
pm = doc.get_pixmap(matrix=mat, alpha=False) pm = doc.get_pixmap(matrix=mat, alpha=False)
# If the width or height exceeds 9000 after scaling, do not scale further. # If the width or height exceeds 4500 after scaling, do not scale further.
if pm.width > 9000 or pm.height > 9000: if pm.width > 4500 or pm.height > 4500:
pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False) pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples) img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
......
...@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter): ...@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
for span in spans: for span in spans:
span_type = span['type'] span_type = span['type']
if span_type == ContentType.Image: if span_type == ContentType.Image:
if not check_img_bbox(span['bbox']): if not check_img_bbox(span['bbox']) or not imageWriter:
continue continue
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'), span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
imageWriter=imageWriter) imageWriter=imageWriter)
elif span_type == ContentType.Table: elif span_type == ContentType.Table:
if not check_img_bbox(span['bbox']): if not check_img_bbox(span['bbox']) or not imageWriter:
continue continue
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'), span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
imageWriter=imageWriter) imageWriter=imageWriter)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment