"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "cdda0906140d2ef2f0acadf4750351d9f9380f7f"
Commit 101b12a1 authored by myhloli


refactor: improve image handling by transitioning from NumPy arrays to PIL images in cropping functions
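In practice, each page image now travels through BatchAnalyze as a PIL image and is only materialised as a NumPy/BGR array at the point where OpenCV or the OCR detector needs one. A minimal, self-contained sketch of that flow (placeholder image and box values, not the actual pipeline code):

```python
import cv2
import numpy as np
from PIL import Image

# The page is now held as a PIL image rather than an ndarray.
pil_img = Image.new("RGB", (800, 600), "white")  # placeholder page

# Crop a detected region and paste it onto a padded white canvas,
# mirroring what the PIL branch of crop_img (below) does.
xmin, ymin, xmax, ymax = 100, 100, 300, 200      # made-up layout box
pad = 50
canvas = Image.new("RGB", (xmax - xmin + 2 * pad, ymax - ymin + 2 * pad), "white")
canvas.paste(pil_img.crop((xmin, ymin, xmax, ymax)), (pad, pad))

# OpenCV-based OCR still expects a BGR ndarray, so convert only at that boundary.
bgr = cv2.cvtColor(np.asarray(canvas), cv2.COLOR_RGB2BGR)
print(bgr.shape)  # (200, 300, 3)
```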
parent a9abb4e2
@@ -71,7 +71,7 @@ class BatchAnalyze:
         for index in range(len(images)):
             _, ocr_enable, _lang = images_with_extra_info[index]
             layout_res = images_layout_res[index]
-            np_array_img = images[index]
+            pil_img = images[index]
             ocr_res_list, table_res_list, single_page_mfdetrec_res = (
                 get_res_list_from_layout_res(layout_res)
@@ -80,13 +80,13 @@ class BatchAnalyze:
             ocr_res_list_all_page.append({'ocr_res_list':ocr_res_list,
                                           'lang':_lang,
                                           'ocr_enable':ocr_enable,
-                                          'np_array_img':np_array_img,
+                                          'pil_img':pil_img,
                                           'single_page_mfdetrec_res':single_page_mfdetrec_res,
                                           'layout_res':layout_res,
                                           })
             for table_res in table_res_list:
-                table_img, _ = crop_img(table_res, np_array_img)
+                table_img, _ = crop_img(table_res, pil_img)
                 table_res_list_all_page.append({'table_res':table_res,
                                                 'lang':_lang,
                                                 'table_img':table_img,
@@ -103,14 +103,14 @@ class BatchAnalyze:
             for res in ocr_res_list_dict['ocr_res_list']:
                 new_image, useful_list = crop_img(
-                    res, ocr_res_list_dict['np_array_img'], crop_paste_x=50, crop_paste_y=50
+                    res, ocr_res_list_dict['pil_img'], crop_paste_x=50, crop_paste_y=50
                 )
                 adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(
                     ocr_res_list_dict['single_page_mfdetrec_res'], useful_list
                 )
                 # BGR conversion
-                new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)
+                new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
                 all_cropped_images_info.append((
                     new_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang
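The only array-specific step left in this loop is the colour conversion: a PIL crop has to go through np.asarray before cv2.cvtColor will accept it, and PIL's RGB channel order has to be swapped to OpenCV's BGR. A tiny sanity check of that conversion (toy 4×4 image, not pipeline code):

```python
import cv2
import numpy as np
from PIL import Image

pil_crop = Image.new("RGB", (4, 4), (255, 0, 0))   # pure red, RGB order
rgb = np.asarray(pil_crop)                          # shape (4, 4, 3), dtype uint8
bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

print(rgb[0, 0])   # [255   0   0]
print(bgr[0, 0])   # [  0   0 255]  -- the red value moves to the last channel in BGR order
```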
@@ -215,37 +215,36 @@ class BatchAnalyze:
                 )
             for res in ocr_res_list_dict['ocr_res_list']:
                 new_image, useful_list = crop_img(
-                    res, ocr_res_list_dict['np_array_img'], crop_paste_x=50, crop_paste_y=50
+                    res, ocr_res_list_dict['pil_img'], crop_paste_x=50, crop_paste_y=50
                 )
                 adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(
                     ocr_res_list_dict['single_page_mfdetrec_res'], useful_list
                 )
-
                 # OCR-det
-                new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)
+                new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
                 ocr_res = ocr_model.ocr(
                     new_image, mfd_res=adjusted_mfdetrec_res, rec=False
                 )[0]

                 # Integration results
                 if ocr_res:
                     ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'],
                                                           new_image, _lang)

                     if res["category_id"] == 3:
                         # sum of the areas of all bboxes in ocr_result_list
                         ocr_res_area = sum(
                             get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
                         # ratio of ocr_res_area to the area of res
                         res_area = get_coords_and_area(res)[4]
                         if res_area > 0:
                             ratio = ocr_res_area / res_area
                             if ratio > 0.25:
                                 res["category_id"] = 1
                             else:
                                 continue

                     ocr_res_list_dict['layout_res'].extend(ocr_result_list)

             # 表格识别 table recognition
             if self.table_enable:
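The branch shown at the end of this hunk decides whether a category-3 region really behaves like text: it sums the areas of the OCR boxes detected inside the region and compares that to the region's own area, reclassifying to category 1 when coverage exceeds 25%. A toy worked example of the threshold (numbers invented for illustration; get_coords_and_area()[4] is the area, as used above):

```python
# Made-up areas for one detected region and the OCR boxes found inside it.
res_area = 10000.0       # area of the category-3 region
ocr_res_area = 3200.0    # summed area of OCR text boxes inside it

ratio = ocr_res_area / res_area   # 0.32
if ratio > 0.25:
    category_id = 1               # enough text coverage: treat the region as text
else:
    category_id = 3               # otherwise keep category 3 and skip merging the boxes

print(ratio, category_id)         # 0.32 1
```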
...
@@ -8,13 +8,13 @@ import pypdfium2 as pdfium
 from loguru import logger
 from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
-from ..api.vlm_middle_json_mkcontent import union_make
-from ..backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
-from ..backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
-from ..data.data_reader_writer import FileBasedDataWriter
-from ..utils.draw_bbox import draw_layout_bbox, draw_span_bbox
-from ..utils.enum_class import MakeMode
-from ..utils.pdf_image_tools import images_bytes_to_pdf_bytes
+from mineru.api.vlm_middle_json_mkcontent import union_make
+from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
+from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
+from mineru.data.data_reader_writer import FileBasedDataWriter
+from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
+from mineru.utils.enum_class import MakeMode
+from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
 pdf_suffixes = [".pdf"]
 image_suffixes = [".png", ".jpeg", ".jpg"]
@@ -211,11 +211,9 @@ def do_parse(
 if __name__ == "__main__":
-    pdf_path = "../../demo/demo2.pdf"
+    pdf_path = "../../demo/pdfs/demo2.pdf"
     with open(pdf_path, "rb") as f:
         try:
-            result = do_parse("./output", Path(pdf_path).stem, f.read())
+            do_parse("./output", [Path(pdf_path).stem], [f.read()], ["ch"],)
         except Exception as e:
             logger.exception(e)
-        # convert the dict to JSON
-        print(json.dumps(result, ensure_ascii=False, indent=4))
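The __main__ block now drives do_parse with parallel lists (one entry per document: name stem, PDF bytes, language) instead of a single file, and no longer prints a returned dict. A hedged sketch of a batch-style call, mirroring the positional form above (do_parse is the function defined in this same file; the demo paths are assumptions):

```python
from pathlib import Path

pdf_paths = ["../../demo/pdfs/demo1.pdf", "../../demo/pdfs/demo2.pdf"]  # assumed demo files

names, pdf_bytes_list, langs = [], [], []
for p in pdf_paths:
    names.append(Path(p).stem)                   # output name stem per document
    pdf_bytes_list.append(Path(p).read_bytes())  # raw PDF bytes per document
    langs.append("ch")                           # OCR language per document

# Parallel lists, one entry per input PDF, as in the updated __main__ block.
do_parse("./output", names, pdf_bytes_list, langs)
```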
 import time
 import torch
 import gc
+from PIL import Image
 from loguru import logger
 import numpy as np
 from mineru.utils.boxbase import get_minbox_if_overlap_by_ratio

-def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
+def crop_img(input_res, input_img, crop_paste_x=0, crop_paste_y=0):
     crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
     crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5])
@@ -16,15 +17,24 @@ def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
     crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2
     crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2

-    # Create a white background array
-    return_image = np.ones((crop_new_height, crop_new_width, 3), dtype=np.uint8) * 255
-    # Crop the original image using numpy slicing
-    cropped_img = input_np_img[crop_ymin:crop_ymax, crop_xmin:crop_xmax]
-    # Paste the cropped image onto the white background
-    return_image[crop_paste_y:crop_paste_y + (crop_ymax - crop_ymin),
-                 crop_paste_x:crop_paste_x + (crop_xmax - crop_xmin)] = cropped_img
+    if isinstance(input_img, np.ndarray):
+        # Create a white background array
+        return_image = np.ones((crop_new_height, crop_new_width, 3), dtype=np.uint8) * 255
+        # Crop the original image using numpy slicing
+        cropped_img = input_img[crop_ymin:crop_ymax, crop_xmin:crop_xmax]
+        # Paste the cropped image onto the white background
+        return_image[crop_paste_y:crop_paste_y + (crop_ymax - crop_ymin),
+                     crop_paste_x:crop_paste_x + (crop_xmax - crop_xmin)] = cropped_img
+    else:
+        # Create a white background image
+        return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
+        # Crop image
+        crop_box = (crop_xmin, crop_ymin, crop_xmax, crop_ymax)
+        cropped_img = input_img.crop(crop_box)
+        return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))

     return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width,
                    crop_new_height]
...
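With both branches in place, the same call site works whether the page is an ndarray or a PIL image, and the offsets recorded in return_list are identical either way. A small usage sketch (assumes crop_img as defined above is in scope; box coordinates are made up):

```python
import numpy as np
from PIL import Image

# A fake layout result; crop_img only reads the poly corners used above.
res = {'poly': [100, 100, 300, 100, 300, 200, 100, 200]}

np_page = np.full((600, 800, 3), 255, dtype=np.uint8)    # ndarray page, (H, W, C)
pil_page = Image.new('RGB', (800, 600), 'white')          # equivalent PIL page, (W, H)

crop_a, useful_a = crop_img(res, np_page, crop_paste_x=50, crop_paste_y=50)   # ndarray branch
crop_b, useful_b = crop_img(res, pil_page, crop_paste_x=50, crop_paste_y=50)  # PIL branch

print(crop_a.shape)           # (200, 300, 3) -- ndarray reports (H, W, C)
print(crop_b.size)            # (300, 200)    -- PIL reports (W, H)
print(useful_a == useful_b)   # True: same paste offsets and crop box in both cases
```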