refactor: clean up unused OCR area calculation and update demo PDF path

3334157f · myhloli · 236a6033 · 3334157f · 3334157f · 3334157f
Commit 3334157f authored Jun 05, 2025 by myhloli
3 changed files
--- a/mineru/backend/pipeline/batch_analyze.py
+++ b/mineru/backend/pipeline/batch_analyze.py
@@ -230,19 +230,6 @@ class BatchAnalyze:
                        ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'],
                                                              new_image, _lang)

-                        # if res["category_id"] == 3 and ocr_res_list_dict['ocr_enable']:
-                        #     # ocr_result_list中所有bbox的面积之和
-                        #     ocr_res_area = sum(
-                        #         get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
-                        #     # 求ocr_res_area和res的面积的比值
-                        #     res_area = get_coords_and_area(res)[4]
-                        #     if res_area > 0:
-                        #         ratio = ocr_res_area / res_area
-                        #         if ratio > 0.25:
-                        #             res["category_id"] = 1
-                        #         else:
-                        #             continue
-
                        ocr_res_list_dict['layout_res'].extend(ocr_result_list)

        # 表格识别 table recognition

--- a/mineru/backend/pipeline/model_json_to_middle_json.py
+++ b/mineru/backend/pipeline/model_json_to_middle_json.py
@@ -48,6 +48,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
    """获取所有的spans信息"""
    spans = magic_model.get_all_spans()

+    """某些图可能是文本块，通过简单的规则判断一下"""
    if len(maybe_text_image_blocks) > 0:
        for block in maybe_text_image_blocks:
            span_in_block_list = []
@@ -64,8 +65,10 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
                    if ratio > 0.25 and ocr:
                        # 移除block的group_id
                        block.pop('group_id', None)
+                        # 符合文本图的条件就把块加入到文本块列表中
                        text_blocks.append(block)
                    else:
+                        # 如果不符合文本图的条件，就把块加回到图片块列表中
                        img_body_blocks.append(block)
            else:
                img_body_blocks.append(block)

--- a/mineru/cli/common.py
+++ b/mineru/cli/common.py
@@ -215,9 +215,10 @@ def do_parse(


 if __name__ == "__main__":
-    pdf_path = "../../demo/pdfs/demo2.pdf"
-    with open(pdf_path, "rb") as f:
+    pdf_path = "../../demo/pdfs/hello-algo-1.1.0-zh-c-word转换的span有问题.pdf"
+    # pdf_path = "C:/Users/zhaoxiaomeng/Downloads/input_img_0.jpg"
+
    try:
-           do_parse("./output", [Path(pdf_path).stem], [f.read()],["ch"], end_page_id=20,)
+       do_parse("./output", [Path(pdf_path).stem], [read_fn(Path(pdf_path))],["ch"], end_page_id=20,)
    except Exception as e:
        logger.exception(e)