Merge branch 'master' of github.com:opendatalab/MinerU

d01acab4 · 徐超 · 9ec91339 · 2cb82b7f · d01acab4 · d01acab4
Commit d01acab4 authored Jul 26, 2024 by 徐超
15 changed files
--- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
@@ -36,9 +36,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    '''任何框体与舍弃框重叠，优先信任舍弃框'''
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
-    # @todo interline_equation 与title或text框冲突的情况，分两种情况处理
+
+    # interline_equation 与title或text框冲突的情况，分两种情况处理
    '''interline_equation框与文本类型框iou比较接近1的时候，信任行间公式框'''
+    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    '''interline_equation框被包含在文本类型框内，且interline_equation比文本区块小很多时信任文本框，这时需要舍弃公式框'''
+    # 通过后续大框套小框逻辑删除

    '''discarded_blocks中只保留宽度超过1/3页面宽度的，高度超过10的，处于页面下半50%区域的（限定footnote）'''
    for discarded in discarded_blocks:
@@ -57,6 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
    return all_bboxes, all_discarded_blocks, drop_reasons


+def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
+    # Resolve Text vs. InterlineEquation overlaps: first collect both kinds of block (block[7] is compared against BlockType).
+    text_blocks = []
+    for block in all_bboxes:
+        if block[7] == BlockType.Text:
+            text_blocks.append(block)
+    interline_equation_blocks = []
+    for block in all_bboxes:
+        if block[7] == BlockType.InterlineEquation:
+            interline_equation_blocks.append(block)
+
+    need_remove = []  # text blocks to drop; collected first so each one is removed from all_bboxes exactly once
+
+    for interline_equation_block in interline_equation_blocks:
+        for text_block in text_blocks:
+            interline_equation_block_bbox = interline_equation_block[:4]  # the first four items are what gets passed as the bbox
+            text_block_bbox = text_block[:4]
+            if calculate_iou(interline_equation_block_bbox, text_block_bbox) > 0.8:  # IoU above 0.8: trust the equation block, drop the text block
+                if text_block not in need_remove:  # a text block may match several equations; record it only once
+                    need_remove.append(text_block)
+
+    if len(need_remove) > 0:
+        for block in need_remove:
+            all_bboxes.remove(block)  # mutates the caller's list in place
+
+    return all_bboxes  # same list object that was passed in, minus the dropped text blocks
+
+
 def fix_text_overlap_title_blocks(all_bboxes):
    # 先提取所有text和title block
    text_blocks = []
@@ -68,12 +99,19 @@ def fix_text_overlap_title_blocks(all_bboxes):
        if block[7] == BlockType.Title:
            title_blocks.append(block)

+    need_remove = []
+
    for text_block in text_blocks:
        for title_block in title_blocks:
            text_block_bbox = text_block[:4]
            title_block_bbox = title_block[:4]
            if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
-                all_bboxes.remove(title_block)
+                if title_block not in need_remove:
+                    need_remove.append(title_block)
+
+    if len(need_remove) > 0:
+        for block in need_remove:
+            all_bboxes.remove(block)

    return all_bboxes


--- a/magic_pdf/pre_proc/ocr_span_list_modify.py
+++ b/magic_pdf/pre_proc/ocr_span_list_modify.py
@@ -5,19 +5,24 @@ from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, g
 from magic_pdf.libs.drop_tag import DropTag
 from magic_pdf.libs.ocr_content_type import ContentType, BlockType

+
 def remove_overlaps_low_confidence_spans(spans):
    dropped_spans = []
    #  删除重叠spans中置信度低的的那些
    for span1 in spans:
        for span2 in spans:
            if span1 != span2:
-                if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
-                    if span1['score'] < span2['score']:
-                        span_need_remove = span1
-                    else:
-                        span_need_remove = span2
-                    if span_need_remove is not None and span_need_remove not in dropped_spans:
-                        dropped_spans.append(span_need_remove)
+                # span1 或 span2 任何一个都不应该在 dropped_spans 中
+                if span1 in dropped_spans or span2 in dropped_spans:
+                    continue
+                else:
+                    if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
+                        if span1['score'] < span2['score']:
+                            span_need_remove = span1
+                        else:
+                            span_need_remove = span2
+                        if span_need_remove is not None and span_need_remove not in dropped_spans:
+                            dropped_spans.append(span_need_remove)

    if len(dropped_spans) > 0:
        for span_need_remove in dropped_spans:

--- a/magic_pdf/resources/fasttext-langdetect/lid.176.ftz
+++ b/magic_pdf/resources/fasttext-langdetect/lid.176.ftz
--- a/magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml
+++ b/magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml
 AUG:
  DETR: true
-CACHE_DIR: /mnt/localdata/users/yupanhuang/cache/huggingface
+CACHE_DIR: ~/cache/huggingface
 CUDNN_BENCHMARK: false
 DATALOADER:
  ASPECT_RATIO_GROUPING: true
@@ -294,7 +294,7 @@ MODEL:
    POS_TYPE: abs
  WEIGHTS: 
 OUTPUT_DIR: 
-SCIHUB_DATA_DIR_TRAIN: /mnt/petrelfs/share_data/zhaozhiyuan/publaynet/layout_scihub/train
+SCIHUB_DATA_DIR_TRAIN: ~/publaynet/layout_scihub/train
 SEED: 42
 SOLVER:
  AMP:

--- a/requirements.txt
+++ b/requirements.txt
@@ -7,5 +7,5 @@ numpy>=1.21.6
 fast-langdetect>=0.2.1
 wordninja>=2.0.0
 scikit-learn>=1.0.2
-pdfminer.six>=20231228
+pdfminer.six==20231228
 # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
--- a/setup.py
+++ b/setup.py
@@ -32,9 +32,8 @@ if __name__ == '__main__':
        },
        install_requires=parse_requirements('requirements.txt'),  # 项目依赖的第三方库
        extras_require={
-            "gpu": ["paddleocr==2.7.3", "paddlepaddle-gpu"],
-            "cpu": ["paddleocr==2.7.3", "paddlepaddle"],
-            "full-cpu": ["unimernet", "matplotlib", "ultralytics", "paddleocr==2.7.3", "paddlepaddle"],
+            "lite": ["paddleocr==2.7.3", "paddlepaddle", "paddlepaddle-gpu"],
+            "full": ["unimernet", "matplotlib", "ultralytics", "paddleocr==2.7.3", "paddlepaddle", "paddlepaddle-gpu"],
        },
        description="A practical tool for converting PDF to Markdown",  # 简短描述
        long_description=long_description,  # 详细描述

--- a/signatures/version1/cla.json
+++ b/signatures/version1/cla.json
+{
+   "signedContributors": [
+   ]
+}
\ No newline at end of file
--- a/tests/__pycache__/test_unit.cpython-39-pytest-7.4.0.pyc
+++ b/tests/__pycache__/test_unit.cpython-39-pytest-7.4.0.pyc
--- a/tests/test_cli/__pycache__/test_bench.cpython-39-pytest-7.4.0.pyc
+++ b/tests/test_cli/__pycache__/test_bench.cpython-39-pytest-7.4.0.pyc
--- a/tests/test_cli/__pycache__/test_cli.cpython-39-pytest-7.4.0.pyc
+++ b/tests/test_cli/__pycache__/test_cli.cpython-39-pytest-7.4.0.pyc
--- a/tests/test_cli/conf/__pycache__/conf.cpython-39.pyc
+++ b/tests/test_cli/conf/__pycache__/conf.cpython-39.pyc
--- a/tests/test_cli/lib/__pycache__/__init__.cpython-39.pyc
+++ b/tests/test_cli/lib/__pycache__/__init__.cpython-39.pyc
--- a/tests/test_cli/lib/__pycache__/calculate_score.cpython-39.pyc
+++ b/tests/test_cli/lib/__pycache__/calculate_score.cpython-39.pyc
--- a/tests/test_cli/lib/__pycache__/scoring.cpython-39.pyc
+++ b/tests/test_cli/lib/__pycache__/scoring.cpython-39.pyc
--- a/tests/test_cli/test_bench_gpu.py
+++ b/tests/test_cli/test_bench_gpu.py
@@ -6,7 +6,7 @@ import json
 from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from lib import calculate_score
-
+import shutil
 pdf_res_path = conf.conf["pdf_res_path"]
 code_path = conf.conf["code_path"]
 pdf_dev_path = conf.conf["pdf_dev_path"]
@@ -58,8 +58,8 @@ def pdf_to_markdown():
        if not os.path.exists(dir_path):
            os.makedirs(dir_path, exist_ok=True)
        res_path = os.path.join(dir_path, f"{demo_name}.md")
-        #src_path = os.path.join(pdf_res_path, "pdf", f"{demo_name}.pdf") 
-        #shutil.copy(src_path, res_path)
+        src_path = os.path.join(pdf_res_path, demo_name, "auto", f"{demo_name}.md")
+        shutil.copy(src_path, res_path)