refactor: improve text processing by adding ligature and unicode replacement functions

f2115541 · myhloli · 76e1a7c1 · f2115541 · f2115541 · f2115541
Commit f2115541 authored Jun 04, 2025 by myhloli
3 changed files
--- a/mineru/backend/pipeline/batch_analyze.py
+++ b/mineru/backend/pipeline/batch_analyze.py
@@ -132,7 +132,6 @@ class BatchAnalyze:
                # 获取OCR模型
                ocr_model = atom_model_manager.get_atom_model(
                    atom_model_name='ocr',
-                    ocr_show_log=False,
                    det_db_box_thresh=0.3,
                    lang=lang
                )

--- a/mineru/utils/span_block_fix.py
+++ b/mineru/utils/span_block_fix.py
@@ -38,7 +38,7 @@ def fill_spans_in_blocks(blocks, spans, radio):


 def span_block_type_compatible(span_type, block_type):
-    if span_type in [ContentType.TEXT, ContentType.INTERLINE_EQUATION]:
+    if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
        return block_type in [
            BlockType.TEXT,
            BlockType.TITLE,

--- a/mineru/utils/span_pre_proc.py
+++ b/mineru/utils/span_pre_proc.py
 # Copyright (c) Opendatalab. All rights reserved.
+import re
 import cv2
 import numpy as np

@@ -100,6 +101,19 @@ def remove_overlaps_min_spans(spans):
    return spans, dropped_spans


+def __replace_ligatures(text: str):
+    """Expand Latin typographic ligature characters into plain letters.
+
+    Args:
+        text: String that may contain the single-codepoint ligatures
+            U+FB00 through U+FB06.
+
+    Returns:
+        A copy of ``text`` in which each listed ligature is replaced by the
+        letters it stands for; all other characters are left untouched.
+    """
+    ligatures = {
+        'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬀ': 'ff', 'ﬃ': 'ffi', 'ﬄ': 'ffl',
+        # U+FB05 is the "long s + t" ligature: the long s looks like an "f",
+        # but the glyph spells "st" (its NFKC form), exactly like U+FB06.
+        'ﬅ': 'st', 'ﬆ': 'st',
+    }
+    return re.sub('|'.join(map(re.escape, ligatures.keys())), lambda m: ligatures[m.group()], text)
+
+def __replace_unicode(text: str):
+    """Strip CR+LF pairs and turn U+0002 control characters into hyphens.
+
+    Args:
+        text: Raw string to clean.
+
+    Returns:
+        ``text`` with every carriage-return/line-feed pair removed and every
+        U+0002 character replaced by ``-``. The substitution is a single
+        left-to-right pass, so a lone CR or a lone LF is kept as is.
+    """
+    substitutions = {'\r\n': '', '\u0002': '-'}
+    pattern = '|'.join(re.escape(raw) for raw in substitutions)
+
+    def _swap(match):
+        return substitutions[match.group()]
+
+    return re.sub(pattern, _swap, text)
+
+
 def txt_spans_extract(pdf_page, spans, pil_img, scale):

    textpage = pdf_page.get_textpage()
@@ -117,6 +131,8 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale):
        text = textpage.get_text_bounded(left=rect_box[0], top=rect_box[1],
                                         right=rect_box[2], bottom=rect_box[3])
        if text and len(text) > 0:
+            text = __replace_unicode(text)
+            text = __replace_ligatures(text)
            span['content'] = text.strip()
            span['score'] = 1.0
        else: