fix: add new enum values in enum_class.py and improve MIN_BATCH_INFERENCE_SIZE documentation in pipeline_analyze.py

58b8e8a9 · myhloli · 20dcbd21 · 58b8e8a9 · 58b8e8a9
Commit 58b8e8a9 authored Jun 17, 2025 by myhloli
Hide whitespace changes
Inline Side-by-side

Showing with 8 additions and 2 deletions

mineru/backend/pipeline/pipeline_analyze.py +6 -2

mineru/utils/enum_class.py +2 -0

No files found.
--- a/mineru/backend/pipeline/pipeline_analyze.py
+++ b/mineru/backend/pipeline/pipeline_analyze.py
@@ -76,7 +76,11 @@ def doc_analyze(
        formula_enable=True,
        table_enable=True,
 ):
-    MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
+    """
+    适当调大MIN_BATCH_INFERENCE_SIZE可以提高性能，可能会增加显存使用量，
+    可通过环境变量MINERU_MIN_BATCH_INFERENCE_SIZE设置，默认值为100。
+    """
+    min_batch_inference_size = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
    # 收集所有页面信息
    all_pages_info = []  # 存储(dataset_index, page_index, img, ocr, lang, width, height)
@@ -109,7 +113,7 @@ def doc_analyze(
    # 准备批处理
    images_with_extra_info = [(info[2], info[3], info[4]) for info in all_pages_info]
-    batch_size = MIN_BATCH_INFERENCE_SIZE
+    batch_size = min_batch_inference_size
    batch_images = [
        images_with_extra_info[i:i + batch_size]
        for i in range(0, len(images_with_extra_info), batch_size)

--- a/mineru/utils/enum_class.py
+++ b/mineru/utils/enum_class.py
@@ -33,9 +33,11 @@ class CategoryId:
    TableCaption = 6
    TableFootnote = 7
    InterlineEquation_Layout = 8
+    InterlineEquationNumber_Layout = 9
    InlineEquation = 13
    InterlineEquation_YOLO = 14
    OcrText = 15
+    LowScoreText = 16
    ImageFootnote = 101