wangsen / MinerU · Commit b3faee93 (unverified)

Authored Mar 19, 2025 by Xiaomeng Zhao; committed via GitHub on Mar 19, 2025

Merge pull request #1944 from icecraft/feat/remove_intree_parallel_inference

style: remove unused code

Parents: e33ec616, e9c24739
Showing 2 changed files with 22 additions and 78 deletions (+22 −78):

magic_pdf/model/doc_analyze_by_custom_model.py (+21 −77)
magic_pdf/tools/common.py (+1 −1)
magic_pdf/model/doc_analyze_by_custom_model.py
```diff
@@ -34,8 +34,6 @@ from magic_pdf.model.model_list import MODEL
 # from magic_pdf.operators.models import InferenceResult
 MIN_BATCH_INFERENCE_SIZE = 100

 class ModelSingleton:
     _instance = None
     _models = {}
```
```diff
@@ -143,17 +141,14 @@ def doc_analyze(
     layout_model=None,
     formula_enable=None,
     table_enable=None,
-    one_shot: bool = True,
 ):
     end_page_id = (
         end_page_id
         if end_page_id is not None and end_page_id >= 0
         else len(dataset) - 1
     )
-    parallel_count = None
-    if os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT'):
-        parallel_count = int(os.environ['MINERU_PARALLEL_INFERENCE_COUNT'])
+    MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
     images = []
     page_wh_list = []
     for index in range(len(dataset)):
```
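The hunk above drops the `MINERU_PARALLEL_INFERENCE_COUNT` lookup and the `one_shot` flag; the only knob left is `MINERU_MIN_BATCH_INFERENCE_SIZE`. A minimal sketch of how the added line behaves (the variable name and the default of 100 are taken from the diff):

```python
import os

# The threshold is read from the environment at call time; when the variable
# is unset, os.environ.get returns the int fallback 100, and int() accepts
# both that fallback and a string value such as '50'.
MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
print(MIN_BATCH_INFERENCE_SIZE)  # 100 unless MINERU_MIN_BATCH_INFERENCE_SIZE is exported
```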
```diff
@@ -163,41 +158,16 @@ def doc_analyze(
             images.append(img_dict['img'])
             page_wh_list.append((img_dict['width'], img_dict['height']))
-    if one_shot and len(images) >= MIN_BATCH_INFERENCE_SIZE:
-        if parallel_count is None:
-            parallel_count = 2  # should check the gpu memory firstly !
-        # split images into parallel_count batches
-        if parallel_count > 1:
-            batch_size = (len(images) + parallel_count - 1) // parallel_count
-            batch_images = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
-        else:
-            batch_images = [images]
-        results = []
-        parallel_count = len(batch_images)  # adjust to real parallel count
-        # using concurrent.futures to analyze
-        """
-        with fut.ProcessPoolExecutor(max_workers=parallel_count) as executor:
-            futures = [executor.submit(may_batch_image_analyze, batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable) for sn, batch_image in enumerate(batch_images)]
-            for future in fut.as_completed(futures):
-                sn, result = future.result()
-                result_history[sn] = result
-        for key in sorted(result_history.keys()):
-            results.extend(result_history[key])
-        """
-        results = []
-        pool = mp.Pool(processes=parallel_count)
-        mapped_results = pool.starmap(may_batch_image_analyze, [(batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable) for sn, batch_image in enumerate(batch_images)])
-        for sn, result in mapped_results:
-            results.extend(result)
+    if len(images) >= MIN_BATCH_INFERENCE_SIZE:
+        batch_size = MIN_BATCH_INFERENCE_SIZE
+        batch_images = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
     else:
-        _, results = may_batch_image_analyze(images, 0, ocr, show_log, lang, layout_model, formula_enable, table_enable)
+        batch_images = [images]
+
+    results = []
+    for sn, batch_image in enumerate(batch_images):
+        _, result = may_batch_image_analyze(batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable)
+        results.extend(result)

     model_json = []
     for index in range(len(dataset)):
```
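This hunk is the core of the change: the removed branch split the pages into `parallel_count` batches and fanned them out to an `mp.Pool` (with a `concurrent.futures` variant left commented out), while the added branch slices the pages into fixed-size batches and feeds them to `may_batch_image_analyze` sequentially. A standalone sketch of the slicing idiom the new code keeps:

```python
# Slice a list into fixed-size batches; the final batch may be shorter.
images = list(range(10))
batch_size = 4
batch_images = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
assert batch_images == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
```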
```diff
@@ -224,11 +194,8 @@ def batch_doc_analyze(
     layout_model=None,
     formula_enable=None,
     table_enable=None,
-    one_shot: bool = True,
 ):
-    parallel_count = None
-    if os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT'):
-        parallel_count = int(os.environ['MINERU_PARALLEL_INFERENCE_COUNT'])
+    MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
     images = []
     page_wh_list = []
     for dataset in datasets:
```
```diff
@@ -238,40 +205,17 @@ def batch_doc_analyze(
             images.append(img_dict['img'])
             page_wh_list.append((img_dict['width'], img_dict['height']))
-    if one_shot and len(images) >= MIN_BATCH_INFERENCE_SIZE:
-        if parallel_count is None:
-            parallel_count = 2  # should check the gpu memory firstly !
-        # split images into parallel_count batches
-        if parallel_count > 1:
-            batch_size = (len(images) + parallel_count - 1) // parallel_count
-            batch_images = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
-        else:
-            batch_images = [images]
-        results = []
-        parallel_count = len(batch_images)  # adjust to real parallel count
-        # using concurrent.futures to analyze
-        """
-        with fut.ProcessPoolExecutor(max_workers=parallel_count) as executor:
-            futures = [executor.submit(may_batch_image_analyze, batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable) for sn, batch_image in enumerate(batch_images)]
-            for future in fut.as_completed(futures):
-                sn, result = future.result()
-                result_history[sn] = result
-        for key in sorted(result_history.keys()):
-            results.extend(result_history[key])
-        """
-        results = []
-        pool = mp.Pool(processes=parallel_count)
-        mapped_results = pool.starmap(may_batch_image_analyze, [(batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable) for sn, batch_image in enumerate(batch_images)])
-        for sn, result in mapped_results:
-            results.extend(result)
+    if len(images) >= MIN_BATCH_INFERENCE_SIZE:
+        batch_size = MIN_BATCH_INFERENCE_SIZE
+        batch_images = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
     else:
-        _, results = may_batch_image_analyze(images, 0, ocr, show_log, lang, layout_model, formula_enable, table_enable)
+        batch_images = [images]
+
+    results = []
+    for sn, batch_image in enumerate(batch_images):
+        _, result = may_batch_image_analyze(batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable)
+        results.extend(result)

     infer_results = []
     from magic_pdf.operators.models import InferenceResult
```
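The same rewrite is applied to `batch_doc_analyze` above. For contrast, the removed path sized its batches by ceiling division so the pages split into at most `parallel_count` groups; a sketch of that arithmetic (the values here are illustrative):

```python
# Ceiling division without math.ceil: (n + k - 1) // k yields groups of size
# at most ceil(n / k), matching the removed batch_size computation.
images = list(range(10))
parallel_count = 3
batch_size = (len(images) + parallel_count - 1) // parallel_count  # ceil(10 / 3) == 4
batch_images = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
assert len(batch_images) <= parallel_count
assert batch_images == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
```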
magic_pdf/tools/common.py
```diff
@@ -314,7 +314,7 @@ def batch_do_parse(
             dss.append(PymuDocDataset(v, lang=lang))
         else:
             dss.append(v)
-    infer_results = batch_doc_analyze(dss, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable, one_shot=True)
+    infer_results = batch_doc_analyze(dss, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
     for idx, infer_result in enumerate(infer_results):
         _do_parse(output_dir, pdf_file_names[idx], dss[idx], infer_result.get_infer_res(), parse_method, debug_able, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox)
```
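With `one_shot` removed from the call site above, batching is controlled only through the environment. A hypothetical configuration sketch (only the variable name comes from the diff; the commented call stands in for the real pipeline invocation):

```python
import os

# Hypothetical: lower the batching threshold before invoking the pipeline so
# that runs of 50+ pages are processed in batches of 50.
os.environ['MINERU_MIN_BATCH_INFERENCE_SIZE'] = '50'
# infer_results = batch_doc_analyze(dss, lang=lang, ...)  # reads the variable internally
```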