Merge pull request #1777 from opendatalab/dev

perf(model): optimize batch analyze process

Merge pull request #1777 from opendatalab/dev
perf(model): optimize batch analyze process
d3c822f8 · Xiaomeng Zhao · GitHub · dd7af4d4 · fddf111f · d3c822f8
Unverified Commit d3c822f8 authored Feb 25, 2025 by Xiaomeng Zhao Committed by GitHub Feb 25, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 33 additions and 7 deletions

magic_pdf/dict2md/ocr_mkcontent.py magic_pdf/dict2md/ocr_mkcontent.py +24 -0

magic_pdf/model/doc_analyze_by_custom_model.py magic_pdf/model/doc_analyze_by_custom_model.py +9 -7

No files found.
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -126,11 +126,35 @@ def detect_language(text):
        return 'empty'
+def full_to_half(text: str) -> str:
+    """Convert full-width characters in ``text`` to their half-width forms.
+
+    Code points U+FF01..U+FF5E are shifted down by 0xFEE0, which lands them
+    on U+0021..U+007E, and U+3000 is replaced by a plain space. Every other
+    character is copied through unchanged.
+
+    Args:
+        text: String that may contain full-width characters.
+
+    Returns:
+        A new string with the full-width characters converted to half-width.
+    """
+    # Collect converted characters and join once at the end.
+    result = []
+    for char in text:
+        code = ord(char)
+        # Full-width ASCII variants (FF01-FF5E)
+        if 0xFF01 <= code <= 0xFF5E:
+            result.append(chr(code - 0xFEE0))  # Shift to ASCII range
+        # Full-width space (U+3000) sits outside the FF01-FF5E range, so it
+        # is mapped explicitly rather than by the 0xFEE0 shift.
+        elif code == 0x3000:
+            result.append(' ')
+        else:
+            result.append(char)
+    return ''.join(result)
 def merge_para_with_text(para_block):
    block_text = ''
    for line in para_block['lines']:
        for span in line['spans']:
            if span['type'] in [ContentType.Text]:
+                span['content'] = full_to_half(span['content'])
                block_text += span['content']
    block_lang = detect_lang(block_text)

--- a/magic_pdf/model/doc_analyze_by_custom_model.py
+++ b/magic_pdf/model/doc_analyze_by_custom_model.py
@@ -157,6 +157,7 @@ def doc_analyze(
    )
    batch_analyze = False
+    batch_ratio = 1
    device = get_device()
    npu_support = False
@@ -181,7 +182,6 @@ def doc_analyze(
                batch_ratio = 2
            logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
-            batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
            batch_analyze = True
    model_json = []
@@ -190,24 +190,26 @@ def doc_analyze(
    if batch_analyze:
        # batch analyze
        images = []
+        page_wh_list = []
        for index in range(len(dataset)):
            if start_page_id <= index <= end_page_id:
                page_data = dataset.get_page(index)
                img_dict = page_data.get_image()
                images.append(img_dict['img'])
+                page_wh_list.append((img_dict['width'], img_dict['height']))
+        batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
        analyze_result = batch_model(images)
        for index in range(len(dataset)):
-            page_data = dataset.get_page(index)
-            img_dict = page_data.get_image()
-            page_width = img_dict['width']
-            page_height = img_dict['height']
            if start_page_id <= index <= end_page_id:
                result = analyze_result.pop(0)
+                page_width, page_height = page_wh_list.pop(0)
            else:
                result = []
+                page_height = 0
+                page_width = 0
-            page_info = {'page_no': index, 'height': page_height, 'width': page_width}
+            page_info = {'page_no': index, 'width': page_width, 'height': page_height}
            page_dict = {'layout_dets': result, 'page_info': page_info}
            model_json.append(page_dict)
@@ -227,7 +229,7 @@ def doc_analyze(
            else:
                result = []
-            page_info = {'page_no': index, 'height': page_height, 'width': page_width}
+            page_info = {'page_no': index, 'width': page_width, 'height': page_height}
            page_dict = {'layout_dets': result, 'page_info': page_info}
            model_json.append(page_dict)