refactor(pdf_parse): improve text content extraction from PDF spans

- Optimize character sorting for accurate text assembly
- Handle empty char scenarios to prevent errors
- Remove unnecessary comments and improve code readability
- Enhance OCR text content handling by removing low-confidence spans

refactor(pdf_parse): improve text content extraction from PDF spans
- Optimize character sorting for accurate text assembly
- Handle empty char scenarios to prevent errors
- Remove unnecessary comments and improve code readability
- Enhance OCR text content handling by removing low-confidence spans
14656085 · myhloli · 7964ae45 · 14656085
Commit 14656085 authored Nov 25, 2024 by myhloli
Show whitespace changes
Inline Side-by-side

Showing with 16 additions and 17 deletions

magic_pdf/pdf_parse_union_core_v2.py magic_pdf/pdf_parse_union_core_v2.py +16 -17

No files found.
--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
@@ -89,28 +89,25 @@ def __replace_STX_ETX(text_str: str):
 def chars_to_content(span):
-    # # 先给chars按char['bbox']的x坐标排序
+    # 检查span中的char是否为空
-    # span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0])
+    if len(span['chars']) == 0:
+        span['content'] = ''
+    else:
        # 先给chars按char['bbox']的中心点的x坐标排序
        span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
-    content = ''
        # 求char的平均宽度
-    if len(span['chars']) == 0:
-        span['content'] = content
-        del span['chars']
-        return
-    else:
        char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
        char_avg_width = char_width_sum / len(span['chars'])
+        content = ''
        for char in span['chars']:
            # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度，则需要在中间插入一个空格
            if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
                content += ' '
            content += char['c']
        span['content'] = __replace_STX_ETX(content)
    del span['chars']
@@ -218,6 +215,8 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
                    ocr_text, ocr_score = ocr_res[0][0]
                    if ocr_score > 0.5 and len(ocr_text) > 0:
                        span['content'] = ocr_text
+                    else:
+                        spans.remove(span)
    return spans