Merge pull request #1136 from myhloli/dev

refactor(ocr): improve text processing and span handling

Merge pull request #1136 from myhloli/dev
refactor(ocr): improve text processing and span handling
b4dfa0f9 · Xiaomeng Zhao · GitHub · c295587b · 88c0854a · b4dfa0f9
Unverified commit b4dfa0f9, authored Nov 28, 2024 by Xiaomeng Zhao; committed by GitHub on Nov 28, 2024.
Show whitespace changes
Inline Side-by-side

Showing with 13 additions and 22 deletions

magic_pdf/dict2md/ocr_mkcontent.py: +9 -21

magic_pdf/pdf_parse_union_core_v2.py: +4 -1

No files found.
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -136,14 +136,11 @@ def merge_para_with_text(para_block):
            para_text += '  \n'

        line_text = ''
-        line_lang = ''
        for span in line['spans']:
            span_type = span['type']
            if span_type == ContentType.Text:
                line_text += span['content'].strip()

-        if line_text != '':
-            line_lang = detect_lang(line_text)
        for j, span in enumerate(line['spans']):

            span_type = span['type']
@@ -157,27 +154,18 @@ def merge_para_with_text(para_block):

            content = content.strip()
            if content != '':
-                langs = ['zh', 'ja', 'ko']
-                if line_lang in langs:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
-                    if span_type in [ContentType.Text, ContentType.InterlineEquation]:
-                        para_text += content  # 中文/日语/韩文语境下，content间不需要空格分隔
-                    elif span_type == ContentType.InlineEquation:
-                        para_text += f' {content} '
-                else:
                if span_type in [ContentType.Text, ContentType.InlineEquation]:
                    # 如果span是line的最后一个且末尾带有-连字符，那么末尾不应该加空格,同时应该把-删除
                    if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
                        para_text += content[:-1]
-                        elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit():
-                            para_text += content
-                        else:  # 西方文本语境下 content间需要空格分隔
+                    else:  # content间需要空格分隔
                        para_text += f'{content} '
                elif span_type == ContentType.InterlineEquation:
                    para_text += content
            else:
                continue
    # 连写字符拆分
-    para_text = __replace_ligatures(para_text)
+    # para_text = __replace_ligatures(para_text)

    return para_text


--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
@@ -84,6 +84,9 @@ def chars_to_content(span):
 LINE_STOP_FLAG = ('.', '!', '?', '。', '！', '？', ')', '）', '"', '”', ':', '：', ';', '；', ']', '】', '}', '}', '>', '》', '、', ',', '，', '-', '—', '–',)
 def fill_char_in_spans(spans, all_chars):

+    # 简单从上到下排一下序
+    spans = sorted(spans, key=lambda x: x['bbox'][1])
+
    for char in all_chars:
        for span in spans:
            # 判断char是否属于LINE_STOP_FLAG
@@ -137,7 +140,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):

 def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):

-    text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
+    text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP | fitz.TEXT_CID_FOR_UNKNOWN_UNICODE)['blocks']

    all_pymu_chars = []
    for block in text_blocks_raw: