fix(language): remove invalid UTF-16 surrogate pairs from input text

- Add a `remove_invalid_surrogates` function that strips UTF-16 surrogate code points (U+D800–U+DFFF) from the input text
- Integrate the new function into the `detect_lang` workflow
- Include a sample input containing UTF-16 surrogates in the `__main__` demo to verify the fix

fix(language): remove invalid UTF-16 surrogate pairs from input text
- Add a `remove_invalid_surrogates` function that strips UTF-16 surrogate code points (U+D800–U+DFFF) from the input text
- Integrate the new function into the `detect_lang` workflow
- Include a sample input containing UTF-16 surrogates in the `__main__` demo to verify the fix
1a549a0e · myhloli · 916ced9f · 1a549a0e
Commit 1a549a0e authored Jan 15, 2025 by myhloli
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 0 deletions

magic_pdf/libs/language.py magic_pdf/libs/language.py +9 -0

No files found.
--- a/magic_pdf/libs/language.py
+++ b/magic_pdf/libs/language.py
@@ -12,12 +12,20 @@ if not os.getenv("FTLANG_CACHE"):
 from fast_langdetect import detect_language


+def remove_invalid_surrogates(text):
+    # Drop every character whose code point lies in the UTF-16 surrogate range (U+D800-U+DFFF).
+    return ''.join(c for c in text if not (0xD800 <= ord(c) <= 0xDFFF))
+
+
 def detect_lang(text: str) -> str:

    if len(text) == 0:
        return ""

    text = text.replace("\n", "")
+    text = remove_invalid_surrogates(text)
+
+    # print(text)
    try:
        lang_upper = detect_language(text)
    except:
@@ -37,3 +45,4 @@ if __name__ == '__main__':
    print(detect_lang("<html>This is a test</html>"))
    print(detect_lang("这个是中文测试。"))
    print(detect_lang("<html>这个是中文测试。</html>"))
+    print(detect_lang("〖\ud835\udc46\ud835〗这是个包含utf-16的中文测试"))
\ No newline at end of file