Unverified commit fbc7611e, authored by Xiaomeng Zhao and committed by GitHub
Browse files

Merge pull request #1546 from myhloli/dev

fix(language): remove invalid UTF-16 surrogate pairs from input text
parents 1954573b 1a549a0e
...@@ -12,12 +12,20 @@ if not os.getenv("FTLANG_CACHE"): ...@@ -12,12 +12,20 @@ if not os.getenv("FTLANG_CACHE"):
from fast_langdetect import detect_language from fast_langdetect import detect_language
def remove_invalid_surrogates(text: str) -> str:
    """Return *text* with every UTF-16 surrogate code point removed.

    A Python ``str`` can contain code points in the surrogate range
    U+D800..U+DFFF (for example from ``"\\ud835"`` escapes). Such code
    points cannot be encoded as strict UTF-8.

    Every surrogate code point is dropped, whether or not it sits next to
    a matching partner: a high/low pair stored as two separate code points
    is removed as well, not recombined into the character it would encode.
    All other characters, including real astral-plane characters, are kept
    in their original order.

    Args:
        text: The string to clean.

    Returns:
        A new string containing only the non-surrogate characters.
    """
    # NOTE(review): presumably needed because the downstream language
    # detector fails on text that is not encodable as UTF-8 -- confirm.
    return ''.join(ch for ch in text if not 0xD800 <= ord(ch) <= 0xDFFF)
def detect_lang(text: str) -> str: def detect_lang(text: str) -> str:
if len(text) == 0: if len(text) == 0:
return "" return ""
text = text.replace("\n", "") text = text.replace("\n", "")
text = remove_invalid_surrogates(text)
# print(text)
try: try:
lang_upper = detect_language(text) lang_upper = detect_language(text)
except: except:
...@@ -37,3 +45,4 @@ if __name__ == '__main__': ...@@ -37,3 +45,4 @@ if __name__ == '__main__':
print(detect_lang("<html>This is a test</html>")) print(detect_lang("<html>This is a test</html>"))
print(detect_lang("这个是中文测试。")) print(detect_lang("这个是中文测试。"))
print(detect_lang("<html>这个是中文测试。</html>")) print(detect_lang("<html>这个是中文测试。</html>"))
print(detect_lang("〖\ud835\udc46\ud835〗这是个包含utf-16的中文测试"))
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment