"vscode:/vscode.git/clone" did not exist on "8ab0b22ef706d979928c4697321b85dd794c934d"
# language.py - helpers for stripping bad characters and detecting text language.
import regex
import unicodedata
赵小蒙's avatar
赵小蒙 committed
3
from fast_langdetect import detect_langs
赵小蒙's avatar
赵小蒙 committed
4
5
6
7
8
9
10
11
12
13
14
15

# Pattern matching any character in Unicode general category Cc (control)
# or Cs (surrogate); compiled once at import time.
RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")


def remove_bad_chars(text):
    """Return *text* with every match of ``RE_BAD_CHARS`` deleted."""
    cleaned = RE_BAD_CHARS.sub("", text)
    return cleaned


def detect_lang(text: str) -> str:
    """Return the detected language of *text* as a lowercase string.

    Detection is delegated to ``detect_langs``. If that call fails, it is
    retried once on a copy of the text with every character whose Unicode
    general category starts with "C" removed.

    Args:
        text: The text to classify.

    Returns:
        The detector's result lowercased, or "" when *text* is empty, when
        nothing is left after stripping category-"C" characters, or when the
        detector's result has no usable ``lower()`` method.

    Raises:
        Exception: Whatever ``detect_langs`` raises on the retry attempt is
            propagated to the caller.
    """
    if not text:
        return ""
    try:
        # NOTE(review): presumably returns an upper-case language code string
        # (the original local was named ``lang_upper``) - confirm against the
        # installed fast_langdetect version.
        detected = detect_langs(text)
    except Exception:
        # Retry on a copy without category-"C" characters (control, format,
        # surrogate, private-use, unassigned).
        cleaned = "".join(
            ch for ch in text if unicodedata.category(ch)[0] != "C"
        )
        if not cleaned:
            # Nothing left to classify; same outcome as empty input.
            return ""
        detected = detect_langs(cleaned)
    try:
        return detected.lower()
    except (AttributeError, TypeError):
        # The detector returned something that is not string-like.
        return ""


if __name__ == '__main__':
    # Manual smoke test: print the detected language for plain and
    # HTML-wrapped samples in English and Chinese.
    samples = (
        "This is a test.",
        "<html>This is a test</html>",
        "这个是中文测试。",
        "<html>这个是中文测试。</html>",
    )
    for sample in samples:
        print(detect_lang(sample))