import os
import unicodedata

# Default FTLANG_CACHE to <parent of this file's directory>/resources/fasttext-langdetect
# unless the environment already provides a non-empty value.
if not os.environ.get("FTLANG_CACHE"):
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
    os.environ["FTLANG_CACHE"] = ftlang_cache_dir

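# NOTE(review): imported after the FTLANG_CACHE default is set above; presumably
# fast_langdetect reads that variable, so keep this order — confirm.
from fast_langdetect import detect_language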


def detect_lang(text: str) -> str:
    """Return the lower-cased language code detected for *text*.

    Detection is delegated to ``detect_language``. If that call raises, every
    character whose Unicode general category starts with ``C`` is removed
    from the text and detection is retried once on the cleaned text.

    Args:
        text: The text to classify.

    Returns:
        The detector's result converted to lower case, or ``""`` when
        *text* is empty, when nothing is left after removing the ``C``
        category characters, when the retry fails as well, or when the
        detector returns something that is not a string.
    """
    if not text:
        return ""

    try:
        lang_upper = detect_language(text)
    except Exception:
        # NOTE(review): presumably the detector rejects newlines or other
        # control characters; retry once without them — confirm.
        cleaned = "".join(
            ch for ch in text if not unicodedata.category(ch).startswith("C")
        )
        if not cleaned:
            # Nothing left to classify, so skip calling the detector on "".
            return ""
        try:
            lang_upper = detect_language(cleaned)
        except Exception:
            # Best-effort contract: a failed detection is reported as "".
            return ""

    # A non-string result (e.g. None) has no usable language code.
    return lang_upper.lower() if isinstance(lang_upper, str) else ""


if __name__ == '__main__':
    # Manual smoke check: show the cache location, then run detection on a
    # few plain and HTML-wrapped samples in English and Chinese.
    print(os.getenv("FTLANG_CACHE"))
    samples = (
        "This is a test.",
        "<html>This is a test</html>",
        "这个是中文测试。",
        "<html>这个是中文测试。</html>",
    )
    for sample in samples:
        print(detect_lang(sample))