Use fast_langdetect to replace cld2

ce0d9905 · 赵小蒙 · 06063014 · ce0d9905 · ce0d9905
Commit ce0d9905 authored Jun 17, 2024 by 赵小蒙
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 13 deletions

magic_pdf/libs/language.py +6 -11

requirements.txt +1 -2

No files found.
--- a/magic_pdf/libs/language.py
+++ b/magic_pdf/libs/language.py
-import pycld2 as cld2
 import regex
 import unicodedata
-
+from fast_langdetect import detect_langs

 RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")

@@ -13,17 +12,13 @@ def remove_bad_chars(text):
 def detect_lang(text: str) -> str:
    if len(text) == 0:
        return ""
-
    try:
-        _, _, details = cld2.detect(text)
+        lang_upper = detect_langs(text)
    except:
-        # cld2 doesn't like control characters
-        # https://github.com/mikemccand/chromium-compact-language-detector/issues/22#issuecomment-435904616
-        html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C',]])
-        _, _, details = cld2.detect(html_no_ctrl_chars)
-    lang = ""
+        html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
+        lang_upper = detect_langs(html_no_ctrl_chars)
    try:
-        lang = details[0][1].lower()
+        lang = lang_upper.lower()
    except:
        lang = ""
    return lang
@@ -33,4 +28,4 @@ if __name__ == '__main__':
    print(detect_lang("This is a test."))
    print(detect_lang("<html>This is a test</html>"))
    print(detect_lang("这个是中文测试。"))
-    print(detect_lang("<html>这个是中文测试。</html>"))
\ No newline at end of file
+    print(detect_lang("<html>这个是中文测试。</html>"))
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,13 +7,12 @@ loguru>=0.6.0
 matplotlib>=3.8.3
 numpy>=1.21.6
 pandas>=1.3.5
-pycld2>=0.41
+fast-langdetect>=0.1.1
 regex>=2023.12.25
 termcolor>=2.4.0
 wordninja>=2.0.0
 scikit-learn>=1.0.2
 nltk==3.8.1
 s3pathlib>=2.1.1
-pytest
 paddlepaddle
 paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
\ No newline at end of file