Commit ce0d9905 authored by 赵小蒙's avatar 赵小蒙
Browse files

Use fast_langdetect to replace cld2

parent 06063014
import pycld2 as cld2
import regex
import unicodedata
from fast_langdetect import detect_langs
RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
......@@ -13,17 +12,13 @@ def remove_bad_chars(text):
def detect_lang(text: str) -> str:
    """Detect the language of *text* and return its code in lowercase.

    Detection is delegated to ``fast_langdetect.detect_langs``. If that call
    raises, every character whose Unicode general category starts with ``C``
    (control, format, surrogate, private-use, unassigned) is stripped and
    detection is retried once on the cleaned text.

    Args:
        text: The text to classify.

    Returns:
        The detector's result lowercased, or ``""`` when *text* is empty or
        the detector's result cannot be lowercased.

    Raises:
        Exception: Whatever ``detect_langs`` raises on the retry; the second
            attempt is deliberately not guarded.
    """
    if not text:
        return ""

    try:
        lang_upper = detect_langs(text)
    except Exception:
        # This fallback was originally written for cld2, which rejects control
        # characters:
        # https://github.com/mikemccand/chromium-compact-language-detector/issues/22#issuecomment-435904616
        # It is kept for the fast_langdetect call: strip them and retry once.
        text_no_ctrl_chars = ''.join(
            ch for ch in text if unicodedata.category(ch)[0] != 'C'
        )
        lang_upper = detect_langs(text_no_ctrl_chars)

    try:
        return lang_upper.lower()
    except Exception:
        # The detector returned something without a usable ``lower()``
        # (e.g. ``None``); report "unknown" as an empty string.
        return ""
......@@ -33,4 +28,4 @@ if __name__ == '__main__':
print(detect_lang("This is a test."))
print(detect_lang("<html>This is a test</html>"))
print(detect_lang("这个是中文测试。"))
print(detect_lang("<html>这个是中文测试。</html>"))
\ No newline at end of file
print(detect_lang("<html>这个是中文测试。</html>"))
......@@ -7,13 +7,12 @@ loguru>=0.6.0
matplotlib>=3.8.3
numpy>=1.21.6
pandas>=1.3.5
pycld2>=0.41
fast-langdetect>=0.1.1
regex>=2023.12.25
termcolor>=2.4.0
wordninja>=2.0.0
scikit-learn>=1.0.2
nltk==3.8.1
s3pathlib>=2.1.1
pytest
paddlepaddle
paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment