Commit ce0d9905 authored by 赵小蒙
Browse files

Use fast_langdetect to replace cld2

parent 06063014
import pycld2 as cld2
import regex import regex
import unicodedata import unicodedata
from fast_langdetect import detect_langs
RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}") RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
...@@ -13,17 +12,13 @@ def remove_bad_chars(text): ...@@ -13,17 +12,13 @@ def remove_bad_chars(text):
def detect_lang(text: str) -> str: def detect_lang(text: str) -> str:
if len(text) == 0: if len(text) == 0:
return "" return ""
try: try:
_, _, details = cld2.detect(text) lang_upper = detect_langs(text)
except: except:
# cld2 doesn't like control characters html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
# https://github.com/mikemccand/chromium-compact-language-detector/issues/22#issuecomment-435904616 lang_upper = detect_langs(html_no_ctrl_chars)
html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C',]])
_, _, details = cld2.detect(html_no_ctrl_chars)
lang = ""
try: try:
lang = details[0][1].lower() lang = lang_upper.lower()
except: except:
lang = "" lang = ""
return lang return lang
...@@ -33,4 +28,4 @@ if __name__ == '__main__': ...@@ -33,4 +28,4 @@ if __name__ == '__main__':
print(detect_lang("This is a test.")) print(detect_lang("This is a test."))
print(detect_lang("<html>This is a test</html>")) print(detect_lang("<html>This is a test</html>"))
print(detect_lang("这个是中文测试。")) print(detect_lang("这个是中文测试。"))
print(detect_lang("<html>这个是中文测试。</html>")) print(detect_lang("<html>这个是中文测试。</html>"))
\ No newline at end of file
...@@ -7,13 +7,12 @@ loguru>=0.6.0 ...@@ -7,13 +7,12 @@ loguru>=0.6.0
matplotlib>=3.8.3 matplotlib>=3.8.3
numpy>=1.21.6 numpy>=1.21.6
pandas>=1.3.5 pandas>=1.3.5
pycld2>=0.41 fast-langdetect>=0.1.1
regex>=2023.12.25 regex>=2023.12.25
termcolor>=2.4.0 termcolor>=2.4.0
wordninja>=2.0.0 wordninja>=2.0.0
scikit-learn>=1.0.2 scikit-learn>=1.0.2
nltk==3.8.1 nltk==3.8.1
s3pathlib>=2.1.1 s3pathlib>=2.1.1
pytest
paddlepaddle paddlepaddle
paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment