Unverified commit e778264f, authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #1469 from opendatalab/dev

fix(language): enhance language detection and text processing
parents 1b654fc2 0ebbfa5c
...@@ -16,11 +16,14 @@ def detect_lang(text: str) -> str: ...@@ -16,11 +16,14 @@ def detect_lang(text: str) -> str:
if len(text) == 0: if len(text) == 0:
return "" return ""
text = text.replace("\n", "")
try: try:
lang_upper = detect_language(text) lang_upper = detect_language(text)
except: except:
html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]]) html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
lang_upper = detect_language(html_no_ctrl_chars) lang_upper = detect_language(html_no_ctrl_chars)
try: try:
lang = lang_upper.lower() lang = lang_upper.lower()
except: except:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment