Commit 3f60a60e authored by WrRan's avatar WrRan
Browse files

text in never_split should not lowercase

parent 751beb9e
...@@ -182,7 +182,7 @@ class BasicTokenizer(object): ...@@ -182,7 +182,7 @@ class BasicTokenizer(object):
orig_tokens = whitespace_tokenize(text) orig_tokens = whitespace_tokenize(text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if self.do_lower_case: if self.do_lower_case and token not in self.never_split:
token = token.lower() token = token.lower()
token = self._run_strip_accents(token) token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token)) split_tokens.extend(self._run_split_on_punc(token))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment