Commit a9515851 authored by danai-antoniou
Browse files

Moved duplicate token check

parent d7395789
@@ -508,14 +508,12 @@ class PreTrainedTokenizer(object):
         if not new_tokens:
             return 0
-        if len(new_tokens) != len(set(new_tokens)):
-            raise ValueError("The provided list of tokens contains duplicates.")
         to_add_tokens = []
         for token in new_tokens:
             assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
             if token != self.unk_token and \
-                    self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
+                    self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
+                    token not in to_add_tokens:
                 to_add_tokens.append(token)
                 logger.info("Adding %s to the vocabulary", token)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment