Commit 9a8c168f (unverified), authored by Quentin Lhoest, committed by GitHub
Browse files

Sort unique_no_split_tokens to make it deterministic (#6461)

* change unique_no_split_tokens's type to set

* use sorted list instead of set

* style
parent 1d6e71e1
......@@ -207,10 +207,10 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
# Make sure we don't split on any special tokens (even if they were already in the vocab before, e.g. for Albert)
if special_tokens:
self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(new_tokens)))
self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
else:
# Or on the newly added tokens
self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
return len(tokens_to_add)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment