Unverified Commit 1c7c34bc authored by Yih-Dar, committed by GitHub

Improve `PreTrainedTokenizerFast` loading time when there are many added tokens (#31404)

* use hash

* use hash

* update

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 6e56b834
@@ -172,10 +172,12 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             # allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens
             # uses the information stored in `added_tokens_decoder`.
             # this is costly for fast tokenizers as we re-compute the regex again. But not all tokens are added tokens
+            # Use hash to speed up the very slow operation `token not in added_tokens_decoder`.
+            added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder}
             tokens_to_add = [
                 token
                 for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])
-                if token not in self.added_tokens_decoder
+                if hash(repr(token)) not in added_tokens_decoder_hash
             ]
             encoder = list(self.added_tokens_encoder.keys()) + [str(token) for token in tokens_to_add]
             # if some of the special tokens are strings, we check if we don't already have a token
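The change precomputes a hash for every existing added token once, so the membership test inside the list comprehension becomes a constant-time set lookup instead of re-evaluating `token not in self.added_tokens_decoder` on every iteration. Below is a minimal, self-contained sketch of the same pattern; the `Token` class and the token counts are hypothetical stand-ins, not transformers code:

```python
class Token:
    """Hypothetical stand-in for an AddedToken-like object."""

    def __init__(self, content, special=False):
        self.content = content
        self.special = special

    def __repr__(self):
        # repr captures every field, so identical tokens hash identically.
        return f"Token(content={self.content!r}, special={self.special})"


existing = [Token(f"<tok_{i}>") for i in range(100_000)]
candidates = [Token(f"<tok_{i}>") for i in range(90_000, 110_000)]

# Precompute the hashes of the existing tokens once up front...
existing_hashes = {hash(repr(t)) for t in existing}

# ...then each membership check is an O(1) set lookup rather than a
# repeated, expensive containment test against the token collection.
tokens_to_add = [t for t in candidates if hash(repr(t)) not in existing_hashes]
assert len(tokens_to_add) == 10_000  # only the genuinely new tokens remain
```

Hashing `repr(token)` rather than the token object itself sidesteps any custom or missing `__hash__`/`__eq__` on the token class while still comparing all of its fields.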