"...lm-evaluation-harness.git" did not exist on "b97c561f53e444e52e34a3c412f9d755fcf98e5a"
Unverified Commit af673222 authored by cchen-dialpad, committed by GitHub

Improve the speed of adding tokens from added_tokens.json (#10780)

* use bisect to add one token to unique_no_split_tokens

* fix style
parent c301c263
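
The change below replaces a wholesale re-sort of unique_no_split_tokens with a binary-search insertion whenever a single token is added. As a minimal standalone sketch of that technique (the names here are illustrative, not taken from the commit):

    import bisect
    from typing import List

    def insort_unique(sorted_list: List[str], item: str) -> None:
        # Binary search for the insertion point (O(log n)).
        idx = bisect.bisect_left(sorted_list, item)
        # Insert only if the item is not already present at that position.
        if idx == len(sorted_list) or sorted_list[idx] != item:
            sorted_list.insert(idx, item)

    tokens = ["<cls>", "<pad>", "<sep>"]
    insort_unique(tokens, "<mask>")  # -> ["<cls>", "<mask>", "<pad>", "<sep>"]
    insort_unique(tokens, "<pad>")   # duplicate, list unchanged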
@@ -16,6 +16,7 @@
 Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see
 tokenization_utils_fast.py
 """
+import bisect
 import itertools
 import re
 import unicodedata
@@ -99,6 +100,19 @@ def _is_start_of_word(text):
     return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))
 
 
+def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str):
+    """
+    Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted.
+    """
+    insertion_idx = bisect.bisect_left(token_list, new_token)
+    # Checks if new_token is already in the ordered token_list
+    if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token:
+        # new_token is in token_list, don't add
+        return
+    else:
+        token_list.insert(insertion_idx, new_token)
+
+
 @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
 class PreTrainedTokenizer(PreTrainedTokenizerBase):
     """
@@ -199,9 +213,15 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert)
         if special_tokens:
-            self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
+            if len(new_tokens) == 1:
+                _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0])
+            else:
+                self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
         else:
             # Or on the newly added tokens
-            self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
+            if len(tokens_to_add) == 1:
+                _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0])
+            else:
+                self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
 
         return len(tokens_to_add)
...
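
For context on the speed-up: the old path rebuilt a set and re-sorted the whole list on every call, so loading k tokens one at a time from added_tokens.json cost roughly O(k * n log n), while the bisect path is an O(log n) search plus an O(n) list insert per token. A rough micro-benchmark sketch of the two strategies (sizes and names are arbitrary, not from the PR):

    import bisect
    import timeit

    def add_one_sorted_union(tokens, new_token):
        # Old path: rebuild the set and re-sort the whole list, O(n log n) per token.
        return sorted(set(tokens).union({new_token}))

    def add_one_bisect(tokens, new_token):
        # New path: O(log n) binary search plus an O(n) list insert.
        idx = bisect.bisect_left(tokens, new_token)
        if idx == len(tokens) or tokens[idx] != new_token:
            tokens.insert(idx, new_token)
        return tokens

    def add_many(add_one, n_new=500):
        tokens = [f"tok{i:06d}" for i in range(50_000)]
        for i in range(n_new):
            tokens = add_one(tokens, f"new{i:06d}")
        return tokens

    print(timeit.timeit(lambda: add_many(add_one_sorted_union), number=1))  # old behavior
    print(timeit.timeit(lambda: add_many(add_one_bisect), number=1))        # new behavior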