Unverified commit d04adc35 authored by dctelus, committed by GitHub

Add length to PreTrainedTokenizer train_new_from_iterator (#16493)

parent 147c8166
@@ -601,7 +601,13 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         return file_names

     def train_new_from_iterator(
-        self, text_iterator, vocab_size, new_special_tokens=None, special_tokens_map=None, **kwargs
+        self,
+        text_iterator,
+        vocab_size,
+        length=None,
+        new_special_tokens=None,
+        special_tokens_map=None,
+        **kwargs,
     ):
         """
         Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
@@ -613,6 +619,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                 if you have everything in memory.
             vocab_size (`int`):
                 The size of the vocabulary you want for your tokenizer.
+            length (`int`, *optional*):
+                The total number of sequences in the iterator. This is used to provide meaningful progress tracking.
             new_special_tokens (list of `str` or `AddedToken`, *optional*):
                 A list of new special tokens to add to the tokenizer you are training.
             special_tokens_map (`Dict[str, str]`, *optional*):
@@ -694,7 +702,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
         trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
-        tokenizer.train_from_iterator(text_iterator, trainer=trainer)
+        tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer)
         if post_processor is not None:
             trained_tokenizer_json = json.loads(tokenizer.to_str())
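For context, here is a minimal usage sketch of the new parameter (the model name, corpus, and sizes below are illustrative assumptions, not part of this commit). Because a generator exposes no len(), the underlying trainer cannot infer the corpus size on its own; passing length is what makes the progress tracking meaningful.

# Minimal sketch, assuming the `transformers` library with a fast tokenizer
# checkpoint ("gpt2" here); the corpus is placeholder data for illustration.
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("gpt2")  # must be a fast tokenizer

corpus = ["an example sentence", "another example sentence"] * 5_000

def batch_iterator(batch_size=1_000):
    # Yield batches of raw text; a generator has no len(), so the trainer
    # cannot determine the corpus size without the `length` hint.
    for i in range(0, len(corpus), batch_size):
        yield corpus[i : i + batch_size]

new_tokenizer = old_tokenizer.train_new_from_iterator(
    batch_iterator(),
    vocab_size=8_000,
    length=len(corpus),  # total number of sequences, enables progress tracking
)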