Unverified commit d04adc35 authored by dctelus, committed by GitHub

Add length to PreTrainedTokenizer train_new_from_iterator (#16493)

parent 147c8166
@@ -601,7 +601,13 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         return file_names

     def train_new_from_iterator(
-        self, text_iterator, vocab_size, new_special_tokens=None, special_tokens_map=None, **kwargs
+        self,
+        text_iterator,
+        vocab_size,
+        length=None,
+        new_special_tokens=None,
+        special_tokens_map=None,
+        **kwargs,
     ):
         """
         Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
@@ -613,6 +619,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                 if you have everything in memory.
             vocab_size (`int`):
                 The size of the vocabulary you want for your tokenizer.
+            length (`int`, *optional*):
+                The total number of sequences in the iterator. This is used to provide meaningful progress tracking.
             new_special_tokens (list of `str` or `AddedToken`, *optional*):
                 A list of new special tokens to add to the tokenizer you are training.
             special_tokens_map (`Dict[str, str]`, *optional*):
@@ -694,7 +702,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
         trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
-        tokenizer.train_from_iterator(text_iterator, trainer=trainer)
+        tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer)
         if post_processor is not None:
             trained_tokenizer_json = json.loads(tokenizer.to_str())
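For context, here is a minimal usage sketch of the new parameter (the model name, corpus, and sizes below are illustrative assumptions, not part of this commit). Because a generator exposes no len(), the underlying trainer cannot infer the corpus size on its own; passing length is what makes the progress tracking meaningful.

# Minimal sketch, assuming the `transformers` library with a fast tokenizer
# checkpoint ("gpt2" here); the corpus is placeholder data for illustration.
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("gpt2")  # must be a fast tokenizer

corpus = ["an example sentence", "another example sentence"] * 5_000

def batch_iterator(batch_size=1_000):
    # Yield batches of raw text; a generator has no len(), so the trainer
    # cannot determine the corpus size without the `length` hint.
    for i in range(0, len(corpus), batch_size):
        yield corpus[i : i + batch_size]

new_tokenizer = old_tokenizer.train_new_from_iterator(
    batch_iterator(),
    vocab_size=8_000,
    length=len(corpus),  # total number of sequences, enables progress tracking
)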