Unverified Commit e2a6445e authored by Funtowicz Morgan, committed by GitHub

Tokenizer fast warnings (#2922)



* Remove warning when pad_to_max_length is not set.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Move the RoBERTa warning to the RoBERTa tokenizer, not the GPT2 base tokenizer.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
parent 9b309331
@@ -269,9 +269,3 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
             unk_token=unk_token,
             **kwargs,
         )
-
-        logger.warning(
-            "RobertaTokenizerFast has an issue when working on mask language modeling "
-            "where it introduces an extra encoded space before the mask token."
-            "See https://github.com/huggingface/transformers/pull/2778 for more information."
-        )
@@ -211,6 +211,12 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
         self.max_len_single_sentence = self.max_len - self.num_added_tokens(False)  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - self.num_added_tokens(True)  # take into account special tokens

+        logger.warning(
+            "RobertaTokenizerFast has an issue when working on mask language modeling "
+            "where it introduces an extra encoded space before the mask token."
+            "See https://github.com/huggingface/transformers/pull/2778 for more information."
+        )
+
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
         if token_ids_1 is None:
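The warning added above refers to the extra-space issue tracked in https://github.com/huggingface/transformers/pull/2778. As a minimal, hypothetical reproduction (not part of this commit; the `roberta-base` checkpoint and a transformers version from around this release are assumed):

```python
# Sketch only: compare slow vs. fast RoBERTa tokenization of a masked sentence.
from transformers import RobertaTokenizer, RobertaTokenizerFast

slow = RobertaTokenizer.from_pretrained("roberta-base")
fast = RobertaTokenizerFast.from_pretrained("roberta-base")

text = "The capital of France is <mask>."
print(slow.tokenize(text))  # reference tokenization
print(fast.tokenize(text))  # at the time, could show an extra space token ("Ġ") before <mask>
```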
@@ -85,7 +85,7 @@ def truncate_and_pad(
             pad_type_id=pad_token_type_id,
             pad_token=pad_token,
         )
-    else:
+    elif pad_to_max_length:
         logger.warning(
             "Disabled padding because no padding token set (pad_token: {}, pad_token_id: {}).\n"
             "To remove this error, you can add a new pad token and then resize model embedding:\n"
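For context, here is a minimal sketch of the control flow this one-line change produces (function and parameter names are taken from the hunk above; the body is simplified, not the actual implementation). The point of the fix: the "no padding token" warning now fires only when the caller actually asked for padding.

```python
import logging

logger = logging.getLogger(__name__)

def truncate_and_pad_sketch(tokenizer, max_length, pad_to_max_length, padding_side,
                            pad_token_id, pad_token_type_id, pad_token):
    # Padding requested and a pad token is configured: enable it on the tokenizer.
    if pad_to_max_length and pad_token and pad_token_id >= 0:
        tokenizer.enable_padding(
            max_length=max_length,
            direction=padding_side,
            pad_id=pad_token_id,
            pad_type_id=pad_token_type_id,
            pad_token=pad_token,
        )
    # Padding requested but no pad token set: warn, as before.
    elif pad_to_max_length:
        logger.warning(
            "Disabled padding because no padding token set "
            "(pad_token: {}, pad_token_id: {}).".format(pad_token, pad_token_id)
        )
    # Padding not requested at all: the old "else:" branch warned even here;
    # after this change it stays silent.
```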