Unverified Commit e2a6445e authored by Funtowicz Morgan, committed by GitHub

Tokenizer fast warnings (#2922)



* Remove warning when pad_to_max_length is not set.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Move the RoBERTa warning to the RoBERTa tokenizer, not the GPT2 base tokenizer.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
parent 9b309331
@@ -269,9 +269,3 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
             unk_token=unk_token,
             **kwargs,
         )
-
-        logger.warning(
-            "RobertaTokenizerFast has an issue when working on mask language modeling "
-            "where it introduces an extra encoded space before the mask token."
-            "See https://github.com/huggingface/transformers/pull/2778 for more information."
-        )
@@ -211,6 +211,12 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
         self.max_len_single_sentence = self.max_len - self.num_added_tokens(False)  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - self.num_added_tokens(True)  # take into account special tokens

+        logger.warning(
+            "RobertaTokenizerFast has an issue when working on mask language modeling "
+            "where it introduces an extra encoded space before the mask token."
+            "See https://github.com/huggingface/transformers/pull/2778 for more information."
+        )
+
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
         if token_ids_1 is None:
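The warning added above refers to the extra-space issue tracked in https://github.com/huggingface/transformers/pull/2778. As a minimal, hypothetical reproduction (not part of this commit; the `roberta-base` checkpoint and a transformers version from around this release are assumed):

```python
# Sketch only: compare slow vs. fast RoBERTa tokenization of a masked sentence.
from transformers import RobertaTokenizer, RobertaTokenizerFast

slow = RobertaTokenizer.from_pretrained("roberta-base")
fast = RobertaTokenizerFast.from_pretrained("roberta-base")

text = "The capital of France is <mask>."
print(slow.tokenize(text))  # reference tokenization
print(fast.tokenize(text))  # at the time, could show an extra space token ("Ġ") before <mask>
```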
@@ -85,7 +85,7 @@ def truncate_and_pad(
             pad_type_id=pad_token_type_id,
             pad_token=pad_token,
         )
-    else:
+    elif pad_to_max_length:
         logger.warning(
             "Disabled padding because no padding token set (pad_token: {}, pad_token_id: {}).\n"
             "To remove this error, you can add a new pad token and then resize model embedding:\n"
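For context, here is a minimal sketch of the control flow this one-line change produces (function and parameter names are taken from the hunk above; the body is simplified, not the actual implementation). The point of the fix: the "no padding token" warning now fires only when the caller actually asked for padding.

```python
import logging

logger = logging.getLogger(__name__)

def truncate_and_pad_sketch(tokenizer, max_length, pad_to_max_length, padding_side,
                            pad_token_id, pad_token_type_id, pad_token):
    # Padding requested and a pad token is configured: enable it on the tokenizer.
    if pad_to_max_length and pad_token and pad_token_id >= 0:
        tokenizer.enable_padding(
            max_length=max_length,
            direction=padding_side,
            pad_id=pad_token_id,
            pad_type_id=pad_token_type_id,
            pad_token=pad_token,
        )
    # Padding requested but no pad token set: warn, as before.
    elif pad_to_max_length:
        logger.warning(
            "Disabled padding because no padding token set "
            "(pad_token: {}, pad_token_id: {}).".format(pad_token, pad_token_id)
        )
    # Padding not requested at all: the old "else:" branch warned even here;
    # after this change it stays silent.
```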