Unverified commit 5164ea91 authored by Lysandre Debut, committed by GitHub

Skipping outputs (#3116)

* Minimal example

* Proposal 2

* Proposal 2 for fast tokenizers

* Typings

* Docs

* Revert "Docs" for easier review

This reverts commit eaf0f97062e809887704a542144c537f769d5223.

* Remove unnecessary assignments

* Tests

* Fix faulty type

* Remove prints

* return_outputs -> model_input_names

* Revert "Revert "Docs" for easier review"

This reverts commit 6fdc69408102bf695797f2dfddbb6350c6b9e722.

* code quality
parent 49debe62
......@@ -69,6 +69,7 @@ class DistilBertTokenizer(BertTokenizer):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
model_input_names = ["attention_mask"]
class DistilBertTokenizerFast(BertTokenizerFast):
......@@ -76,3 +77,4 @@ class DistilBertTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
model_input_names = ["attention_mask"]
......@@ -119,6 +119,7 @@ class RobertaTokenizer(GPT2Tokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__(
self,
......@@ -244,6 +245,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__(
self,
......
......@@ -22,6 +22,7 @@ import os
import re
from collections import defaultdict
from contextlib import contextmanager
from typing import List, Optional, Tuple, Union
from tokenizers.implementations import BaseTokenizer
......@@ -138,6 +139,7 @@ class PreTrainedTokenizer(object):
pretrained_vocab_files_map = {}
pretrained_init_configuration = {}
max_model_input_sizes = {}
model_input_names = ["token_type_ids", "attention_mask"]
SPECIAL_TOKENS_ATTRIBUTES = [
"bos_token",
......@@ -316,6 +318,7 @@ class PreTrainedTokenizer(object):
# Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed.
self.padding_side = kwargs.pop("padding_side", self.padding_side)
self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
# Added tokens
self.added_tokens_encoder = {}
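Because __init__ pops model_input_names from the keyword arguments, the class-level default can also be overridden per instance. A hypothetical override (it assumes the keyword is forwarded by from_pretrained to the constructor, and the checkpoint name is only an example):

# Hypothetical sketch, not from the diff: re-enable token_type_ids for a
# tokenizer whose class default lists only attention_mask.
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased",
    model_input_names=["token_type_ids", "attention_mask"],  # overrides the class attribute
)
encoded = tokenizer.encode_plus("Hello world", "Second sequence")
assert "token_type_ids" in encoded  # produced again because of the override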
......@@ -849,14 +852,14 @@ class PreTrainedTokenizer(object):
def encode(
self,
text,
text_pair=None,
add_special_tokens=True,
max_length=None,
stride=0,
truncation_strategy="longest_first",
pad_to_max_length=False,
return_tensors=None,
text: str,
text_pair: Optional[str] = None,
add_special_tokens: bool = True,
max_length: Optional[int] = None,
stride: int = 0,
truncation_strategy: str = "longest_first",
pad_to_max_length: bool = False,
return_tensors: Optional[str] = None,
**kwargs
):
"""
......@@ -865,34 +868,43 @@ class PreTrainedTokenizer(object):
Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
Args:
text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
text (:obj:`str` or :obj:`List[str]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method)
text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
string using the `tokenize` method) or a list of integers (tokenized string ids using the
`convert_tokens_to_ids` method)
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
If set to ``True``, the sequences will be encoded with the special tokens relative
to their model.
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
max_length (:obj:`int`, `optional`, defaults to :obj:`None`):
If set to a number, will limit the total sequence returned so that it has a maximum length.
If there are overflowing tokens, those will be added to the returned dictionary
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
stride (:obj:`int`, `optional`, defaults to ``0``):
If set to a number along with max_length, the overflowing tokens returned will contain some tokens
from the main sequence returned. The value of this argument defines the number of additional tokens.
truncation_strategy: string selected in the following options:
truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
String selected in the following options:
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
starting from the longest one at each token (when there is a pair of input sequences)
starting from the longest one at each token (when there is a pair of input sequences)
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
If set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the
model's max length. The tokenizer padding sides are handled by the class attribute `padding_side`
which can be set to the following strings:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
Defaults to False: no padding.
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers.
add_prefix_space: Only applies to GPT-2 and RoBERTa tokenizers. When `True`, this ensures that the sequence
begins with an empty space. False by default except for when using RoBERTa with `add_special_tokens=True`.
return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant`
or PyTorch :obj:`torch.Tensor` instead of a list of python integers.
**kwargs: passed to the `self.tokenize()` method
"""
encoded_inputs = self.encode_plus(
......@@ -911,59 +923,79 @@ class PreTrainedTokenizer(object):
def encode_plus(
self,
text,
text_pair=None,
add_special_tokens=True,
max_length=None,
stride=0,
truncation_strategy="longest_first",
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False,
return_offsets_mapping=False,
text: str,
text_pair: Optional[str] = None,
add_special_tokens: bool = True,
max_length: Optional[int] = None,
stride: int = 0,
truncation_strategy: str = "longest_first",
pad_to_max_length: bool = False,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
**kwargs
):
"""
Returns a dictionary containing the encoded sequence or sequence pair and additional informations:
Returns a dictionary containing the encoded sequence or sequence pair and additional information:
the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
Args:
text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
text (:obj:`str` or :obj:`List[str]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method)
text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
string using the `tokenize` method) or a list of integers (tokenized string ids using the
`convert_tokens_to_ids` method)
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
If set to ``True``, the sequences will be encoded with the special tokens relative
to their model.
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
max_length (:obj:`int`, `optional`, defaults to :obj:`None`):
If set to a number, will limit the total sequence returned so that it has a maximum length.
If there are overflowing tokens, those will be added to the returned dictionary
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
stride (:obj:`int`, `optional`, defaults to ``0``):
If set to a number along with max_length, the overflowing tokens returned will contain some tokens
from the main sequence returned. The value of this argument defines the number of additional tokens.
truncation_strategy: string selected in the following options:
truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
String selected in the following options:
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
starting from the longest one at each token (when there is a pair of input sequences)
starting from the longest one at each token (when there is a pair of input sequences)
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
If set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the
model's max length. The tokenizer padding sides are handled by the class attribute `padding_side`
which can be set to the following strings:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
Defaults to False: no padding.
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers.
add_prefix_space: Only applies to GPT-2 and RoBERTa tokenizers. When `True`, this ensures that the sequence
begins with an empty space. False by default except for when using RoBERTa with `add_special_tokens=True`.
return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
return_offsets_mapping: (optional) Set to True to return (char_start, char_end) for each token (default False).
return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant`
or PyTorch :obj:`torch.Tensor` instead of a list of python integers.
return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`):
Whether to return token type IDs. If left to the default, will return the token type IDs according
to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute.
`What are token type IDs? <../glossary.html#token-type-ids>`_
return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`None`):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute.
`What are attention masks? <../glossary.html#attention-mask>`__
return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to ``True`` to return overflowing token information.
return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to ``True`` to return special tokens mask information.
return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to ``True`` to return ``(char_start, char_end)`` for each token.
If using Python's tokenizer, this method will raise ``NotImplementedError``. This option is only available
on Rust-based tokenizers inheriting from PreTrainedTokenizerFast.
**kwargs: passed to the `self.tokenize()` method
......@@ -981,13 +1013,14 @@ class PreTrainedTokenizer(object):
}
With the fields:
``input_ids``: list of token ids to be fed to a model
``token_type_ids``: list of token type ids to be fed to a model
``attention_mask``: list of indices specifying which tokens should be attended to by the model
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
tokens and 1 specifying sequence tokens.
- ``input_ids``: list of token ids to be fed to a model
- ``token_type_ids``: list of token type ids to be fed to a model
- ``attention_mask``: list of indices specifying which tokens should be attended to by the model
- ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
- ``num_truncated_tokens``: number of overflowing tokens when a ``max_length`` is specified
- ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
tokens and 1 specifying sequence tokens.
"""
def get_input_ids(text):
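An explicit return_token_type_ids / return_attention_mask argument still wins over the model_input_names default. A sketch of the per-call override (not part of the diff), assuming the roberta-base checkpoint is available:

# Sketch only: the explicit keyword overrides the model_input_names default.
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

by_default = tokenizer.encode_plus("first sequence", "second sequence")
forced = tokenizer.encode_plus("first sequence", "second sequence", return_token_type_ids=True)

assert "token_type_ids" not in by_default  # RoBERTa declares model_input_names = ["attention_mask"]
assert "token_type_ids" in forced          # explicit request takes precedence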
......@@ -1038,19 +1071,19 @@ class PreTrainedTokenizer(object):
def batch_encode_plus(
self,
batch_text_or_text_pairs=None,
add_special_tokens=True,
max_length=None,
stride=0,
truncation_strategy="longest_first",
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_masks=True,
return_overflowing_tokens=False,
return_special_tokens_masks=False,
return_offsets_mapping=False,
return_input_lengths=False,
batch_text_or_text_pairs: Union[str, List[str]],
add_special_tokens: bool = True,
max_length: Optional[int] = None,
stride: int = 0,
truncation_strategy: str = "longest_first",
pad_to_max_length: bool = False,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_masks: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_masks: bool = False,
return_offsets_mapping: bool = False,
return_input_lengths: bool = False,
**kwargs
):
"""
......@@ -1058,32 +1091,59 @@ class PreTrainedTokenizer(object):
the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
Args:
batch_text_or_text_pairs: Batch of sequences or pair of sequences to be encoded.
batch_text_or_text_pairs (:obj:`List[str]` or :obj:`List[List[str]]`):
Batch of sequences or pair of sequences to be encoded.
This can be a list of string/string-sequences/int-sequences or a list of pair of
string/string-sequences/int-sequence (see details in encode_plus)
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
If set to ``True``, the sequences will be encoded with the special tokens relative
to their model.
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
If there are overflowing tokens, those will be added to the returned dictionary`
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
max_length (:obj:`int`, `optional`, defaults to :obj:`None`):
If set to a number, will limit the total sequence returned so that it has a maximum length.
If there are overflowing tokens, those will be added to the returned dictionary
stride (:obj:`int`, `optional`, defaults to ``0``):
If set to a number along with max_length, the overflowing tokens returned will contain some tokens
from the main sequence returned. The value of this argument defines the number of additional tokens.
truncation_strategy: string selected in the following options:
truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
String selected in the following options:
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
starting from the longest one at each token (when there is a pair of input sequences)
starting from the longest one at each token (when there is a pair of input sequences)
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
If set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the
model's max length. The tokenizer padding sides are handled by the class attribute `padding_side`
which can be set to the following strings:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
Defaults to False: no padding.
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers.
return_input_lengths: (optional) If set the resulting dictionary will include the length of each sample
return_attention_masks: (optional) Set to True to return the attention mask (default False)
return_offsets_mapping: (optional) Not available, should be set to False or it will throw NotImplementError
return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant`
or PyTorch :obj:`torch.Tensor` instead of a list of python integers.
return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`):
Whether to return token type IDs. If left to the default, will return the token type IDs according
to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute.
`What are token type IDs? <../glossary.html#token-type-ids>`_
return_attention_masks (:obj:`bool`, `optional`, defaults to :obj:`None`):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the :obj:`model_input_names` attribute.
`What are attention masks? <../glossary.html#attention-mask>`__
return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to ``True`` to return overflowing token information.
return_special_tokens_masks (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to ``True`` to return special tokens mask information.
return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to ``True`` to return ``(char_start, char_end)`` for each token.
If using Python's tokenizer, this method will raise ``NotImplementedError``. This option is only available
on Rust-based tokenizers inheriting from PreTrainedTokenizerFast.
return_input_lengths (:obj:`bool`, `optional`, defaults to :obj:`False`):
If set, the resulting dictionary will include the length of each sample
**kwargs: passed to the `self.tokenize()` method
Return:
......@@ -1099,13 +1159,14 @@ class PreTrainedTokenizer(object):
}
With the fields:
``input_ids``: list of token ids to be fed to a model
``token_type_ids``: list of token type ids to be fed to a model
``attention_mask``: list of indices specifying which tokens should be attended to by the model
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
tokens and 1 specifying sequence tokens.
- ``input_ids``: list of token ids to be fed to a model
- ``token_type_ids``: list of token type ids to be fed to a model
- ``attention_mask``: list of indices specifying which tokens should be attended to by the model
- ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
- ``num_truncated_tokens``: number of overflowing tokens when a ``max_length`` is specified
- ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
tokens and 1 specifying sequence tokens.
"""
def get_input_ids(text):
......@@ -1220,18 +1281,18 @@ class PreTrainedTokenizer(object):
def prepare_for_model(
self,
ids,
pair_ids=None,
max_length=None,
add_special_tokens=True,
stride=0,
truncation_strategy="longest_first",
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False,
ids: List[int],
pair_ids: Optional[List[int]] = None,
max_length: Optional[int] = None,
add_special_tokens: bool = True,
stride: int = 0,
truncation_strategy: str = "longest_first",
pad_to_max_length: bool = False,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
):
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
......@@ -1292,6 +1353,11 @@ class PreTrainedTokenizer(object):
len_ids = len(ids)
len_pair_ids = len(pair_ids) if pair else 0
if return_token_type_ids is None:
return_token_type_ids = "token_type_ids" in self.model_input_names
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
encoded_inputs = {}
# Handle max sequence length
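The two "is None" checks above carry the whole default-resolution rule of this change. A standalone illustration (the helper name is hypothetical, not library code):

from typing import List, Optional

def resolve_output_flag(explicit: Optional[bool], name: str, model_input_names: List[str]) -> bool:
    # Mirrors the logic above: an explicit True/False always wins; otherwise
    # model_input_names decides whether the output is produced.
    if explicit is not None:
        return explicit
    return name in model_input_names

# DistilBERT/RoBERTa-style default: only attention_mask is declared.
assert resolve_output_flag(None, "token_type_ids", ["attention_mask"]) is False
assert resolve_output_flag(True, "token_type_ids", ["attention_mask"]) is True
assert resolve_output_flag(None, "attention_mask", ["attention_mask"]) is True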
......@@ -1617,6 +1683,9 @@ class PreTrainedTokenizer(object):
class PreTrainedTokenizerFast(PreTrainedTokenizer):
model_input_names = ["token_type_ids", "attention_mask"]
def __init__(self, tokenizer: BaseTokenizer, **kwargs):
if tokenizer is None:
raise ValueError("Provided tokenizer cannot be None")
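The fast (Rust-backed) base class gets the same model_input_names attribute, and _convert_encoding below applies the same resolution, so slow and fast tokenizers skip the same outputs. A sketch (not part of the diff), assuming the distilbert-base-uncased checkpoint:

# Sketch only: the fast DistilBERT tokenizer also skips token_type_ids.
from transformers import DistilBertTokenizerFast

fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
fast_encoded = fast_tokenizer.encode_plus("Hello world", "Second sequence", add_special_tokens=True)

assert "token_type_ids" not in fast_encoded
assert "attention_mask" in fast_encoded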
......@@ -1685,16 +1754,21 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
if self._tokenizer is not None:
self._tokenizer.add_special_tokens(self.all_special_tokens)
@staticmethod
def _convert_encoding(
self,
encoding,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_token_type_ids=None,
return_attention_mask=None,
return_overflowing_tokens=False,
return_special_tokens_mask=False,
return_offsets_mapping=False,
):
if return_token_type_ids is None:
return_token_type_ids = "token_type_ids" in self.model_input_names
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
if return_overflowing_tokens and encoding.overflowing is not None:
encodings = [encoding] + encoding.overflowing
else:
......@@ -1774,18 +1848,18 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
def batch_encode_plus(
self,
batch_text_or_text_pairs=None,
add_special_tokens=True,
max_length=None,
stride=0,
truncation_strategy="longest_first",
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False,
return_offsets_mapping=False,
batch_text_or_text_pairs: Optional[Union[List[str], List[Tuple[str]]]] = None,
add_special_tokens: bool = True,
max_length: Optional[int] = None,
stride: int = 0,
truncation_strategy: str = "longest_first",
pad_to_max_length: bool = False,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
**kwargs
):
if not add_special_tokens:
......@@ -1868,19 +1942,19 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
def encode_plus(
self,
text,
text_pair=None,
add_special_tokens=False,
max_length=None,
pad_to_max_length=False,
stride=0,
truncation_strategy="longest_first",
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False,
return_offsets_mapping=False,
text: str,
text_pair: Optional[str] = None,
add_special_tokens: bool = False,
max_length: Optional[int] = None,
pad_to_max_length: bool = False,
stride: int = 0,
truncation_strategy: str = "longest_first",
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
**kwargs
):
batched_input = [(text, text_pair)] if text_pair else [text]
......
......@@ -48,7 +48,7 @@ class TokenizerTesterMixin:
# to the concatenated encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}]
return [
{value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()}
for i in range(len(batch_encode_plus_sequences))
for i in range(len(batch_encode_plus_sequences["input_ids"]))
]
def test_tokenizers_common_properties(self):
......@@ -261,7 +261,10 @@ class TokenizerTesterMixin:
def test_mask_output(self):
tokenizer = self.get_tokenizer()
if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
if (
tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
and "token_type_ids" in tokenizer.model_input_names
):
seq_0 = "Test this method."
seq_1 = "With these inputs."
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
......@@ -504,51 +507,58 @@ class TokenizerTesterMixin:
encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
input_ids = encoded_sequence["input_ids"]
token_type_ids = encoded_sequence["token_type_ids"]
attention_mask = encoded_sequence["attention_mask"]
special_tokens_mask = encoded_sequence["special_tokens_mask"]
sequence_length = len(input_ids)
# Test right padding
tokenizer.padding_side = "right"
padded_sequence = tokenizer.encode_plus(
right_padded_sequence = tokenizer.encode_plus(
sequence,
max_length=sequence_length + padding_size,
pad_to_max_length=True,
return_special_tokens_mask=True,
)
padded_input_ids = padded_sequence["input_ids"]
padded_token_type_ids = padded_sequence["token_type_ids"]
padded_attention_mask = padded_sequence["attention_mask"]
padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
padded_sequence_length = len(padded_input_ids)
right_padded_input_ids = right_padded_sequence["input_ids"]
assert sequence_length + padding_size == padded_sequence_length
assert input_ids + [padding_idx] * padding_size == padded_input_ids
assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
assert attention_mask + [0] * padding_size == padded_attention_mask
assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
right_padded_sequence_length = len(right_padded_input_ids)
assert sequence_length + padding_size == right_padded_sequence_length
assert input_ids + [padding_idx] * padding_size == right_padded_input_ids
assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask
# Test left padding
tokenizer.padding_side = "left"
padded_sequence = tokenizer.encode_plus(
left_padded_sequence = tokenizer.encode_plus(
sequence,
max_length=sequence_length + padding_size,
pad_to_max_length=True,
return_special_tokens_mask=True,
)
padded_input_ids = padded_sequence["input_ids"]
padded_token_type_ids = padded_sequence["token_type_ids"]
padded_attention_mask = padded_sequence["attention_mask"]
padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
padded_sequence_length = len(padded_input_ids)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)
assert sequence_length + padding_size == padded_sequence_length
assert [padding_idx] * padding_size + input_ids == padded_input_ids
assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
assert [0] * padding_size + attention_mask == padded_attention_mask
assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
assert sequence_length + padding_size == left_padded_sequence_length
assert [padding_idx] * padding_size + input_ids == left_padded_input_ids
assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask
if "token_type_ids" in tokenizer.model_input_names:
token_type_ids = encoded_sequence["token_type_ids"]
left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
right_padded_token_type_ids = right_padded_sequence["token_type_ids"]
assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids
assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids
if "attention_mask" in tokenizer.model_input_names:
attention_mask = encoded_sequence["attention_mask"]
right_padded_attention_mask = right_padded_sequence["attention_mask"]
left_padded_attention_mask = left_padded_sequence["attention_mask"]
assert attention_mask + [0] * padding_size == right_padded_attention_mask
assert [0] * padding_size + attention_mask == left_padded_attention_mask
def test_separate_tokenizers(self):
# This tests that tokenizers don't impact others. Unfortunately the case where it fails is when
......