Commit a75c64d8 authored by Lysandre

Black 20 release

parent e78c1103
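This commit re-formats the code base with the 2020 release of the black formatter (the exact pinned version is not visible in these hunks). Two changes account for almost every hunk below: the space after a docstring's opening triple quotes is dropped, and a call or signature that ends with a trailing comma is exploded to one argument per line (black 20's "magic trailing comma" behavior). A condensed before/after, taken from the RelPartialLearnableDecoderLayer hunk further down:

    # Before: wrapped onto one argument line, but written with a trailing comma.
    attn_outputs = self.dec_attn(
        dec_inp, r, attn_mask=dec_attn_mask, mems=mems, head_mask=head_mask, output_attentions=output_attentions,
    )

    # After: the trailing comma forces one argument per line.
    attn_outputs = self.dec_attn(
        dec_inp,
        r,
        attn_mask=dec_attn_mask,
        mems=mems,
        head_mask=head_mask,
        output_attentions=output_attentions,
    )

    # Docstrings: '""" Gaussian Error Linear Unit.'  ->  '"""Gaussian Error Linear Unit.'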
......@@ -83,7 +83,7 @@ def create_sinusoidal_embeddings(n_pos, dim, out):
def gelu(x):
""" Gaussian Error Linear Unit.
"""Gaussian Error Linear Unit.
Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
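For reference, a runnable sketch of the two variants this docstring contrasts. The tanh form is the OpenAI GPT approximation quoted above; the erf form is the exact GELU originally used in the Google BERT repo (stated here from the standard definition, not taken from this diff).

    import math
    import torch

    def gelu_exact(x):
        # Exact GELU: x * Phi(x), the form originally used in Google's BERT repo
        return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

    def gelu_gpt(x):
        # OpenAI GPT's tanh approximation, the formula quoted in the docstring above
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

    x = torch.linspace(-3.0, 3.0, steps=7)
    print((gelu_exact(x) - gelu_gpt(x)).abs().max())  # small but non-zero difference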
......@@ -333,9 +333,9 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
raise NotImplementedError
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
raise NotImplementedError
......@@ -516,8 +516,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
class TFXLMPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = XLMConfig
......@@ -858,7 +858,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
@property
def dummy_inputs(self):
""" Dummy inputs to build the network.
"""Dummy inputs to build the network.
Returns:
tf.Tensor with dummy inputs
......
......@@ -77,7 +77,8 @@ class TFXLMRobertaModel(TFRobertaModel):
@add_start_docstrings(
"""XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING,
"""XLM-RoBERTa Model with a `language modeling` head on top. """,
XLM_ROBERTA_START_DOCSTRING,
)
class TFXLMRobertaForMaskedLM(TFRobertaForMaskedLM):
"""
......
......@@ -62,9 +62,9 @@ TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
def gelu(x):
""" Implementation of the gelu activation function.
XLNet is using OpenAI GPT's gelu
Also see https://arxiv.org/abs/1606.08415
"""Implementation of the gelu activation function.
XLNet is using OpenAI GPT's gelu
Also see https://arxiv.org/abs/1606.08415
"""
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
......@@ -807,8 +807,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
class TFXLNetPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = XLNetConfig
......@@ -1213,33 +1213,33 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
training=False,
):
r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``.
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return:
Return:
Examples::
Examples::
import tensorflow as tf
import numpy as np
from transformers import XLNetTokenizer, TFXLNetLMHeadModel
import tensorflow as tf
import numpy as np
from transformers import XLNetTokenizer, TFXLNetLMHeadModel
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :] # We will predict the masked token
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :] # We will predict the masked token
perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
target_mapping = np.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
target_mapping = np.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
"""
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
......@@ -1401,7 +1401,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
@property
def dummy_inputs(self):
""" Dummy inputs to build the network.
"""Dummy inputs to build the network.
Returns:
tf.Tensor with dummy inputs
......
......@@ -45,8 +45,8 @@ TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
def build_tf_to_pytorch_map(model, config):
""" A map of modules from TF to PyTorch.
This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
"""A map of modules from TF to PyTorch.
This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
"""
tf_to_pt_map = {}
......@@ -112,8 +112,7 @@ def build_tf_to_pytorch_map(model, config):
def load_tf_weights_in_transfo_xl(model, config, tf_path):
""" Load tf checkpoints in a pytorch model
"""
"""Load tf checkpoints in a pytorch model"""
try:
import numpy as np
import tensorflow as tf
......@@ -386,7 +385,12 @@ class RelPartialLearnableDecoderLayer(nn.Module):
def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None, output_attentions=False):
attn_outputs = self.dec_attn(
dec_inp, r, attn_mask=dec_attn_mask, mems=mems, head_mask=head_mask, output_attentions=output_attentions,
dec_inp,
r,
attn_mask=dec_attn_mask,
mems=mems,
head_mask=head_mask,
output_attentions=output_attentions,
)
ff_output = self.pos_ff(attn_outputs[0])
......@@ -456,8 +460,8 @@ class AdaptiveEmbedding(nn.Module):
class TransfoXLPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = TransfoXLConfig
......@@ -474,8 +478,7 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
nn.init.constant_(bias, 0.0)
def _init_weights(self, m):
""" Initialize the weights.
"""
"""Initialize the weights."""
classname = m.__class__.__name__
if classname.find("Linear") != -1:
if hasattr(m, "weight") and m.weight is not None:
......@@ -515,7 +518,7 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
self._init_bias(m.r_bias)
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, layer: Optional[int] = -1):
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
"""Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
Arguments:
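The hunk truncates the docstring here; as a hedged aside on the documented behavior (checkpoint and token names are placeholders), the typical call grows the input embedding matrix to match the tokenizer:

    from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer

    tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
    model = TransfoXLLMHeadModel.from_pretrained("transfo-xl-wt103")

    tokenizer.add_tokens(["<new_token>"])          # placeholder token
    model.resize_token_embeddings(len(tokenizer))  # resizes the input embeddings and re-ties output weights if tie_weights() exists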
......@@ -948,7 +951,10 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
return TransfoXLModelOutput(
last_hidden_state=core_out, mems=new_mems, hidden_states=hids, attentions=attentions,
last_hidden_state=core_out,
mems=new_mems,
hidden_states=hids,
attentions=attentions,
)
......@@ -1064,8 +1070,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
)
def get_output_embeddings(self):
""" Double-check if you are using adaptive softmax.
"""
"""Double-check if you are using adaptive softmax."""
if self.sample_softmax > 0:
return self.out_layer
else:
......
......@@ -85,17 +85,17 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
def forward(self, hidden, labels=None, keep_order=False):
"""
Params:
hidden :: [len*bsz x d_proj]
labels :: [len*bsz]
Return:
if labels is None:
out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
else:
out :: [(len-1)*bsz] Negative log likelihood
We could replace this implementation by the native PyTorch one
if theirs had an option to set bias on all clusters in the native one.
here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
Params:
hidden :: [len*bsz x d_proj]
labels :: [len*bsz]
Return:
if labels is None:
out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
else:
out :: [(len-1)*bsz] Negative log likelihood
We could replace this implementation by the native PyTorch one
if theirs had an option to set bias on all clusters in the native one.
here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
"""
if labels is not None:
......@@ -191,7 +191,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
return out
def log_prob(self, hidden):
r""" Computes log probabilities for all :math:`n\_classes`
r"""Computes log probabilities for all :math:`n\_classes`
From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
Args:
hidden (Tensor): a minibatch of examples
......
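The forward docstring above notes that this class could be swapped for PyTorch's native adaptive softmax if the native module allowed a bias on every cluster. For comparison, a small sketch of that native module (the sizes and cutoffs here are illustrative, not the model's):

    import torch
    import torch.nn as nn

    # Native module referenced by the docstring; head_bias only adds a bias to the
    # head cluster, which is exactly the limitation mentioned above.
    asm = nn.AdaptiveLogSoftmaxWithLoss(in_features=64, n_classes=1000, cutoffs=[100, 500], head_bias=True)

    hidden = torch.randn(32, 64)            # [len*bsz x d_proj] in the docstring's notation
    labels = torch.randint(0, 1000, (32,))

    output, loss = asm(hidden, labels)      # per-example target log-probs and mean NLL
    log_probs = asm.log_prob(hidden)        # [32, 1000], like the labels-is-None branch above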
......@@ -51,8 +51,7 @@ try:
except ImportError:
# Older PyTorch compatibility
class Identity(nn.Module):
r"""A placeholder identity operator that is argument-insensitive.
"""
r"""A placeholder identity operator that is argument-insensitive."""
def __init__(self, *args, **kwargs):
super().__init__()
......@@ -488,8 +487,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
)
def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
""" Tie or clone module weights depending of whether we are using TorchScript or not
"""
"""Tie or clone module weights depending of whether we are using TorchScript or not"""
if self.config.torchscript:
output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone())
else:
......@@ -498,7 +496,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
if getattr(output_embeddings, "bias", None) is not None:
output_embeddings.bias.data = torch.nn.functional.pad(
output_embeddings.bias.data,
(0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],),
(
0,
output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],
),
"constant",
0,
)
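The hunk above only re-wraps the call; for clarity, on a 1-D bias `torch.nn.functional.pad` with a `(0, n)` pad spec appends `n` zeros on the right, so the bias keeps matching the resized output embedding:

    import torch

    bias = torch.tensor([0.1, 0.2, 0.3])     # old output size: 3
    new_rows = 5                             # new output size after resizing
    padded = torch.nn.functional.pad(bias, (0, new_rows - bias.shape[0]), "constant", 0)
    print(padded)  # tensor([0.1000, 0.2000, 0.3000, 0.0000, 0.0000])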
......@@ -906,7 +907,13 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
def load(module: nn.Module, prefix=""):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs,
state_dict,
prefix,
local_metadata,
True,
missing_keys,
unexpected_keys,
error_msgs,
)
for name, child in module._modules.items():
if child is not None:
......@@ -1242,24 +1249,24 @@ class SQuADHead(nn.Module):
return_dict: bool = False,
) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]:
"""
Args:
hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
Final hidden states of the model on the sequence tokens.
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Positions of the first token for the labeled span.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Positions of the last token for the labeled span.
cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token.
is_impossible (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Whether the question has a possible answer in the paragraph or not.
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS).
1.0 means the token should be masked.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
Args:
hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
Final hidden states of the model on the sequence tokens.
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Positions of the first token for the labeled span.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Positions of the last token for the labeled span.
cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token.
is_impossible (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Whether the question has a possible answer in the paragraph or not.
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS).
1.0 means the token should be masked.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
Returns:
Returns:
"""
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
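`p_mask` above marks positions that cannot hold an answer (1.0 = masked). A hedged sketch of how such a mask is commonly derived for a question+context pair; the token_type convention shown is an assumption, not code from this diff:

    import torch

    # Assume token_type_ids are 0 for question/special tokens, 1 for context tokens,
    # and 0 again for trailing padding (BERT-style pair encoding).
    token_type_ids = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1, 0]])

    p_mask = (token_type_ids == 0).float()   # 1.0 on everything that cannot be an answer
    print(p_mask)  # tensor([[1., 1., 1., 1., 0., 0., 0., 0., 1.]])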
......@@ -1375,7 +1382,7 @@ class SequenceSummary(nn.Module):
self.summary = nn.Linear(config.hidden_size, num_classes)
activation_string = getattr(config, "summary_activation", None)
self.activation: Callable = (get_activation(activation_string) if activation_string else Identity())
self.activation: Callable = get_activation(activation_string) if activation_string else Identity()
self.first_dropout = Identity()
if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
......@@ -1409,7 +1416,11 @@ class SequenceSummary(nn.Module):
output = hidden_states.mean(dim=1)
elif self.summary_type == "cls_index":
if cls_index is None:
cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2] - 1, dtype=torch.long,)
cls_index = torch.full_like(
hidden_states[..., :1, :],
hidden_states.shape[-2] - 1,
dtype=torch.long,
)
else:
cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
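The `cls_index` branch above expands the index to shape `(..., 1, hidden_size)` so that one `gather` can pull out a single token's hidden state per example. A self-contained sketch of that pattern (illustrative shapes only):

    import torch

    hidden_states = torch.randn(2, 7, 4)                  # (batch, seq_len, hidden)
    cls_index = torch.tensor([6, 3])                      # which token to summarize, per example

    idx = cls_index.unsqueeze(-1).unsqueeze(-1)           # (batch, 1, 1)
    idx = idx.expand(-1, 1, hidden_states.size(-1))       # (batch, 1, hidden)

    summary = hidden_states.gather(-2, idx).squeeze(-2)   # (batch, hidden)
    assert torch.equal(summary[0], hidden_states[0, 6])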
......
......@@ -228,8 +228,8 @@ class TransformerFFN(nn.Module):
class XLMPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = XLMConfig
......@@ -462,9 +462,9 @@ class XLMModel(XLMPreTrainedModel):
self.embeddings = new_embeddings
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.attentions[layer].prune_heads(heads)
......@@ -572,7 +572,11 @@ class XLMModel(XLMPreTrainedModel):
# self attention
attn_outputs = self.attentions[i](
tensor, attn_mask, cache=cache, head_mask=head_mask[i], output_attentions=output_attentions,
tensor,
attn_mask,
cache=cache,
head_mask=head_mask[i],
output_attentions=output_attentions,
)
attn = attn_outputs[0]
if output_attentions:
......@@ -633,8 +637,7 @@ class XLMPredLayer(nn.Module):
)
def forward(self, x, y=None):
""" Compute the loss, and optionally the scores.
"""
"""Compute the loss, and optionally the scores."""
outputs = ()
if self.asm is False:
scores = self.proj(x)
......@@ -969,38 +972,38 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels whether a question has an answer or no answer (SQuAD 2.0)
cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
1.0 means the token should be masked, 0.0 means it is not masked.
Returns:
Example::
>>> from transformers import XLMTokenizer, XLMForQuestionAnswering
>>> import torch
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels whether a question has an answer or no answer (SQuAD 2.0)
cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
1.0 means the token should be masked, 0.0 means it is not masked.
Returns:
Example::
>>> from transformers import XLMTokenizer, XLMForQuestionAnswering
>>> import torch
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......@@ -1131,7 +1134,10 @@ class XLMForTokenClassification(XLMPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
......
......@@ -68,7 +68,8 @@ class XLMRobertaModel(RobertaModel):
@add_start_docstrings(
"""XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING,
"""XLM-RoBERTa Model with a `language modeling` head on top. """,
XLM_ROBERTA_START_DOCSTRING,
)
class XLMRobertaForMaskedLM(RobertaForMaskedLM):
"""
......
......@@ -58,9 +58,9 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
""" A map of modules from TF to PyTorch.
I use a map to keep the PyTorch model as
identical to the original PyTorch model as possible.
"""A map of modules from TF to PyTorch.
I use a map to keep the PyTorch model as
identical to the original PyTorch model as possible.
"""
tf_to_pt_map = {}
......@@ -141,8 +141,7 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
def load_tf_weights_in_xlnet(model, config, tf_path):
""" Load tf checkpoints in a pytorch model
"""
"""Load tf checkpoints in a pytorch model"""
try:
import numpy as np
import tensorflow as tf
......@@ -548,8 +547,8 @@ class XLNetLayer(nn.Module):
class XLNetPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = XLNetConfig
......@@ -557,8 +556,7 @@ class XLNetPreTrainedModel(PreTrainedModel):
base_model_prefix = "transformer"
def _init_weights(self, module):
""" Initialize the weights.
"""
"""Initialize the weights."""
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
......@@ -1350,46 +1348,46 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
Labels for masked language modeling.
`num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`.
The labels should correspond to the masked input words that should be predicted and depend on `target_mapping`. Note that in order to perform standard auto-regressive language modeling a `<mask>` token has to be added to the `input_ids` (see the `prepare_inputs_for_generation` fn and examples below)
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored, the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
Return:
Examples::
from transformers import XLNetTokenizer, XLNetLMHeadModel
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', return_dict=True)
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
# In the same way, the XLNetLMHeadModel can be trained with standard auto-regressive language modeling.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
labels = torch.tensor(tokenizer.encode("cute", add_special_tokens=False)).unsqueeze(0)
assert labels.shape[0] == 1, 'only one word will be predicted'
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token as is done in standard auto-regressive lm training
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping, labels=labels)
loss = outputs.loss
next_token_logits = outputs.logits # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
Labels for masked language modeling.
`num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`.
The labels should correspond to the masked input words that should be predicted and depend on `target_mapping`. Note that in order to perform standard auto-regressive language modeling a `<mask>` token has to be added to the `input_ids` (see the `prepare_inputs_for_generation` fn and examples below)
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored, the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
Return:
Examples::
from transformers import XLNetTokenizer, XLNetLMHeadModel
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', return_dict=True)
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
# In the same way, the XLNetLMHeadModel can be trained with standard auto-regressive language modeling.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
labels = torch.tensor(tokenizer.encode("cute", add_special_tokens=False)).unsqueeze(0)
assert labels.shape[0] == 1, 'only one word will be predicted'
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token as is done in standard auto-regressive lm training
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping, labels=labels)
loss = outputs.loss
next_token_logits = outputs.logits # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
......@@ -1867,38 +1865,38 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels whether a question has an answer or no answer (SQuAD 2.0)
cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
1.0 means the token should be masked, 0.0 means it is not masked.
Returns:
Example::
>>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
>>> import torch
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels whether a question has an answer or no answer (SQuAD 2.0)
cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
1.0 means the token should be masked, 0.0 means it is not masked.
Returns:
Example::
>>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
>>> import torch
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
......
......@@ -122,7 +122,9 @@ def create_optimizer(
)
if num_warmup_steps:
lr_schedule = WarmUp(
initial_learning_rate=init_lr, decay_schedule_fn=lr_schedule, warmup_steps=num_warmup_steps,
initial_learning_rate=init_lr,
decay_schedule_fn=lr_schedule,
warmup_steps=num_warmup_steps,
)
if weight_decay_rate > 0.0:
optimizer = AdamWeightDecay(
......
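A hedged sketch of how the `WarmUp` schedule re-wrapped above is typically composed with a decay schedule; the argument names mirror the ones visible in this hunk, while the imports and defaults are assumptions about this version of the library:

    import tensorflow as tf
    from transformers import AdamWeightDecay, WarmUp  # assumed to be exported here

    init_lr, num_train_steps, num_warmup_steps = 5e-5, 10_000, 500

    decay = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps - num_warmup_steps,
        end_learning_rate=0.0,
    )
    lr_schedule = WarmUp(
        initial_learning_rate=init_lr,
        decay_schedule_fn=decay,
        warmup_steps=num_warmup_steps,
    )
    optimizer = AdamWeightDecay(learning_rate=lr_schedule, weight_decay_rate=0.01)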
......@@ -208,7 +208,11 @@ class PipelineDataFormat:
SUPPORTED_FORMATS = ["json", "csv", "pipe"]
def __init__(
self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite: bool = False,
self,
output_path: Optional[str],
input_path: Optional[str],
column: Optional[str],
overwrite: bool = False,
):
self.output_path = output_path
self.input_path = input_path
......@@ -261,7 +265,11 @@ class PipelineDataFormat:
@staticmethod
def from_str(
format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
format: str,
output_path: Optional[str],
input_path: Optional[str],
column: Optional[str],
overwrite=False,
) -> "PipelineDataFormat":
"""
Creates an instance of the right subclass of :class:`~transformers.pipelines.PipelineDataFormat` depending
......@@ -305,7 +313,11 @@ class CsvPipelineDataFormat(PipelineDataFormat):
"""
def __init__(
self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
self,
output_path: Optional[str],
input_path: Optional[str],
column: Optional[str],
overwrite=False,
):
super().__init__(output_path, input_path, column, overwrite=overwrite)
......@@ -346,7 +358,11 @@ class JsonPipelineDataFormat(PipelineDataFormat):
"""
def __init__(
self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
self,
output_path: Optional[str],
input_path: Optional[str],
column: Optional[str],
overwrite=False,
):
super().__init__(output_path, input_path, column, overwrite=overwrite)
......@@ -610,7 +626,10 @@ class Pipeline(_ScikitCompat):
# Parse arguments
inputs = self._args_parser(*args, **kwargs)
inputs = self.tokenizer(
inputs, add_special_tokens=add_special_tokens, return_tensors=self.framework, padding=padding,
inputs,
add_special_tokens=add_special_tokens,
return_tensors=self.framework,
padding=padding,
)
return inputs
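For context, the call re-wrapped above is the tokenization step every pipeline applies to its inputs; from the user's side it stays behind the usual one-liner (a hedged example: the default model is downloaded on first use and the printed score is only indicative):

    from transformers import pipeline

    nlp = pipeline("sentiment-analysis")   # picks a default model and tokenizer
    print(nlp("Black 20 makes the diffs longer but the style more uniform."))
    # e.g. [{'label': 'POSITIVE', 'score': 0.99}]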
......@@ -1349,7 +1368,10 @@ class TokenClassificationPipeline(Pipeline):
with self.device_placement():
tokens = self.tokenizer(
sentence, return_attention_mask=False, return_tensors=self.framework, truncation=True,
sentence,
return_attention_mask=False,
return_tensors=self.framework,
truncation=True,
)
# Forward
......@@ -1925,7 +1947,9 @@ class SummarizationPipeline(Pipeline):
)
summaries = self.model.generate(
inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**generate_kwargs,
)
results = []
......@@ -1935,7 +1959,9 @@ class SummarizationPipeline(Pipeline):
record["summary_token_ids"] = summary
if return_text:
record["summary_text"] = self.tokenizer.decode(
summary, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
summary,
skip_special_tokens=True,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
)
results.append(record)
return results
......@@ -2032,7 +2058,9 @@ class TranslationPipeline(Pipeline):
)
translations = self.model.generate(
inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**generate_kwargs,
)
results = []
for translation in translations:
......@@ -2271,7 +2299,9 @@ class ConversationalPipeline(Pipeline):
"You might consider trimming the early phase of the conversation".format(input_length, max_length)
)
generated_responses = self.model.generate(
inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**generate_kwargs,
)
cleaned_history = self._clean_padding_history(generated_responses)
......@@ -2355,7 +2385,8 @@ class ConversationalPipeline(Pipeline):
max_len = max([len(item) for item in outputs])
outputs = [output + [self.pad_token_id] * (max_len - len(output)) for output in outputs]
outputs = BatchEncoding(
{"input_ids": outputs, "attention_mask": [[1] * len(outputs)]}, tensor_type=self.framework,
{"input_ids": outputs, "attention_mask": [[1] * len(outputs)]},
tensor_type=self.framework,
)
return outputs
......
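The padding logic just above right-pads every encoded conversation to the longest one before building the `BatchEncoding`. In isolation the idiom looks like this (the pad id is a placeholder; the pipeline uses the tokenizer's):

    pad_token_id = 0                                   # placeholder value
    outputs = [[5, 6, 7], [8, 9], [10]]

    max_len = max(len(item) for item in outputs)
    outputs = [o + [pad_token_id] * (max_len - len(o)) for o in outputs]
    print(outputs)  # [[5, 6, 7], [8, 9, 0], [10, 0, 0]]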
......@@ -169,7 +169,7 @@ def assert_screenout(out, what):
class CaptureStd:
""" Context manager to capture:
"""Context manager to capture:
stdout, clean it up and make it available via obj.out
stderr, and make it available via obj.err
......
......@@ -105,31 +105,31 @@ TOKENIZER_MAPPING = OrderedDict(
class AutoTokenizer:
r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class
that will be instantiated as one of the tokenizer classes of the library
when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
class method.
The `from_pretrained()` method takes care of returning the correct tokenizer class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string:
- `t5`: T5Tokenizer (T5 model)
- `distilbert`: DistilBertTokenizer (DistilBert model)
- `albert`: AlbertTokenizer (ALBERT model)
- `camembert`: CamembertTokenizer (CamemBERT model)
- `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
- `longformer`: LongformerTokenizer (AllenAI Longformer model)
- `roberta`: RobertaTokenizer (RoBERTa model)
- `bert`: BertTokenizer (Bert model)
- `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
- `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
- `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
- `xlnet`: XLNetTokenizer (XLNet model)
- `xlm`: XLMTokenizer (XLM model)
- `ctrl`: CTRLTokenizer (Salesforce CTRL model)
- `electra`: ElectraTokenizer (Google ELECTRA model)
This class cannot be instantiated using `__init__()` (it throws an error).
that will be instantiated as one of the tokenizer classes of the library
when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
class method.
The `from_pretrained()` method takes care of returning the correct tokenizer class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string:
- `t5`: T5Tokenizer (T5 model)
- `distilbert`: DistilBertTokenizer (DistilBert model)
- `albert`: AlbertTokenizer (ALBERT model)
- `camembert`: CamembertTokenizer (CamemBERT model)
- `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
- `longformer`: LongformerTokenizer (AllenAI Longformer model)
- `roberta`: RobertaTokenizer (RoBERTa model)
- `bert`: BertTokenizer (Bert model)
- `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
- `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
- `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
- `xlnet`: XLNetTokenizer (XLNet model)
- `xlm`: XLMTokenizer (XLM model)
- `ctrl`: CTRLTokenizer (Salesforce CTRL model)
- `electra`: ElectraTokenizer (Google ELECTRA model)
This class cannot be instantiated using `__init__()` (it throws an error).
"""
def __init__(self):
......@@ -140,7 +140,7 @@ class AutoTokenizer:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
r""" Instantiate one of the tokenizer classes of the library
r"""Instantiate one of the tokenizer classes of the library
from a pre-trained model vocabulary.
The tokenizer class to instantiate is selected
......
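A short, hedged usage example of the dispatch this docstring describes (the checkpoint names are the standard public ones; the concrete tokenizer class returned may vary between library versions):

    from transformers import AutoTokenizer

    # Dispatched via config.model_type, or by pattern matching on the name:
    bert_tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # a BERT tokenizer
    gpt2_tok = AutoTokenizer.from_pretrained("gpt2")               # a GPT-2 tokenizer

    print(type(bert_tok).__name__, type(gpt2_tok).__name__)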
......@@ -359,7 +359,7 @@ class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
""" Constructs a BasicTokenizer.
"""Constructs a BasicTokenizer.
Args:
**do_lower_case**: Whether to lower case the input.
......@@ -383,7 +383,7 @@ class BasicTokenizer(object):
self.strip_accents = strip_accents
def tokenize(self, text, never_split=None):
""" Basic Tokenization of a piece of text.
"""Basic Tokenization of a piece of text.
Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer.
Args:
......
......@@ -202,8 +202,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
return word
def _tokenize(self, text):
""" Tokenize a string.
"""
"""Tokenize a string."""
split_tokens = []
words = re.findall(r"\S+\n?", text)
......
......@@ -330,7 +330,11 @@ class CustomDPRReaderTokenizerMixin:
return nbest_spans_predictions[:num_spans]
def _get_best_spans(
self, start_logits: List[int], end_logits: List[int], max_answer_length: int, top_spans: int,
self,
start_logits: List[int],
end_logits: List[int],
max_answer_length: int,
top_spans: int,
) -> List[DPRSpanPrediction]:
"""
Finds the best answer span for the extractive Q&A model for one passage.
......
......@@ -137,9 +137,7 @@ class MarianTokenizer(PreTrainedTokenizer):
padding="longest",
**unused,
) -> BatchEncoding:
"""Prepare model inputs for translation. For best performance, translate one sentence at a time.
"""
"""Prepare model inputs for translation. For best performance, translate one sentence at a time."""
if "" in src_texts:
raise ValueError(f"found empty string in src_texts: {src_texts}")
self.current_spm = self.spm_source
......
......@@ -53,29 +53,29 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class ReformerTokenizer(PreTrainedTokenizer):
"""
Constructs a Reformer tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
Additional special tokens used by the tokenizer.
Constructs a Reformer tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
Additional special tokens used by the tokenizer.
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -142,8 +142,7 @@ class ReformerTokenizer(PreTrainedTokenizer):
self.sp_model.Load(self.vocab_file)
def _tokenize(self, text, sample=False):
""" Take as input a string and return a list of strings (tokens) for words/sub-words
"""
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
if not sample:
pieces = self.sp_model.EncodeAsPieces(text)
else:
......@@ -166,8 +165,8 @@ class ReformerTokenizer(PreTrainedTokenizer):
return out_string
def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
......@@ -63,34 +63,34 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class T5Tokenizer(PreTrainedTokenizer):
"""
Constructs a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
extra_ids (:obj:`List[str]`, `optional`, defaults to :obj:`100`):
Add a number of extra ids added to the end of the vocabulary for use as sentinels.
These tokens are accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1.
Extra tokens are indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token in the vocabulary like in T5 preprocessing
see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
Additional special tokens used by the tokenizer.
Constructs a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
extra_ids (:obj:`List[str]`, `optional`, defaults to :obj:`100`):
Add a number of extra ids added to the end of the vocabulary for use as sentinels.
These tokens are accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1.
Extra tokens are indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token in the vocabulary like in T5 preprocessing
see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
Additional special tokens used by the tokenizer.
"""
vocab_files_names = VOCAB_FILES_NAMES
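A hedged illustration of the sentinel tokens described above; the checkpoint name is a standard public one and the exact token split is indicative only:

    from transformers import T5Tokenizer

    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    # With the default extra_ids=100, "<extra_id_0>" .. "<extra_id_99>" are appended
    # to the vocabulary and survive tokenization as single sentinel tokens.
    ids = tokenizer.encode("The <extra_id_0> walks in <extra_id_1> park")
    print(tokenizer.convert_ids_to_tokens(ids))
    # sentinels come back as single tokens, e.g. ['▁The', '<extra_id_0>', '▁walks', ...]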
......@@ -236,8 +236,7 @@ class T5Tokenizer(PreTrainedTokenizer):
self.sp_model.Load(self.vocab_file)
def _tokenize(self, text, sample=False):
""" Take as input a string and return a list of strings (tokens) for words/sub-words
"""
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
if not sample:
pieces = self.sp_model.EncodeAsPieces(text)
else:
......@@ -266,8 +265,8 @@ class T5Tokenizer(PreTrainedTokenizer):
return out_string
def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
......@@ -163,7 +163,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
def count_sents(self, sents, verbose=False):
"""
sents : a list of sentences, each a list of tokenized symbols
sents : a list of sentences, each a list of tokenized symbols
"""
if verbose:
logger.info("counting {} sents ...".format(len(sents)))
......@@ -496,7 +496,7 @@ class TransfoXLTokenizerFast(PreTrainedTokenizerFast):
class LMOrderedIterator(object):
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
"""
data -- LongTensor -- the LongTensor is strictly ordered
data -- LongTensor -- the LongTensor is strictly ordered
"""
self.bsz = bsz
self.bptt = bptt
......@@ -555,7 +555,7 @@ class LMOrderedIterator(object):
class LMShuffledIterator(object):
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
"""
data -- list[LongTensor] -- there is no order among the LongTensors
data -- list[LongTensor] -- there is no order among the LongTensors
"""
self.data = data
......