Black 20 release

a75c64d8 · Lysandre · e78c1103 · a75c64d8 · a75c64d8 · a75c64d8
Commit a75c64d8 authored Aug 26, 2020 by Lysandre
20 changed files
--- a/examples/token-classification/utils_ner.py
+++ b/examples/token-classification/utils_ner.py
@@ -90,11 +90,11 @@ class TokenClassificationTask:
        sequence_a_segment_id=0,
        mask_padding_with_zero=True,
    ) -> List[InputFeatures]:
-        """ Loads a data file into a list of `InputFeatures`
-            `cls_token_at_end` define the location of the CLS token:
-                - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
-                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
-            `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
+        """Loads a data file into a list of `InputFeatures`
+        `cls_token_at_end` defines the location of the CLS token:
+            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
+            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
+        `cls_token_segment_id` defines the segment id associated with the CLS token (0 for BERT, 2 for XLNet)
        """
        # TODO clean up all this to leverage built-in features of tokenizers

@@ -230,7 +230,8 @@ if is_torch_available():
        ):
            # Load data features from cache or dataset file
            cached_features_file = os.path.join(
-                data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
+                data_dir,
+                "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
            )

            # Make sure only the first process in distributed training processes the dataset,

--- a/src/transformers/activations.py
+++ b/src/transformers/activations.py
@@ -14,18 +14,18 @@ def swish(x):


 def _gelu_python(x):
-    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
-        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
-        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-        This is now written in C in torch.nn.functional
-        Also see https://arxiv.org/abs/1606.08415
+    """Original Implementation of the gelu activation function in Google Bert repo when initially created.
+    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
+    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    This is now written in C in torch.nn.functional
+    Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


 def gelu_new(x):
-    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
-        Also see https://arxiv.org/abs/1606.08415
+    """Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
+    Also see https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


--- a/src/transformers/benchmark/benchmark.py
+++ b/src/transformers/benchmark/benchmark.py
@@ -199,11 +199,17 @@ class PyTorchBenchmark(Benchmark):
                # run additional 10 times to stabilize compilation for tpu and torchscript
                logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
                timeit.repeat(
-                    func, repeat=1, number=5,
+                    func,
+                    repeat=1,
+                    number=5,
                )

            # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
-            runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
+            runtimes = timeit.repeat(
+                func,
+                repeat=self.args.repeat,
+                number=10,
+            )

            if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
                import torch_xla.debug.metrics as met

--- a/src/transformers/benchmark/benchmark_args_tf.py
+++ b/src/transformers/benchmark/benchmark_args_tf.py
@@ -32,10 +32,12 @@ logger = logging.get_logger(__name__)
 @dataclass
 class TensorFlowBenchmarkArguments(BenchmarkArguments):
    tpu_name: str = field(
-        default=None, metadata={"help": "Name of TPU"},
+        default=None,
+        metadata={"help": "Name of TPU"},
    )
    device_idx: int = field(
-        default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."},
+        default=0,
+        metadata={"help": "CPU / GPU device index. Defaults to 0."},
    )
    eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager model."})
    use_xla: bool = field(

--- a/src/transformers/benchmark/benchmark_tf.py
+++ b/src/transformers/benchmark/benchmark_tf.py
@@ -219,7 +219,11 @@ class TensorFlowBenchmark(Benchmark):
                    timeit.repeat(func, repeat=1, number=5)

                # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
-                runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
+                runtimes = timeit.repeat(
+                    func,
+                    repeat=self.args.repeat,
+                    number=10,
+                )

                return min(runtimes) / 10.0
            except ResourceExhaustedError as e:

--- a/src/transformers/benchmark/benchmark_utils.py
+++ b/src/transformers/benchmark/benchmark_utils.py
--- a/src/transformers/configuration_albert.py
+++ b/src/transformers/configuration_albert.py
@@ -32,71 +32,71 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class AlbertConfig(PretrainedConfig):
    r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel`.
-        It is used to instantiate an ALBERT model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30000):
-                Vocabulary size of the ALBERT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
-            embedding_size (:obj:`int`, optional, defaults to 128):
-                Dimensionality of vocabulary embeddings.
-            hidden_size (:obj:`int`, optional, defaults to 4096):
-                Dimensionality of the encoder layers and the pooler layer.
-            num_hidden_layers (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            num_hidden_groups (:obj:`int`, optional, defaults to 1):
-                Number of groups for the hidden layers, parameters in the same group are shared.
-            num_attention_heads (:obj:`int`, optional, defaults to 64):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            intermediate_size (:obj:`int`, optional, defaults to 16384):
-                The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            inner_group_num (:obj:`int`, optional, defaults to 1):
-                The number of inner repetition of attention and ffn.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
-                The dropout ratio for the attention probabilities.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with. Typically set this to something
-                large (e.g., 512 or 1024 or 2048).
-            type_vocab_size (:obj:`int`, optional, defaults to 2):
-                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for attached classifiers.
-
-        Example::
-
-            >>> from transformers import AlbertConfig, AlbertModel
-            >>> # Initializing an ALBERT-xxlarge style configuration
-            >>> albert_xxlarge_configuration = AlbertConfig()
-
-            >>> # Initializing an ALBERT-base style configuration
-            >>> albert_base_configuration = AlbertConfig(
-            ...      hidden_size=768,
-            ...      num_attention_heads=12,
-            ...      intermediate_size=3072,
-            ...  )
-
-            >>> # Initializing a model from the ALBERT-base style configuration
-            >>> model = AlbertModel(albert_xxlarge_configuration)
-
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+    This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel`.
+    It is used to instantiate an ALBERT model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
+
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30000):
+            Vocabulary size of the ALBERT model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
+        embedding_size (:obj:`int`, optional, defaults to 128):
+            Dimensionality of vocabulary embeddings.
+        hidden_size (:obj:`int`, optional, defaults to 4096):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_hidden_groups (:obj:`int`, optional, defaults to 1):
+            Number of groups for the hidden layers, parameters in the same group are shared.
+        num_attention_heads (:obj:`int`, optional, defaults to 64):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, optional, defaults to 16384):
+            The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        inner_group_num (:obj:`int`, optional, defaults to 1):
+            The number of inner repetition of attention and ffn.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something
+            large (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, optional, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for attached classifiers.
+
+    Example::
+
+        >>> from transformers import AlbertConfig, AlbertModel
+        >>> # Initializing an ALBERT-xxlarge style configuration
+        >>> albert_xxlarge_configuration = AlbertConfig()
+
+        >>> # Initializing an ALBERT-base style configuration
+        >>> albert_base_configuration = AlbertConfig(
+        ...      hidden_size=768,
+        ...      num_attention_heads=12,
+        ...      intermediate_size=3072,
+        ...  )
+
+        >>> # Initializing a model from the ALBERT-base style configuration
+        >>> model = AlbertModel(albert_base_configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
    """

    model_type = "albert"

--- a/src/transformers/configuration_auto.py
+++ b/src/transformers/configuration_auto.py
@@ -73,43 +73,112 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(

 CONFIG_MAPPING = OrderedDict(
    [
-        ("retribert", RetriBertConfig,),
-        ("t5", T5Config,),
-        ("mobilebert", MobileBertConfig,),
-        ("distilbert", DistilBertConfig,),
-        ("albert", AlbertConfig,),
-        ("camembert", CamembertConfig,),
-        ("xlm-roberta", XLMRobertaConfig,),
+        (
+            "retribert",
+            RetriBertConfig,
+        ),
+        (
+            "t5",
+            T5Config,
+        ),
+        (
+            "mobilebert",
+            MobileBertConfig,
+        ),
+        (
+            "distilbert",
+            DistilBertConfig,
+        ),
+        (
+            "albert",
+            AlbertConfig,
+        ),
+        (
+            "camembert",
+            CamembertConfig,
+        ),
+        (
+            "xlm-roberta",
+            XLMRobertaConfig,
+        ),
        ("pegasus", PegasusConfig),
-        ("marian", MarianConfig,),
-        ("mbart", MBartConfig,),
-        ("bart", BartConfig,),
-        ("reformer", ReformerConfig,),
-        ("longformer", LongformerConfig,),
-        ("roberta", RobertaConfig,),
-        ("flaubert", FlaubertConfig,),
-        ("bert", BertConfig,),
-        ("openai-gpt", OpenAIGPTConfig,),
-        ("gpt2", GPT2Config,),
-        ("transfo-xl", TransfoXLConfig,),
-        ("xlnet", XLNetConfig,),
-        ("xlm", XLMConfig,),
-        ("ctrl", CTRLConfig,),
-        ("electra", ElectraConfig,),
-        ("encoder-decoder", EncoderDecoderConfig,),
+        (
+            "marian",
+            MarianConfig,
+        ),
+        (
+            "mbart",
+            MBartConfig,
+        ),
+        (
+            "bart",
+            BartConfig,
+        ),
+        (
+            "reformer",
+            ReformerConfig,
+        ),
+        (
+            "longformer",
+            LongformerConfig,
+        ),
+        (
+            "roberta",
+            RobertaConfig,
+        ),
+        (
+            "flaubert",
+            FlaubertConfig,
+        ),
+        (
+            "bert",
+            BertConfig,
+        ),
+        (
+            "openai-gpt",
+            OpenAIGPTConfig,
+        ),
+        (
+            "gpt2",
+            GPT2Config,
+        ),
+        (
+            "transfo-xl",
+            TransfoXLConfig,
+        ),
+        (
+            "xlnet",
+            XLNetConfig,
+        ),
+        (
+            "xlm",
+            XLMConfig,
+        ),
+        (
+            "ctrl",
+            CTRLConfig,
+        ),
+        (
+            "electra",
+            ElectraConfig,
+        ),
+        (
+            "encoder-decoder",
+            EncoderDecoderConfig,
+        ),
    ]
 )


 class AutoConfig:
    r"""
-        :class:`~transformers.AutoConfig` is a generic configuration class
-        that will be instantiated as one of the configuration classes of the library
-        when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
+    :class:`~transformers.AutoConfig` is a generic configuration class
+    that will be instantiated as one of the configuration classes of the library
+    when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.

-        The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
-        based on the `model_type` property of the config object, or when it's missing,
-        falling back to using pattern matching on the `pretrained_model_name_or_path` string.
+    The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
+    based on the `model_type` property of the config object, or when it's missing,
+    falling back to using pattern matching on the `pretrained_model_name_or_path` string.
    """

    def __init__(self):

--- a/src/transformers/configuration_bart.py
+++ b/src/transformers/configuration_bart.py
@@ -102,7 +102,7 @@ BART_CONFIG_ARGS_DOC = r"""
 @add_start_docstrings_to_callable(BART_CONFIG_ARGS_DOC)
 class BartConfig(PretrainedConfig):
    r"""
-        Configuration class for Bart. Parameters are renamed from the fairseq implementation
+    Configuration class for Bart. Parameters are renamed from the fairseq implementation
    """
    model_type = "bart"

@@ -141,14 +141,14 @@ class BartConfig(PretrainedConfig):
        **common_kwargs
    ):
        r"""
-            :class:`~transformers.BartConfig` is the configuration class for `BartModel`.
+        :class:`~transformers.BartConfig` is the configuration class for `BartModel`.

-            Examples::
+        Examples::

-                >>> from transformers import BartConfig, BartModel
+            >>> from transformers import BartConfig, BartModel

-                >>> config = BartConfig.from_pretrained('facebook/bart-large')
-                >>> model = BartModel(config)
+            >>> config = BartConfig.from_pretrained('facebook/bart-large')
+            >>> model = BartModel(config)

        """
        if "hidden_size" in common_kwargs:

--- a/src/transformers/configuration_bert.py
+++ b/src/transformers/configuration_bert.py
@@ -50,59 +50,59 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class BertConfig(PretrainedConfig):
    r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
-        It is used to instantiate an BERT model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30522):
-                Vocabulary size of the BERT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
-            hidden_size (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the encoder layers and the pooler layer.
-            num_hidden_layers (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            num_attention_heads (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            intermediate_size (:obj:`int`, optional, defaults to 3072):
-                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            type_vocab_size (:obj:`int`, optional, defaults to 2):
-                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            gradient_checkpointing (:obj:`bool`, optional, defaults to False):
-                If True, use gradient checkpointing to save memory at the expense of slower backward pass.
-
-        Example::
-
-            >>> from transformers import BertModel, BertConfig
-
-            >>> # Initializing a BERT bert-base-uncased style configuration
-            >>> configuration = BertConfig()
-
-            >>> # Initializing a model from the bert-base-uncased style configuration
-            >>> model = BertModel(configuration)
-
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+    This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
+    It is used to instantiate a BERT model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
+
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30522):
+            Vocabulary size of the BERT model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
+        hidden_size (:obj:`int`, optional, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, optional, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, optional, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        gradient_checkpointing (:obj:`bool`, optional, defaults to False):
+            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+
+    Example::
+
+        >>> from transformers import BertModel, BertConfig
+
+        >>> # Initializing a BERT bert-base-uncased style configuration
+        >>> configuration = BertConfig()
+
+        >>> # Initializing a model from the bert-base-uncased style configuration
+        >>> model = BertModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
    """
    model_type = "bert"


--- a/src/transformers/configuration_ctrl.py
+++ b/src/transformers/configuration_ctrl.py
@@ -25,55 +25,55 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.h

 class CTRLConfig(PretrainedConfig):
    """
-        This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
-        It is used to instantiate an CTRL model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 246534):
-                Vocabulary size of the CTRL model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
-            n_positions (:obj:`int`, optional, defaults to 256):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            n_ctx (:obj:`int`, optional, defaults to 256):
-                Dimensionality of the causal mask (usually same as n_positions).
-            n_embd (:obj:`int`, optional, defaults to 1280):
-                Dimensionality of the embeddings and hidden states.
-            dff (:obj:`int`, optional, defaults to 8192):
-                Dimensionality of the inner dimension of the FFN.
-            n_layer (:obj:`int`, optional, defaults to 48):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 16):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            embd_pdrop (:obj:`int`, optional, defaults to 0.1):
-                The dropout ratio for the embeddings.
-            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention.
-            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
-                The epsilon to use in the layer normalization layers
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-
-        Example::
-
-            >>> from transformers import CTRLModel, CTRLConfig
-
-            >>> # Initializing a CTRL configuration
-            >>> configuration = CTRLConfig()
-
-            >>> # Initializing a model from the configuration
-            >>> model = CTRLModel(configuration)
-
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+    This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
+    It is used to instantiate a CTRL model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
+
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
+
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 246534):
+            Vocabulary size of the CTRL model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
+        n_positions (:obj:`int`, optional, defaults to 256):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        n_ctx (:obj:`int`, optional, defaults to 256):
+            Dimensionality of the causal mask (usually same as n_positions).
+        n_embd (:obj:`int`, optional, defaults to 1280):
+            Dimensionality of the embeddings and hidden states.
+        dff (:obj:`int`, optional, defaults to 8192):
+            Dimensionality of the inner dimension of the FFN.
+        n_layer (:obj:`int`, optional, defaults to 48):
+            Number of hidden layers in the Transformer encoder.
+        n_head (:obj:`int`, optional, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        resid_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
+            The epsilon to use in the layer normalization layers
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+    Example::
+
+        >>> from transformers import CTRLModel, CTRLConfig
+
+        >>> # Initializing a CTRL configuration
+        >>> configuration = CTRLConfig()
+
+        >>> # Initializing a model from the configuration
+        >>> model = CTRLModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
    """

    model_type = "ctrl"

--- a/src/transformers/configuration_distilbert.py
+++ b/src/transformers/configuration_distilbert.py
@@ -33,61 +33,61 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class DistilBertConfig(PretrainedConfig):
    r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
-        It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30522):
-                Vocabulary size of the DistilBERT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use sinusoidal positional embeddings.
-            n_layers (:obj:`int`, optional, defaults to 6):
-                Number of hidden layers in the Transformer encoder.
-            n_heads (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            dim (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the encoder layers and the pooler layer.
-            hidden_dim (:obj:`int`, optional, defaults to 3072):
-                The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities.
-            activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            qa_dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilities used in the question answering model
-                :class:`~transformers.DistilBertForQuestionAnswering`.
-            seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
-                The dropout probabilities used in the sequence classification and the multiple choice model
-                :class:`~transformers.DistilBertForSequenceClassification`.
-
-        Example::
-
-            >>> from transformers import DistilBertModel, DistilBertConfig
-
-            >>> # Initializing a DistilBERT configuration
-            >>> configuration = DistilBertConfig()
-
-            >>> # Initializing a model from the configuration
-            >>> model = DistilBertModel(configuration)
-
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+    This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
+    It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
+
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30522):
+            Vocabulary size of the DistilBERT model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.DistilBertModel`.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Whether to use sinusoidal positional embeddings.
+        n_layers (:obj:`int`, optional, defaults to 6):
+            Number of hidden layers in the Transformer encoder.
+        n_heads (:obj:`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dim (:obj:`int`, optional, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        hidden_dim (:obj:`int`, optional, defaults to 3072):
+            The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        qa_dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout probabilities used in the question answering model
+            :class:`~transformers.DistilBertForQuestionAnswering`.
+        seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
+            The dropout probabilities used in the sequence classification and the multiple choice model
+            :class:`~transformers.DistilBertForSequenceClassification`.
+
+    Example::
+
+        >>> from transformers import DistilBertModel, DistilBertConfig
+
+        >>> # Initializing a DistilBERT configuration
+        >>> configuration = DistilBertConfig()
+
+        >>> # Initializing a model from the configuration
+        >>> model = DistilBertModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
    """
    model_type = "distilbert"


--- a/src/transformers/configuration_dpr.py
+++ b/src/transformers/configuration_dpr.py
@@ -29,16 +29,16 @@ DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class DPRConfig(BertConfig):
    r"""
-        :class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
-        `DPRModel`.
+    :class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
+    `DPRModel`.

-        This is the configuration class to store the configuration of a `DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`.
-        It is used to instantiate the components of the DPR model.
+    This is the configuration class to store the configuration of a `DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`.
+    It is used to instantiate the components of the DPR model.

-        Args:
-            projection_dim (:obj:`int`, optional, defaults to 0):
-                Dimension of the projection for the context and question encoders.
-                If it is set to zero (default), then no projection is done.
+    Args:
+        projection_dim (:obj:`int`, optional, defaults to 0):
+            Dimension of the projection for the context and question encoders.
+            If it is set to zero (default), then no projection is done.
    """
    model_type = "dpr"


--- a/src/transformers/configuration_electra.py
+++ b/src/transformers/configuration_electra.py
@@ -33,82 +33,82 @@ ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class ElectraConfig(PretrainedConfig):
    r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`.
-        It is used to instantiate an ELECTRA model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__
-        architecture.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30522):
-                Vocabulary size of the ELECTRA model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`.
-            embedding_size (:obj:`int`, optional, defaults to 128):
-                Dimensionality of the encoder layers and the pooler layer.
-            hidden_size (:obj:`int`, optional, defaults to 256):
-                Dimensionality of the encoder layers and the pooler layer.
-            num_hidden_layers (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            num_attention_heads (:obj:`int`, optional, defaults to 4):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            intermediate_size (:obj:`int`, optional, defaults to 1024):
-                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            type_vocab_size (:obj:`int`, optional, defaults to 2):
-                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            summary_type (:obj:`string`, optional, defaults to "first"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.ElectraForMultipleChoice`.
-                Is one of the following options:
-
-                    - 'last' => take the last token hidden state (like XLNet)
-                    - 'first' => take the first token hidden state (like Bert)
-                    - 'mean' => take the mean of all tokens hidden states
-                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                    - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.ElectraForMultipleChoice`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.ElectraForMultipleChoice`.
-                'gelu' => add a gelu activation to the output, Other => no activation.
-            summary_last_dropout (:obj:`float`, optional, defaults to 0.0):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.ElectraForMultipleChoice`.
-                Add a dropout after the projection and activation
-
-        Example::
-
-            >>> from transformers import ElectraModel, ElectraConfig
-
-            >>> # Initializing a ELECTRA electra-base-uncased style configuration
-            >>> configuration = ElectraConfig()
-
-            >>> # Initializing a model from the electra-base-uncased style configuration
-            >>> model = ElectraModel(configuration)
-
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+    This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`.
+    It is used to instantiate an ELECTRA model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__
+    architecture.
+
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30522):
+            Vocabulary size of the ELECTRA model. Defines the different tokens that
+            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`.
+        embedding_size (:obj:`int`, optional, defaults to 128):
+            Dimensionality of the encoder layers and the pooler layer.
+        hidden_size (:obj:`int`, optional, defaults to 256):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, optional, defaults to 4):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, optional, defaults to 1024):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, optional, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        summary_type (:obj:`string`, optional, defaults to "first"):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.ElectraForMultipleChoice`.
+            Is one of the following options:
+
+                - 'last' => take the last token hidden state (like XLNet)
+                - 'first' => take the first token hidden state (like Bert)
+                - 'mean' => take the mean of all tokens hidden states
+                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+                - 'attn' => Not implemented now, use multi-head attention
+        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.ElectraForMultipleChoice`.
+            Add a projection after the vector extraction
+        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.ElectraForMultipleChoice`.
+            'gelu' => add a gelu activation to the output, Other => no activation.
+        summary_last_dropout (:obj:`float`, optional, defaults to 0.0):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.ElectraForMultipleChoice`.
+            Add a dropout after the projection and activation
+
+    Example::
+
+        >>> from transformers import ElectraModel, ElectraConfig
+
+        >>> # Initializing an ELECTRA electra-base-uncased style configuration
+        >>> configuration = ElectraConfig()
+
+        >>> # Initializing a model from the electra-base-uncased style configuration
+        >>> model = ElectraModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
    """
    model_type = "electra"


--- a/src/transformers/configuration_encoder_decoder.py
+++ b/src/transformers/configuration_encoder_decoder.py
@@ -25,47 +25,47 @@ logger = logging.get_logger(__name__)

 class EncoderDecoderConfig(PretrainedConfig):
    r"""
-        :class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of a `EncoderDecoderModel`.
+    :class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of an `EncoderDecoderModel`.

-        It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs.
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig`
-        and can be used to control the model outputs.
-        See the documentation for :class:`~transformers.PretrainedConfig` for more information.
+    It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig`
+    and can be used to control the model outputs.
+    See the documentation for :class:`~transformers.PretrainedConfig` for more information.

-        Args:
-            kwargs (`optional`):
-                Remaining dictionary of keyword arguments. Notably:
-                    encoder (:class:`PretrainedConfig`, optional, defaults to `None`):
-                        An instance of a configuration object that defines the encoder config.
-                    decoder (:class:`PretrainedConfig`, optional, defaults to `None`):
-                        An instance of a configuration object that defines the decoder config.
+    Args:
+        kwargs (`optional`):
+            Remaining dictionary of keyword arguments. Notably:
+                encoder (:class:`PretrainedConfig`, optional, defaults to `None`):
+                    An instance of a configuration object that defines the encoder config.
+                decoder (:class:`PretrainedConfig`, optional, defaults to `None`):
+                    An instance of a configuration object that defines the decoder config.

-        Example::
+    Example::

-            >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
+        >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

-            >>> # Initializing a BERT bert-base-uncased style configuration
-            >>> config_encoder = BertConfig()
-            >>> config_decoder = BertConfig()
+        >>> # Initializing a BERT bert-base-uncased style configuration
+        >>> config_encoder = BertConfig()
+        >>> config_decoder = BertConfig()

-            >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+        >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

-            >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
-            >>> model = EncoderDecoderModel(config=config)
+        >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
+        >>> model = EncoderDecoderModel(config=config)

-            >>> # Accessing the model configuration
-            >>> config_encoder = model.config.encoder
-            >>> config_decoder  = model.config.decoder
-            >>> # set decoder config to causal lm
-            >>> config_decoder.is_decoder = True
-            >>> config_decoder.add_cross_attention = True
+        >>> # Accessing the model configuration
+        >>> config_encoder = model.config.encoder
+        >>> config_decoder  = model.config.decoder
+        >>> # set decoder config to causal lm
+        >>> config_decoder.is_decoder = True
+        >>> config_decoder.add_cross_attention = True

-            >>> # Saving the model, including its configuration
-            >>> model.save_pretrained('my-model')
+        >>> # Saving the model, including its configuration
+        >>> model.save_pretrained('my-model')

-            >>> # loading model and config from pretrained folder
-            >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
-            >>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
+        >>> # loading model and config from pretrained folder
+        >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
+        >>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
    """
    model_type = "encoder_decoder"


--- a/src/transformers/configuration_flaubert.py
+++ b/src/transformers/configuration_flaubert.py
@@ -30,121 +30,120 @@ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class FlaubertConfig(XLMConfig):
    """
-        Configuration class to store the configuration of a `FlaubertModel`.
-        This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
-        It is used to instantiate an XLM model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
+    Configuration class to store the configuration of a `FlaubertModel`.
+    This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
+    It is used to instantiate an XLM model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.

-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.

-        Args:
-            pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether to apply the layer normalization before or after the feed forward layer following the
-                attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
-            layerdrop (:obj:`float`, `optional`, defaults to 0.0):
-                Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
-                with Structured Dropout. ICLR 2020)
-            vocab_size (:obj:`int`, optional, defaults to 30145):
-                Vocabulary size of the Flaubert model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
-            emb_dim (:obj:`int`, optional, defaults to 2048):
-                Dimensionality of the encoder layers and the pooler layer.
-            n_layer (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 16):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attention_dropout (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for the attention mechanism
-            gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
-                The non-linear activation function (function or string) in the
-                encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
-            sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
-            causal (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Set this to `True` for the model to behave in a causal manner.
-                Causal models use a triangular attention mask in order to only attend to the left-side context instead
-                if a bidirectional context.
-            asm (:obj:`boolean`, optional, defaults to :obj:`False`):
-                Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
-                layer.
-            n_langs (:obj:`int`, optional, defaults to 1):
-                The number of languages the model handles. Set to 1 for monolingual models.
-            use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`)
-                Whether to use language embeddings. Some models use additional language embeddings, see
-                `the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
-                for information on how to use them.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048).
-            embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
-                The standard deviation of the truncated_normal_initializer for
-                initializing the embedding matrices.
-            init_std (:obj:`int`, optional, defaults to 50257):
-                The standard deviation of the truncated_normal_initializer for
-                initializing all weight matrices except the embedding matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-            bos_index (:obj:`int`, optional, defaults to 0):
-                The index of the beginning of sentence token in the vocabulary.
-            eos_index (:obj:`int`, optional, defaults to 1):
-                The index of the end of sentence token in the vocabulary.
-            pad_index (:obj:`int`, optional, defaults to 2):
-                The index of the padding token in the vocabulary.
-            unk_index (:obj:`int`, optional, defaults to 3):
-                The index of the unknown token in the vocabulary.
-            mask_index (:obj:`int`, optional, defaults to 5):
-                The index of the masking token in the vocabulary.
-            is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
-                Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
-            summary_type (:obj:`string`, optional, defaults to "first"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Is one of the following options:
+    Args:
+        pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to apply the layer normalization before or after the feed forward layer following the
+            attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
+        layerdrop (:obj:`float`, `optional`, defaults to 0.0):
+            Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
+            with Structured Dropout. ICLR 2020)
+        vocab_size (:obj:`int`, optional, defaults to 30145):
+            Vocabulary size of the Flaubert model. Defines the different tokens that
+            can be represented by the `input_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
+        emb_dim (:obj:`int`, optional, defaults to 2048):
+            Dimensionality of the encoder layers and the pooler layer.
+        n_layer (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (:obj:`int`, optional, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for the attention mechanism
+        gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
+            The non-linear activation function (function or string) in the
+            encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
+        sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
+        causal (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Set this to `True` for the model to behave in a causal manner.
+            Causal models use a triangular attention mask in order to only attend to the left-side context instead
+            of a bidirectional context.
+        asm (:obj:`boolean`, optional, defaults to :obj:`False`):
+            Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
+            layer.
+        n_langs (:obj:`int`, optional, defaults to 1):
+            The number of languages the model handles. Set to 1 for monolingual models.
+        use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Whether to use language embeddings. Some models use additional language embeddings, see
+            `the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
+            for information on how to use them.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might
+            ever be used with. Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
+            The standard deviation of the truncated_normal_initializer for
+            initializing the embedding matrices.
+        init_std (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices except the embedding matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        bos_index (:obj:`int`, optional, defaults to 0):
+            The index of the beginning of sentence token in the vocabulary.
+        eos_index (:obj:`int`, optional, defaults to 1):
+            The index of the end of sentence token in the vocabulary.
+        pad_index (:obj:`int`, optional, defaults to 2):
+            The index of the padding token in the vocabulary.
+        unk_index (:obj:`int`, optional, defaults to 3):
+            The index of the unknown token in the vocabulary.
+        mask_index (:obj:`int`, optional, defaults to 5):
+            The index of the masking token in the vocabulary.
+        is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
+        summary_type (:obj:`string`, optional, defaults to "first"):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            Is one of the following options:

-                - 'last' => take the last token hidden state (like XLNet)
-                - 'first' => take the first token hidden state (like Bert)
-                - 'mean' => take the mean of all tokens hidden states
-                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.XLMForSequenceClassification`.
-                Add a dropout before the projection and activation
-            start_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-            end_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNet.
-            mask_token_id (:obj:`int`, optional, defaults to 0):
-                Model agnostic parameter to identify masked tokens when generating text in an MLM context.
-            lang_id (:obj:`int`, optional, defaults to 1):
-                The ID of the language used by the model. This parameter is used when generating
-                text in a given language.
+            - 'last' => take the last token hidden state (like XLNet)
+            - 'first' => take the first token hidden state (like Bert)
+            - 'mean' => take the mean of all tokens hidden states
+            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+            - 'attn' => Not implemented now, use multi-head attention
+        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            Add a projection after the vector extraction
+        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            'tanh' => add a tanh activation to the output, Other => no activation.
+        summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
+        summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.XLMForSequenceClassification`.
+            Add a dropout before the projection and activation
+        start_n_top (:obj:`int`, optional, defaults to 5):
+            Used in the SQuAD evaluation script for XLM and XLNet.
+        end_n_top (:obj:`int`, optional, defaults to 5):
+            Used in the SQuAD evaluation script for XLM and XLNet.
+        mask_token_id (:obj:`int`, optional, defaults to 0):
+            Model agnostic parameter to identify masked tokens when generating text in an MLM context.
+        lang_id (:obj:`int`, optional, defaults to 1):
+            The ID of the language used by the model. This parameter is used when generating
+            text in a given language.
    """

    model_type = "flaubert"

    def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs):
-        """Constructs FlaubertConfig.
-        """
+        """Constructs FlaubertConfig."""
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
        self.layerdrop = layerdrop
        self.pre_norm = pre_norm
--- a/src/transformers/configuration_gpt2.py
+++ b/src/transformers/configuration_gpt2.py
@@ -32,84 +32,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class GPT2Config(PretrainedConfig):
    """
-        This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
-        It is used to instantiate an GPT-2 model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 50257):
-                Vocabulary size of the GPT-2 model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
-            n_positions (:obj:`int`, optional, defaults to 1024):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            n_ctx (:obj:`int`, optional, defaults to 1024):
-                Dimensionality of the causal mask (usually same as n_positions).
-            n_embd (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the embeddings and hidden states.
-            n_layer (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            n_inner (:obj:`int`, optional, defaults to None):
-                Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
-            activation_function (:obj:`str`, optional, defaults to 'gelu'):
-                Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
-            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            embd_pdrop (:obj:`int`, optional, defaults to 0.1):
-                The dropout ratio for the embeddings.
-            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention.
-            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
-                The epsilon to use in the layer normalization layers
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            summary_type (:obj:`string`, optional, defaults to "cls_index"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                Is one of the following options:
-
-                - 'last' => take the last token hidden state (like XLNet)
-                - 'first' => take the first token hidden state (like Bert)
-                - 'mean' => take the mean of all tokens hidden states
-                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.GPT2DoubleHeadsModel`.
-                Add a dropout before the projection and activation
-
-        Example::
-
-            >>> from transformers import GPT2Model, GPT2Config
-
-            >>> # Initializing a GPT2 configuration
-            >>> configuration = GPT2Config()
-
-            >>> # Initializing a model from the configuration
-            >>> model = GPT2Model(configuration)
-
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+    This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
+    It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
+
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 50257):
+            Vocabulary size of the GPT-2 model. Defines the different tokens that
+            can be represented by the `input_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
+        n_positions (:obj:`int`, optional, defaults to 1024):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        n_ctx (:obj:`int`, optional, defaults to 1024):
+            Dimensionality of the causal mask (usually same as n_positions).
+        n_embd (:obj:`int`, optional, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (:obj:`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_inner (:obj:`int`, optional, defaults to None):
+            Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
+        activation_function (:obj:`str`, optional, defaults to 'gelu'):
+            Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
+        resid_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        summary_type (:obj:`string`, optional, defaults to "cls_index"):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.GPT2DoubleHeadsModel`.
+            Is one of the following options:
+
+            - 'last' => take the last token hidden state (like XLNet)
+            - 'first' => take the first token hidden state (like Bert)
+            - 'mean' => take the mean of all tokens hidden states
+            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+            - 'attn' => Not implemented now, use multi-head attention
+        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.GPT2DoubleHeadsModel`.
+            Add a projection after the vector extraction
+        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.GPT2DoubleHeadsModel`.
+            'tanh' => add a tanh activation to the output, Other => no activation.
+        summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.GPT2DoubleHeadsModel`.
+            If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
+        summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.GPT2DoubleHeadsModel`.
+            Add a dropout before the projection and activation
+
+    Example::
+
+        >>> from transformers import GPT2Model, GPT2Config
+
+        >>> # Initializing a GPT2 configuration
+        >>> configuration = GPT2Config()
+
+        >>> # Initializing a model from the configuration
+        >>> model = GPT2Model(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
    """

    model_type = "gpt2"

--- a/src/transformers/configuration_longformer.py
+++ b/src/transformers/configuration_longformer.py
@@ -33,32 +33,32 @@ LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class LongformerConfig(RobertaConfig):
    r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`.
-        It is used to instantiate an Longformer model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the RoBERTa `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length 4,096.
+    This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`.
+    It is used to instantiate a Longformer model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the RoBERTa `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length 4,096.

-        The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
-        It reuses the same defaults. Please check the parent class for more information.
+    The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
+    It reuses the same defaults. Please check the parent class for more information.

-        Args:
-            attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
-                Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
-                To specify a different window size for each layer, use a :obj:`List[int]` where
-                ``len(attention_window) == num_hidden_layers``.
+    Args:
+        attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
+            Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
+            To specify a different window size for each layer, use a :obj:`List[int]` where
+            ``len(attention_window) == num_hidden_layers``.

-        Example::
+    Example::

-            >>> from transformers import LongformerConfig, LongformerModel
+        >>> from transformers import LongformerConfig, LongformerModel

-            >>> # Initializing a Longformer configuration
-            >>> configuration = LongformerConfig()
+        >>> # Initializing a Longformer configuration
+        >>> configuration = LongformerConfig()

-            >>> # Initializing a model from the configuration
-            >>> model = LongformerModel(configuration)
+        >>> # Initializing a model from the configuration
+        >>> model = LongformerModel(configuration)

-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
    """
    model_type = "longformer"


--- a/src/transformers/configuration_mobilebert.py
+++ b/src/transformers/configuration_mobilebert.py
@@ -25,79 +25,79 @@ MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class MobileBertConfig(PretrainedConfig):
    r"""
-        This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel`.
-        It is used to instantiate a MobileBERT model according to the specified arguments, defining the model
-        architecture.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 30522):
-                Vocabulary size of the MobileBERT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.MobileBertModel`.
-            hidden_size (:obj:`int`, optional, defaults to 512):
-                Dimensionality of the encoder layers and the pooler layer.
-            num_hidden_layers (:obj:`int`, optional, defaults to 24):
-                Number of hidden layers in the Transformer encoder.
-            num_attention_heads (:obj:`int`, optional, defaults to 4):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            intermediate_size (:obj:`int`, optional, defaults to 512):
-                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention probabilities.
-            max_position_embeddings (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            type_vocab_size (:obj:`int`, optional, defaults to 2):
-                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.MobileBertModel`.
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-                The epsilon used by the layer normalization layers.
-
-            pad_token_id (:obj:`int`, optional, defaults to 0):
-                The ID of the token in the word embedding to use as padding.
-            embedding_size (:obj:`int`, optional, defaults to 128):
-                The dimension of the word embedding vectors.
-            trigram_input (:obj:`bool`, optional, defaults to True):
-                Use a convolution of trigram as input.
-            use_bottleneck (:obj:`bool`, optional, defaults to True):
-                Whether to use bottleneck in BERT.
-            intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
-                Size of bottleneck layer output.
-            use_bottleneck_attention (:obj:`bool`, optional, defaults to False):
-                Whether to use attention inputs from the bottleneck transformation.
-            key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True):
-                Whether to use the same linear transformation for query&key in the bottleneck.
-            num_feedforward_networks (:obj:`int`, optional, defaults to 4):
-                Number of FFNs in a block.
-            normalization_type (:obj:`str`, optional, defaults to "no_norm"):
-                The normalization type in BERT.
-
-        Example:
-
-            >>> from transformers import MobileBertModel, MobileBertConfig
-
-            >>> # Initializing a MobileBERT configuration
-            >>> configuration = MobileBertConfig()
-
-            >>> # Initializing a model from the configuration above
-            >>> model = MobileBertModel(configuration)
-
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
-
-        Attributes:
-            pretrained_config_archive_map (Dict[str, str]):
-                A dictionary containing all the available pre-trained checkpoints.
+    This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel`.
+    It is used to instantiate a MobileBERT model according to the specified arguments, defining the model
+    architecture.
+
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 30522):
+            Vocabulary size of the MobileBERT model. Defines the different tokens that
+            can be represented by the `input_ids` passed to the forward method of :class:`~transformers.MobileBertModel`.
+        hidden_size (:obj:`int`, optional, defaults to 512):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, optional, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, optional, defaults to 4):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, optional, defaults to 512):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, optional, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.MobileBertModel`.
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+
+        pad_token_id (:obj:`int`, optional, defaults to 0):
+            The ID of the token in the word embedding to use as padding.
+        embedding_size (:obj:`int`, optional, defaults to 128):
+            The dimension of the word embedding vectors.
+        trigram_input (:obj:`bool`, optional, defaults to True):
+            Use a convolution of trigram as input.
+        use_bottleneck (:obj:`bool`, optional, defaults to True):
+            Whether to use bottleneck in BERT.
+        intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
+            Size of bottleneck layer output.
+        use_bottleneck_attention (:obj:`bool`, optional, defaults to False):
+            Whether to use attention inputs from the bottleneck transformation.
+        key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True):
+            Whether to use the same linear transformation for query&key in the bottleneck.
+        num_feedforward_networks (:obj:`int`, optional, defaults to 4):
+            Number of FFNs in a block.
+        normalization_type (:obj:`str`, optional, defaults to "no_norm"):
+            The normalization type in BERT.
+
+    Example::
+
+        >>> from transformers import MobileBertModel, MobileBertConfig
+
+        >>> # Initializing a MobileBERT configuration
+        >>> configuration = MobileBertConfig()
+
+        >>> # Initializing a model from the configuration above
+        >>> model = MobileBertModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
+
+    Attributes:
+        pretrained_config_archive_map (Dict[str, str]):
+            A dictionary containing all the available pre-trained checkpoints.
    """
    pretrained_config_archive_map = MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "mobilebert"

--- a/src/transformers/configuration_openai.py
+++ b/src/transformers/configuration_openai.py
@@ -28,84 +28,84 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {

 class OpenAIGPTConfig(PretrainedConfig):
    """
-        This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel`.
-        It is used to instantiate an GPT model according to the specified arguments, defining the model
-        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
-
-        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
-        for more information.
-
-        Args:
-            vocab_size (:obj:`int`, optional, defaults to 40478):
-                Vocabulary size of the GPT model. Defines the different tokens that
-                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
-            n_positions (:obj:`int`, optional, defaults to 512):
-                The maximum sequence length that this model might ever be used with.
-                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-            n_ctx (:obj:`int`, optional, defaults to 512):
-                Dimensionality of the causal mask (usually same as n_positions).
-            n_embd (:obj:`int`, optional, defaults to 768):
-                Dimensionality of the embeddings and hidden states.
-            n_layer (:obj:`int`, optional, defaults to 12):
-                Number of hidden layers in the Transformer encoder.
-            n_head (:obj:`int`, optional, defaults to 12):
-                Number of attention heads for each attention layer in the Transformer encoder.
-            afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
-                The non-linear activation function (function or string) in the encoder and pooler.
-                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-            embd_pdrop (:obj:`int`, optional, defaults to 0.1):
-                The dropout ratio for the embeddings.
-            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
-                The dropout ratio for the attention.
-            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
-                The epsilon to use in the layer normalization layers
-            initializer_range (:obj:`float`, optional, defaults to 0.02):
-                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Whether special tokens should be predicted when the model is has a language modeling head.
-            summary_type (:obj:`string`, optional, defaults to "cls_index"):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                Is one of the following options:
-
-                - 'last' => take the last token hidden state (like XLNet)
-                - 'first' => take the first token hidden state (like Bert)
-                - 'mean' => take the mean of all tokens hidden states
-                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
-                - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                Add a projection after the vector extraction
-            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                'tanh' => add a tanh activation to the output, Other => no activation.
-            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
-                Argument used when doing sequence summary. Used in for the multiple choice head in
-                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
-                Add a dropout before the projection and activation
-
-        Example::
-
-            >>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
-
-            >>> # Initializing a GPT configuration
-            >>> configuration = OpenAIGPTConfig()
-
-            >>> # Initializing a model from the configuration
-            >>> model = OpenAIGPTModel(configuration)
-
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
+    This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel`.
+    It is used to instantiate a GPT model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
+
+    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    for more information.
+
+    Args:
+        vocab_size (:obj:`int`, optional, defaults to 40478):
+            Vocabulary size of the GPT model. Defines the different tokens that
+            can be represented by the `input_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
+        n_positions (:obj:`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        n_ctx (:obj:`int`, optional, defaults to 512):
+            Dimensionality of the causal mask (usually same as n_positions).
+        n_embd (:obj:`int`, optional, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (:obj:`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (:obj:`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+        resid_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (:obj:`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers
+        initializer_range (:obj:`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Whether special tokens should be predicted when the model has a language modeling head.
+        summary_type (:obj:`string`, optional, defaults to "cls_index"):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            Is one of the following options:
+
+            - 'last' => take the last token hidden state (like XLNet)
+            - 'first' => take the first token hidden state (like Bert)
+            - 'mean' => take the mean of all tokens hidden states
+            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+            - 'attn' => Not implemented now, use multi-head attention
+        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            Add a projection after the vector extraction
+        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            'tanh' => add a tanh activation to the output, Other => no activation.
+        summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
+        summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            Add a dropout before the projection and activation
+
+    Example::
+
+        >>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
+
+        >>> # Initializing a GPT configuration
+        >>> configuration = OpenAIGPTConfig()
+
+        >>> # Initializing a model from the configuration
+        >>> model = OpenAIGPTModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
    """

    model_type = "openai-gpt"