"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "d0efbd3cd1fa268a3f5c5235237ceae0bde69776"
Commit 715fa638 authored by Julien Chaumond's avatar Julien Chaumond
Browse files

Merge branch 'master' into from_scratch_training

parents 764f836d 100e3b6f
ALBERT
----------------------------------------------------
``AlbertConfig``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertConfig
......
@@ -34,6 +34,13 @@ XLM
:members:
``XLMForQuestionAnsweringSimple``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMForQuestionAnsweringSimple
:members:
``XLMForQuestionAnswering``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
@@ -36,6 +36,27 @@ XLNet
:members:
``XLNetForTokenClassification``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForTokenClassification
:members:
``XLNetForMultipleChoice``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForMultipleChoice
:members:
``XLNetForQuestionAnsweringSimple``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForQuestionAnsweringSimple
:members:
``XLNetForQuestionAnswering``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
@@ -42,6 +42,7 @@ class LmSeqsDataset(Dataset):
self.check()
self.remove_long_sequences()
self.remove_empty_sequences()
self.remove_unknown_sequences()
self.check()
self.print_statistics()
@@ -109,6 +110,22 @@ class LmSeqsDataset(Dataset):
new_size = len(self)
logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
def remove_unknown_sequences(self):
"""
Remove sequences with a (too) high level of unknown tokens.
"""
if "unk_token" not in self.params.special_tok_ids:
return
else:
unk_token_id = self.params.special_tok_ids["unk_token"]
init_size = len(self)
unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
indices = (unk_occs / self.lengths) < 0.5
self.token_ids = self.token_ids[indices]
self.lengths = self.lengths[indices]
new_size = len(self)
logger.info(f"Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).")
def print_statistics(self):
"""
Print some statistics on the corpus. Only the master process.
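A minimal standalone sketch of the 50% unknown-token filter introduced by `remove_unknown_sequences`, using toy data (array names mirror the patch, values are illustrative)::

    import numpy as np

    # Toy corpus: three token-id sequences, with 0 playing the role of the unk token id.
    token_ids = np.array([np.array([5, 0, 0, 0]), np.array([7, 8, 9]), np.array([0, 0, 1, 2])], dtype=object)
    lengths = np.array([len(a) for a in token_ids])

    unk_token_id = 0
    unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in token_ids])
    keep = (unk_occs / lengths) < 0.5  # keep sequences that are less than 50% unknown tokens
    token_ids, lengths = token_ids[keep], lengths[keep]
    # Only [7, 8, 9] survives: the other two sequences are at least 50% unknown tokens.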
......
{
"activation": "gelu",
"attention_dropout": 0.1,
"dim": 768,
"dropout": 0.1,
"hidden_dim": 3072,
"initializer_range": 0.02,
"max_position_embeddings": 512,
"n_heads": 12,
"n_layers": 6,
"sinusoidal_pos_embds": true,
"tie_weights_": true,
"vocab_size": 119547
}
\ No newline at end of file
{
"vocab_size": 50265,
"hidden_size": 768,
"num_hidden_layers": 6,
"num_attention_heads": 12,
"intermediate_size": 3072,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"attention_probs_dropout_prob": 0.1,
"max_position_embeddings": 514,
"type_vocab_size": 1,
"initializer_range": 0.02,
"layer_norm_eps": 0.00001
}
\ No newline at end of file
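Either JSON above can be loaded through the `from_json_file` helper inherited from `PretrainedConfig`; a brief sketch, assuming the first file has been saved locally under a placeholder path::

    from transformers import DistilBertConfig

    # "student_config.json" is a placeholder path for the first serialization config above.
    config = DistilBertConfig.from_json_file("student_config.json")
    print(config.n_layers, config.vocab_size)  # 6 119547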
@@ -344,6 +344,7 @@ def full_text_generation(
gamma=1.5,
gm_scale=0.9,
kl_scale=0.01,
repetition_penalty=1.0,
**kwargs
):
classifier, class_id = get_classifier(discrim, class_label, device)
@@ -368,7 +369,14 @@ def full_text_generation(
raise Exception("Specify either a bag of words or a discriminator")
unpert_gen_tok_text, _, _ = generate_text_pplm(
model=model,
tokenizer=tokenizer,
context=context,
device=device,
length=length,
sample=sample,
perturb=False,
repetition_penalty=repetition_penalty,
)
if device == "cuda":
torch.cuda.empty_cache()
@@ -401,6 +409,7 @@ def full_text_generation(
gamma=gamma,
gm_scale=gm_scale,
kl_scale=kl_scale,
repetition_penalty=repetition_penalty,
)
pert_gen_tok_texts.append(pert_gen_tok_text)
if classifier is not None:
@@ -437,6 +446,7 @@ def generate_text_pplm(
gamma=1.5,
gm_scale=0.9,
kl_scale=0.01,
repetition_penalty=1.0,
):
output_so_far = None
if context:
@@ -508,6 +518,13 @@ def generate_text_pplm(
pert_logits, past, pert_all_hidden = model(last, past=pert_past)
pert_logits = pert_logits[:, -1, :] / temperature # + SMALL_CONST
for token_idx in set(output_so_far[0].tolist()):
if pert_logits[0, token_idx] < 0:
pert_logits[0, token_idx] *= repetition_penalty
else:
pert_logits[0, token_idx] /= repetition_penalty
pert_probs = F.softmax(pert_logits, dim=-1)
if classifier is not None:
@@ -588,6 +605,7 @@ def run_pplm_example(
seed=0,
no_cuda=False,
colorama=False,
repetition_penalty=1.0,
):
# set Random seed
torch.manual_seed(seed)
@@ -655,6 +673,7 @@ def run_pplm_example(
gamma=gamma,
gm_scale=gm_scale,
kl_scale=kl_scale,
repetition_penalty=repetition_penalty,
)
# untokenize unperturbed text
@@ -767,6 +786,9 @@ if __name__ == "__main__":
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--no_cuda", action="store_true", help="no cuda")
parser.add_argument("--colorama", action="store_true", help="colors keywords")
parser.add_argument(
"--repetition_penalty", type=float, default=1.0, help="Penalize repetition. More than 1.0 -> less repetition",
)
args = parser.parse_args()
run_pplm_example(**vars(args))
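A minimal sketch of what the new repetition-penalty block does to the next-token logits, with toy values (not taken from the script)::

    import torch

    logits = torch.tensor([[2.0, -1.0, 0.5]])  # toy next-token logits for a 3-token vocabulary
    generated = [0, 2]                         # token ids already present in the output
    repetition_penalty = 1.2

    for token_idx in set(generated):
        if logits[0, token_idx] < 0:
            logits[0, token_idx] *= repetition_penalty  # push negative logits further down
        else:
            logits[0, token_idx] /= repetition_penalty  # shrink positive logits
    # logits -> tensor([[1.6667, -1.0000, 0.4167]]): already-generated tokens become less likely.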
@@ -31,9 +31,73 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class AlbertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`.
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30000):
Vocabulary size of the ALBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of vocabulary embeddings.
hidden_size (:obj:`int`, optional, defaults to 4096):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_hidden_groups (:obj:`int`, optional, defaults to 1):
Number of groups for the hidden layers, parameters in the same group are shared.
num_attention_heads (:obj:`int`, optional, defaults to 64):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 16384):
The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
inner_group_num (:obj:`int`, optional, defaults to 1):
The number of inner repetition of attention and ffn.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something
large (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example::
# Initializing an ALBERT-xxlarge style configuration
albert_xxlarge_configuration = AlbertConfig()
# Initializing an ALBERT-base style configuration
albert_base_configuration = AlbertConfig(
hidden_size=768,
num_attention_heads=12,
intermediate_size=3072,
)
# Initializing a model from the ALBERT-base style configuration
model = AlbertModel(albert_base_configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -58,35 +122,6 @@ class AlbertConfig(PretrainedConfig):
layer_norm_eps=1e-12,
**kwargs
):
"""Constructs AlbertConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
embedding_size: size of voc embeddings.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_hidden_groups: Number of group for the hidden layers, parameters in
the same group are shared.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
inner_group_num: int, number of inner repetition of attention and ffn.
down_scale_factor: float, the scale to apply
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`AlbertModel`.
initializer_range: The stdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(AlbertConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
......
@@ -77,32 +77,15 @@ CONFIG_MAPPING = OrderedDict(
)
class AutoConfig(object):
r"""
:class:`~transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library
when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
When using string matching, the configuration class is matched on
the `pretrained_model_name_or_path` string in the following order:
- contains `t5`: T5Config (T5 model)
- contains `distilbert`: DistilBertConfig (DistilBERT model)
- contains `albert`: AlbertConfig (ALBERT model)
- contains `camembert`: CamembertConfig (CamemBERT model)
- contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
- contains `roberta`: RobertaConfig (RoBERTa model)
- contains `bert`: BertConfig (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model)
- contains `xlm`: XLMConfig (XLM model)
- contains `ctrl` : CTRLConfig (CTRL model)
This class cannot be instantiated using `__init__()` (throw an error).
""" """
def __init__(self): def __init__(self):
...@@ -124,60 +107,61 @@ class AutoConfig: ...@@ -124,60 +107,61 @@ class AutoConfig:
@classmethod @classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r""" Instantiate one of the configuration classes of the library r""" Instantiates one of the configuration classes of the library
from a pre-trained model configuration. from a pre-trained model configuration.
The configuration class to instantiate is selected The configuration class to instantiate is selected
based on the `model_type` property of the config object, or when it's missing, based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string. falling back to using pattern matching on the `pretrained_model_name_or_path` string.
- contains `t5`: T5Config (T5 model) - contains `t5`: :class:`~transformers.T5Config` (T5 model)
- contains `distilbert`: DistilBertConfig (DistilBERT model) - contains `distilbert`: :class:`~transformers.DistilBertConfig` (DistilBERT model)
- contains `albert`: AlbertConfig (ALBERT model) - contains `albert`: :class:`~transformers.AlbertConfig` (ALBERT model)
- contains `camembert`: CamembertConfig (CamemBERT model) - contains `camembert`: :class:`~transformers.CamembertConfig` (CamemBERT model)
- contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model) - contains `xlm-roberta`: :class:`~transformers.XLMRobertaConfig` (XLM-RoBERTa model)
- contains `roberta`: RobertaConfig (RoBERTa model) - contains `roberta`: :class:`~transformers.RobertaConfig` (RoBERTa model)
- contains `bert`: BertConfig (Bert model) - contains `bert`: :class:`~transformers.BertConfig` (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - contains `openai-gpt`: :class:`~transformers.OpenAIGPTConfig` (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model) - contains `gpt2`: :class:`~transformers.GPT2Config` (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) - contains `transfo-xl`: :class:`~transformers.TransfoXLConfig` (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model) - contains `xlnet`: :class:`~transformers.XLNetConfig` (XLNet model)
- contains `xlm`: XLMConfig (XLM model) - contains `xlm`: :class:`~transformers.XLMConfig` (XLM model)
- contains `ctrl` : CTRLConfig (CTRL model) - contains `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model)
Args:
pretrained_model_name_or_path (:obj:`string`):
Is either: \
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
cache_dir (:obj:`string`, optional, defaults to `None`):
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
force_download (:obj:`boolean`, optional, defaults to `False`):
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
resume_download (:obj:`boolean`, optional, defaults to `False`):
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
proxies (:obj:`Dict[str, str]`, optional, defaults to `None`):
A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`.
The proxies are used on each request. See `the requests documentation <https://requests.readthedocs.io/en/master/user/advanced/#proxies>`__ for usage.
return_unused_kwargs (:obj:`boolean`, optional, defaults to `False`):
- If False, then this function returns just the final configuration object.
- If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): key/value pairs with which to update the configuration object after loading.
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
Examples::
config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
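For the `return_unused_kwargs` behaviour described above, a short sketch (fetching the configuration requires network access)::

    from transformers import AutoConfig

    # `foo` is not a configuration attribute, so it comes back in `unused` instead of being set on the config.
    config, unused = AutoConfig.from_pretrained("bert-base-uncased", foo=False, return_unused_kwargs=True)
    print(type(config).__name__)  # BertConfig
    print(unused)                 # {'foo': False}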
......
@@ -50,32 +50,61 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
It is used to instantiate a BERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the BERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
hidden_size (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example::
from transformers import BertModel, BertConfig
# Initializing a BERT bert-base-uncased style configuration
configuration = BertConfig()
# Initializing a model from the bert-base-uncased style configuration
model = BertModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "bert" model_type = "bert"
...@@ -97,6 +126,7 @@ class BertConfig(PretrainedConfig): ...@@ -97,6 +126,7 @@ class BertConfig(PretrainedConfig):
**kwargs **kwargs
): ):
super(BertConfig, self).__init__(**kwargs) super(BertConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers self.num_hidden_layers = num_hidden_layers
......
@@ -29,5 +29,35 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class CamembertConfig(RobertaConfig):
r"""
This is the configuration class to store the configuration of an :class:`~transformers.CamembertModel`.
It is used to instantiate a CamemBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
The :class:`~transformers.CamembertConfig` class directly inherits :class:`~transformers.RobertaConfig`.
It reuses the same defaults. Please check the parent class for more information.
Example::
from transformers import CamembertModel, CamembertConfig
# Initializing a CamemBERT configuration
configuration = CamembertConfig()
# Initializing a model from the configuration
model = CamembertModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "camembert"
@@ -26,25 +26,60 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf
class CTRLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of an :class:`~transformers.CTRLModel`.
It is used to instantiate a CTRL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 246534):
Vocabulary size of the CTRL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
n_positions (:obj:`int`, optional, defaults to 256):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 256):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 1280):
Dimensionality of the embeddings and hidden states.
dff (:obj:`int`, optional, defaults to 8192):
Dimensionality of the inner dimension of the FFN.
n_layer (:obj:`int`, optional, defaults to 48):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Example::
from transformers import CTRLModel, CTRLConfig
# Initializing a CTRL configuration
configuration = CTRLConfig()
# Initializing a model from the configuration
model = CTRLModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
...@@ -71,26 +106,6 @@ class CTRLConfig(PretrainedConfig): ...@@ -71,26 +106,6 @@ class CTRLConfig(PretrainedConfig):
summary_first_dropout=0.1, summary_first_dropout=0.1,
**kwargs **kwargs
): ):
"""Constructs CTRLConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
dff: Size of the inner dimension of the FFN.
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The sttdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(CTRLConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.n_ctx = n_ctx
......
@@ -31,6 +31,67 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DistilBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the DistilBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings.
n_layers (:obj:`int`, optional, defaults to 6):
Number of hidden layers in the Transformer encoder.
n_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
dim (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (:obj:`int`, optional, defaults to 3072):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
qa_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probabilities used in the question answering model
:class:`~transformers.DistilBertForQuestionAnswering`.
seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
The dropout probabilities used in the sequence classification model
:class:`~transformers.DistilBertForSequenceClassification`.
Example::
from transformers import DistilBertModel, DistilBertConfig
# Initializing a DistilBERT configuration
configuration = DistilBertConfig()
# Initializing a model from the configuration
model = DistilBertModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "distilbert"
@@ -47,7 +108,6 @@ class DistilBertConfig(PretrainedConfig):
attention_dropout=0.1,
activation="gelu",
initializer_range=0.02,
tie_weights_=True,
qa_dropout=0.1,
seq_classif_dropout=0.2,
**kwargs
@@ -64,7 +124,6 @@ class DistilBertConfig(PretrainedConfig):
self.attention_dropout = attention_dropout
self.activation = activation
self.initializer_range = initializer_range
self.tie_weights_ = tie_weights_
self.qa_dropout = qa_dropout
self.seq_classif_dropout = seq_classif_dropout
......
@@ -33,24 +33,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class GPT2Config(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 50257):
Vocabulary size of the GPT-2 model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
n_positions (:obj:`int`, optional, defaults to 1024):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 1024):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
from transformers import GPT2Model, GPT2Config
# Initializing a GPT2 configuration
configuration = GPT2Config()
# Initializing a model from the configuration
model = GPT2Model(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
...@@ -76,26 +136,8 @@ class GPT2Config(PretrainedConfig): ...@@ -76,26 +136,8 @@ class GPT2Config(PretrainedConfig):
summary_first_dropout=0.1, summary_first_dropout=0.1,
**kwargs **kwargs
): ):
"""Constructs GPT2Config.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The sttdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(GPT2Config, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.n_ctx = n_ctx
self.n_positions = n_positions
......
@@ -26,9 +26,13 @@ class MMBTConfig(object):
"""Configuration class to store the configuration of a `MMBT Model`.
Args:
config (:obj:`~transformers.PretrainedConfig`):
Config of the underlying Transformer models. Its values are
copied over to use a single config.
num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`):
Size of final Linear layer for classification.
modal_hidden_size (:obj:`int`, optional, defaults to 2048):
Embedding dimension of the non-text modality encoder.
""" """
def __init__(self, config, num_labels=None, modal_hidden_size=2048): def __init__(self, config, num_labels=None, modal_hidden_size=2048):
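A brief usage sketch of the constructor shown above, assuming `MMBTConfig` is exported from the top-level package (it is defined in the `configuration_mmbt.py` file touched here)::

    from transformers import BertConfig, MMBTConfig

    # Wrap an existing transformer config; its values are copied over into the single MMBT config.
    transformer_config = BertConfig()
    mmbt_config = MMBTConfig(transformer_config, num_labels=2, modal_hidden_size=2048)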
......
@@ -30,27 +30,87 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class OpenAIGPTConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of an :class:`~transformers.OpenAIGPTModel`.
It is used to instantiate a GPT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 40478):
Vocabulary size of the GPT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
n_positions (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 512):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether special tokens should be predicted when the model has a language modeling head.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
from transformers import OpenAIGPTConfig, OpenAIGPTModel
# Initializing a GPT configuration
configuration = OpenAIGPTConfig()
# Initializing a model from the configuration
model = OpenAIGPTModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
...@@ -78,9 +138,8 @@ class OpenAIGPTConfig(PretrainedConfig): ...@@ -78,9 +138,8 @@ class OpenAIGPTConfig(PretrainedConfig):
summary_first_dropout=0.1, summary_first_dropout=0.1,
**kwargs **kwargs
): ):
"""Constructs OpenAIGPTConfig.
"""
super(OpenAIGPTConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.n_ctx = n_ctx
self.n_positions = n_positions
......
@@ -34,5 +34,35 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RobertaConfig(BertConfig):
r"""
This is the configuration class to store the configuration of an :class:`~transformers.RobertaModel`.
It is used to instantiate a RoBERTa model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
It reuses the same defaults. Please check the parent class for more information.
Example::
from transformers import RobertaConfig, RobertaModel
# Initializing a RoBERTa configuration
configuration = RobertaConfig()
# Initializing a model from the configuration
model = RobertaModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "roberta"
@@ -29,39 +29,91 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class TransfoXLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of an :class:`~transformers.TransfoXLModel`.
It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 267735):
Vocabulary size of the Transformer XL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
Cutoffs for the adaptive softmax
d_model (:obj:`int`, optional, defaults to 1024):
Dimensionality of the model's hidden states.
d_embed (:obj:`int`, optional, defaults to 1024):
Dimensionality of the embeddings
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_head (:obj:`int`, optional, defaults to 64):
Dimensionality of the model's heads.
d_inner (:obj:`int`, optional, defaults to 4096):
Inner dimension in FF
div_val (:obj:`int`, optional, defaults to 4):
Dividend value for adaptive input and softmax
pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
Apply LayerNorm to the input instead of the output
n_layer (:obj:`int`, optional, defaults to 18):
Number of hidden layers in the Transformer encoder.
tgt_len (:obj:`int`, optional, defaults to 128):
Number of tokens to predict
ext_len (:obj:`int`, optional, defaults to 0):
Length of the extended context
mem_len (:obj:`int`, optional, defaults to 1600):
Length of the retained previous heads
clamp_len (:obj:`int`, optional, defaults to 1000):
Use the same pos embeddings after clamp_len
same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
Use the same attn length for all tokens
proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
True to share all but first projs, False not to share.
attn_type (:obj:`int`, optional, defaults to 0):
Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
sample_softmax (:obj:`int`, optional, defaults to -1):
number of samples in sampled softmax
adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
use adaptive softmax
tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`):
tie the word embedding and softmax weights
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
dropatt (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Untie relative position biases
init (:obj:`string`, optional, defaults to `normal`):
Parameter initializer to use
init_range (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by U(-init_range, init_range).
proj_init_std (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by N(0, init_std)
init_std (:obj:`float`, optional, defaults to 0.02):
Parameters initialized by N(0, init_std)
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
Example::
from transformers import TransfoXLConfig, TransfoXLModel
# Initializing a Transformer XL configuration
configuration = TransfoXLConfig()
# Initializing a model from the configuration
model = TransfoXLModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
...@@ -99,9 +151,8 @@ class TransfoXLConfig(PretrainedConfig):
layer_norm_epsilon=1e-5,
**kwargs
):
super(TransfoXLConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.cutoffs = []
self.cutoffs.extend(cutoffs)
...
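To make the memory-related arguments above concrete, here is a hedged sketch of a deliberately small Transformer XL configuration; every value is an illustrative choice rather than a recommended or pretrained setting, and `cutoffs` is shrunk to stay consistent with the reduced `vocab_size`::

    from transformers import TransfoXLConfig, TransfoXLModel

    # A small, illustrative Transformer XL configuration.
    configuration = TransfoXLConfig(
        vocab_size=10000,
        cutoffs=[2000, 4000, 8000],  # adaptive softmax cutoffs, kept below vocab_size
        d_model=256,
        d_embed=256,
        n_layer=4,
        n_head=4,
        d_head=64,
        d_inner=1024,
        mem_len=128,       # number of previous hidden states kept as memory
        clamp_len=400,     # reuse the same positional embedding beyond this distance
        same_length=True,  # same attention length for every token
    )

    # Build a randomly initialized model and check the stored settings.
    model = TransfoXLModel(configuration)
    print(configuration.to_json_string())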
...@@ -37,44 +37,124 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLMConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `XLMModel`. """
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
Args: It is used to instantiate an XLM model according to the specified arguments, defining the model
vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`. architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
d_model: Size of the encoder layers and the pooler layer. the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
the Transformer encoder. to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
d_inner: The size of the "intermediate" (i.e., feed-forward) for more information.
layer in the Transformer encoder.
ff_activation: The non-linear activation function (function or string) in the Args:
encoder and pooler. If string, "gelu", "relu" and "swish" are supported. vocab_size (:obj:`int`, optional, defaults to 30145):
untie_r: untie relative position biases Vocabulary size of the XLM model. Defines the different tokens that
attn_type: 'bi' for XLM, 'uni' for Transformer-XL can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
dropout: The dropout probabilitiy for all fully connected Dimensionality of the encoder layers and the pooler layer.
layers in the embeddings, encoder, and pooler. n_layer (:obj:`int`, optional, defaults to 12):
max_position_embeddings: The maximum sequence length that this model might Number of hidden layers in the Transformer encoder.
ever be used with. Typically set this to something large just in case n_head (:obj:`int`, optional, defaults to 16):
(e.g., 512 or 1024 or 2048). Number of attention heads for each attention layer in the Transformer encoder.
initializer_range: The sttdev of the truncated_normal_initializer for dropout (:obj:`float`, optional, defaults to 0.1):
initializing all weight matrices. The dropout probability for all fully connected
layer_norm_eps: The epsilon used by LayerNorm. layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
dropout: float, dropout rate. The dropout probability for the attention mechanism
init: str, the initialization scheme, either "normal" or "uniform". gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
init_range: float, initialize the parameters with a uniform distribution The non-linear activation function (function or string) in the
in [-init_range, init_range]. Only effective when init="uniform". encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
init_std: float, initialize the parameters with a normal distribution sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
with mean 0 and stddev init_std. Only effective when init="normal". Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
mem_len: int, the number of tokens to cache. causal (:obj:`boolean`, optional, defaults to :obj:`False`):
reuse_len: int, the number of tokens in the currect batch to be cached Set this to `True` for the model to behave in a causal manner.
and reused in the future. Causal models use a triangular attention mask in order to only attend to the left-side context instead
bi_data: bool, whether to use bidirectional input pipeline. if a bidirectional context.
Usually set to True during pretraining and False during finetuning. asm (:obj:`boolean`, optional, defaults to :obj:`False`):
clamp_len: int, clamp all relative distances larger than clamp_len. Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
-1 means no clamping. layer.
same_length: bool, whether to use the same attention length for each token. n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`)
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`int`, optional, defaults to 50257):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
Example::
from transformers import XLMConfig, XLMModel
# Initializing a XLM configuration
configuration = XLMConfig()
# Initializing a model from the configuration
model = XLMModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
...
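The language-related arguments (`n_langs`, `use_lang_emb`, `causal`) are usually the first ones to change when moving between monolingual and multilingual XLM setups. A minimal sketch follows, with illustrative values and keyword names assumed to match the XLMConfig constructor of this transformers version::

    from transformers import XLMConfig, XLMModel

    # A monolingual, causal XLM-style configuration; all values are illustrative.
    configuration = XLMConfig(
        vocab_size=30145,
        emb_dim=1024,        # must stay divisible by the number of attention heads
        n_langs=1,           # monolingual: a single language
        use_lang_emb=False,  # no additional language embeddings needed
        causal=True,         # triangular attention mask (left-side context only)
    )

    # Build a randomly initialized model from the configuration.
    model = XLMModel(configuration)
    print(configuration.to_json_string())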
...@@ -30,42 +30,102 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLNetConfig(PretrainedConfig):
"""Configuration class to store the configuration of a ``XLNetModel``. """
This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
Args: It is used to instantiate an XLNet model according to the specified arguments, defining the model
vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
d_model: Size of the encoder layers and the pooler layer. the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
the Transformer encoder. to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
d_inner: The size of the "intermediate" (i.e., feed-forward) for more information.
layer in the Transformer encoder.
ff_activation: The non-linear activation function (function or string) in the Args:
encoder and pooler. If string, "gelu", "relu" and "swish" are supported. vocab_size (:obj:`int`, optional, defaults to 32000):
untie_r: untie relative position biases Vocabulary size of the XLNet model. Defines the different tokens that
attn_type: 'bi' for XLNet, 'uni' for Transformer-XL can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
d_model (:obj:`int`, optional, defaults to 1024):
dropout: The dropout probabilitiy for all fully connected Dimensionality of the encoder layers and the pooler layer.
layers in the embeddings, encoder, and pooler. n_layer (:obj:`int`, optional, defaults to 24):
initializer_range: The sttdev of the truncated_normal_initializer for Number of hidden layers in the Transformer encoder.
initializing all weight matrices. n_head (:obj:`int`, optional, defaults to 16):
layer_norm_eps: The epsilon used by LayerNorm. Number of attention heads for each attention layer in the Transformer encoder.
d_inner (:obj:`int`, optional, defaults to 4096):
dropout: float, dropout rate. Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
init: str, the initialization scheme, either "normal" or "uniform". ff_activation (:obj:`string`, optional, defaults to "gelu"):
init_range: float, initialize the parameters with a uniform distribution The non-linear activation function (function or string) in the
in [-init_range, init_range]. Only effective when init="uniform". encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
init_std: float, initialize the parameters with a normal distribution untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
with mean 0 and stddev init_std. Only effective when init="normal". Untie relative position biases
mem_len: int, the number of tokens to cache. attn_type (:obj:`string`, optional, defaults to "bi"):
reuse_len: int, the number of tokens in the currect batch to be cached The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
and reused in the future. initializer_range (:obj:`float`, optional, defaults to 0.02):
bi_data: bool, whether to use bidirectional input pipeline. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Usually set to True during pretraining and False during finetuning. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
clamp_len: int, clamp all relative distances larger than clamp_len. The epsilon used by the layer normalization layers.
-1 means no clamping. dropout (:obj:`float`, optional, defaults to 0.1):
same_length: bool, whether to use the same attention length for each token. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
finetuning_task: name of the glue task on which the model was fine-tuned if any mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens to cache. The key/value pairs that have already been pre-computed
in a previous forward pass won't be re-computed. See the
`quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
for more information.
reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens in the current batch to be cached and reused in the future.
bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use bidirectional input pipeline. Usually set to `True` during
pretraining and `False` during finetuning.
clamp_len (:obj:`int`, optional, defaults to -1):
Clamp all relative distances larger than clamp_len.
Setting this attribute to -1 means no clamping.
same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use the same attention length for each token.
summary_type (:obj:`string`, optional, defaults to "last"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a dropout after the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
Example::
from transformers import XLNetConfig, XLNetModel
# Initializing a XLNet configuration
configuration = XLNetConfig()
# Initializing a model from the configuration
model = XLNetModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
...
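Since `mem_len` and `reuse_len` control XLNet's recurrence-style caching, a quick way to experiment with them is a scaled-down configuration. A hedged sketch with illustrative values only::

    from transformers import XLNetConfig, XLNetModel

    # A small XLNet-style configuration; the values are illustrative only.
    configuration = XLNetConfig(
        vocab_size=32000,
        d_model=512,    # must be divisible by n_head
        n_layer=6,
        n_head=8,
        d_inner=2048,
        mem_len=256,    # cache up to 256 hidden states from previous forward passes
        clamp_len=-1,   # -1 disables clamping of relative distances
    )

    # Build a randomly initialized model and inspect the stored settings.
    model = XLNetModel(configuration)
    print(configuration.to_json_string())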