Commit 6c32d8bb authored by Lysandre, committed by Lysandre Debut

Size > Dimensionality + Remove final TODOs

parent 760164d6
@@ -47,9 +47,9 @@ class AlbertConfig(PretrainedConfig):
 Vocabulary size of the ALBERT model. Defines the different tokens that
 can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
 embedding_size (:obj:`int`, optional, defaults to 128):
-Size of vocabulary embeddings.
+Dimensionality of vocabulary embeddings.
 hidden_size (:obj:`int`, optional, defaults to 4096):
-Size of the encoder layers and the pooler layer.
+Dimensionality of the encoder layers and the pooler layer.
 num_hidden_layers (:obj:`int`, optional, defaults to 12):
 Number of hidden layers in the Transformer encoder.
 num_hidden_groups (:obj:`int`, optional, defaults to 1):
@@ -57,7 +57,7 @@ class AlbertConfig(PretrainedConfig):
 num_attention_heads (:obj:`int`, optional, defaults to 64):
 Number of attention heads for each attention layer in the Transformer encoder.
 intermediate_size (:obj:`int`, optional, defaults to 16384):
-The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
 inner_group_num (:obj:`int`, optional, defaults to 1):
 The number of inner repetition of attention and ffn.
 hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
......
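For reference (not part of this commit), the sketch below shows how the ALBERT parameters documented above surface on the config object. It assumes a transformers release contemporary with this commit, so that the AlbertConfig defaults match the values stated in the docstring.

from transformers import AlbertConfig

# Defaults per the docstring above (assumed to match the installed version).
config = AlbertConfig()
assert config.embedding_size == 128       # dimensionality of the vocabulary embeddings
assert config.hidden_size == 4096         # dimensionality of the encoder layers and pooler
assert config.intermediate_size == 16384  # dimensionality of the feed-forward layer
# ALBERT keeps the embedding dimensionality much smaller than the hidden
# dimensionality and projects up, which is why the two are separate parameters.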
@@ -65,13 +65,13 @@ class BertConfig(PretrainedConfig):
 Vocabulary size of the BERT model. Defines the different tokens that
 can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
 hidden_size (:obj:`int`, optional, defaults to 768):
-Size of the encoder layers and the pooler layer.
+Dimensionality of the encoder layers and the pooler layer.
 num_hidden_layers (:obj:`int`, optional, defaults to 12):
 Number of hidden layers in the Transformer encoder.
 num_attention_heads (:obj:`int`, optional, defaults to 12):
 Number of attention heads for each attention layer in the Transformer encoder.
 intermediate_size (:obj:`int`, optional, defaults to 3072):
-The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
 hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
 The non-linear activation function (function or string) in the encoder and pooler.
 If string, "gelu", "relu", "swish" and "gelu_new" are supported.
......
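Likewise, a minimal check (not from the commit, assuming a contemporary transformers install) that the BertConfig defaults line up with the docstring above; DistilBERT's `dim` parameter below plays the same role as `hidden_size` here.

from transformers import BertConfig

config = BertConfig()
assert config.hidden_size == 768         # encoder and pooler dimensionality
assert config.intermediate_size == 3072  # feed-forward dimensionality, 4 x hidden_size
assert config.num_hidden_layers == 12
assert config.num_attention_heads == 12
assert config.hidden_act == "gelu"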
@@ -44,11 +44,11 @@ class CTRLConfig(PretrainedConfig):
 The maximum sequence length that this model might ever be used with.
 Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
 n_ctx (:obj:`int`, optional, defaults to 256):
-Size of the causal mask (usually same as n_positions).
+Dimensionality of the causal mask (usually same as n_positions).
 n_embd (:obj:`int`, optional, defaults to 1280):
 Dimensionality of the embeddings and hidden states.
 dff (:obj:`int`, optional, defaults to 8192):
-Size of the inner dimension of the FFN.
+Dimensionality of the inner dimension of the FFN.
 n_layer (:obj:`int`, optional, defaults to 48):
 Number of hidden layers in the Transformer encoder.
 n_head (:obj:`int`, optional, defaults to 16):
......
@@ -56,7 +56,7 @@ class DistilBertConfig(PretrainedConfig):
 n_heads (:obj:`int`, optional, defaults to 12):
 Number of attention heads for each attention layer in the Transformer encoder.
 dim (:obj:`int`, optional, defaults to 768):
-Size of the encoder layers and the pooler layer.
+Dimensionality of the encoder layers and the pooler layer.
 intermediate_size (:obj:`int`, optional, defaults to 3072):
 The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
 dropout (:obj:`float`, optional, defaults to 0.1):
......
@@ -52,7 +52,7 @@ class GPT2Config(PretrainedConfig):
 The maximum sequence length that this model might ever be used with.
 Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
 n_ctx (:obj:`int`, optional, defaults to 1024):
-Size of the causal mask (usually same as n_positions).
+Dimensionality of the causal mask (usually same as n_positions).
 n_embd (:obj:`int`, optional, defaults to 768):
 Dimensionality of the embeddings and hidden states.
 n_layer (:obj:`int`, optional, defaults to 12):
......
@@ -47,7 +47,7 @@ class OpenAIGPTConfig(PretrainedConfig):
 The maximum sequence length that this model might ever be used with.
 Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
 n_ctx (:obj:`int`, optional, defaults to 512):
-Size of the causal mask (usually same as n_positions).
+Dimensionality of the causal mask (usually same as n_positions).
 n_embd (:obj:`int`, optional, defaults to 768):
 Dimensionality of the embeddings and hidden states.
 n_layer (:obj:`int`, optional, defaults to 12):
......
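The GPT-style configs above (CTRL, GPT-2, OpenAI GPT) use the n_embd / n_ctx / n_positions naming instead of hidden_size. A small sketch, not part of the commit and assuming a contemporary transformers version, illustrating that n_ctx defaults to the same value as n_positions, as the docstrings note:

from transformers import CTRLConfig, GPT2Config, OpenAIGPTConfig

for cfg in (OpenAIGPTConfig(), GPT2Config(), CTRLConfig()):
    # n_embd is the hidden-state dimensionality; n_ctx (the causal mask length)
    # defaults to the same value as n_positions in all three configs.
    assert cfg.n_ctx == cfg.n_positions
    print(type(cfg).__name__, cfg.n_embd, cfg.n_positions, cfg.n_ctx)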
@@ -72,7 +72,8 @@ class XLMConfig(PretrainedConfig):
 Causal models use a triangular attention mask in order to only attend to the left-side context instead
 if a bidirectional context.
 asm (:obj:`boolean`, optional, defaults to :obj:`False`):
-TODO
+Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
+layer.
 n_langs (:obj:`int`, optional, defaults to 1):
 The number of languages the model handles. Set to 1 for monolingual models.
 use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`)
......
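The asm flag is the TODO this commit fills in. A minimal illustration (not part of the commit, assuming a contemporary transformers version) of toggling it on the config; when a model is built with asm=True, its prediction head uses an adaptive log softmax instead of a plain linear projection.

from transformers import XLMConfig

default_config = XLMConfig()           # asm defaults to False: linear prediction layer
adaptive_config = XLMConfig(asm=True)  # adaptive log softmax projection layer instead
print(default_config.asm, adaptive_config.asm)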
@@ -45,13 +45,13 @@ class XLNetConfig(PretrainedConfig):
 Vocabulary size of the XLNet model. Defines the different tokens that
 can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
 d_model (:obj:`int`, optional, defaults to 1024):
-Size of the encoder layers and the pooler layer.
+Dimensionality of the encoder layers and the pooler layer.
 n_layer (:obj:`int`, optional, defaults to 24):
 Number of hidden layers in the Transformer encoder.
 n_head (:obj:`int`, optional, defaults to 16):
 Number of attention heads for each attention layer in the Transformer encoder.
 d_inner (:obj:`int`, optional, defaults to 4096):
-The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
 ff_activation (:obj:`string`, optional, defaults to "gelu"):
 The non-linear activation function (function or string) in the
 encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
......
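Finally, a hypothetical check (not from the commit, assuming a contemporary transformers install) that the XLNet defaults match the docstring above:

from transformers import XLNetConfig

config = XLNetConfig()
assert config.d_model == 1024  # dimensionality of the hidden states and pooler
assert config.d_inner == 4096  # dimensionality of the feed-forward ("intermediate") layer
assert config.n_layer == 24 and config.n_head == 16
assert config.ff_activation == "gelu"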