"git@developer.sourcefind.cn:wangsen/paddle_dbnet.git" did not exist on "839bb4281a4aa400393d52d683ee194418204d97"
Commit 63268272 authored by Lysandre Debut, committed by Lysandre Debut

Updated Configurations

parent 2b566c18
ALBERT
----------------------------------------------------
``AlbertConfig``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertConfig
...
@@ -31,9 +31,73 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class AlbertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`.
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ALBERT xxlarge architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30000):
Vocabulary size of the ALBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Size of vocabulary embeddings.
hidden_size (:obj:`int`, optional, defaults to 4096):
Size of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_hidden_groups (:obj:`int`, optional, defaults to 1):
Number of groups for the hidden layers, parameters in the same group are shared.
num_attention_heads (:obj:`int`, optional, defaults to 64):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 16384):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
inner_group_num (:obj:`int`, optional, defaults to 1):
The number of inner repetitions of attention and FFN layers.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something
large (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example::
# Initializing an ALBERT-xxlarge style configuration
albert_xxlarge_configuration = AlbertConfig()
# Initializing an ALBERT-base style configuration
albert_base_configuration = AlbertConfig(
hidden_size=768,
num_attention_heads=12,
intermediate_size=3072,
)
# Initializing a model from the ALBERT-base style configuration
model = AlbertModel(albert_base_configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
...@@ -57,35 +121,6 @@ class AlbertConfig(PretrainedConfig): ...@@ -57,35 +121,6 @@ class AlbertConfig(PretrainedConfig):
layer_norm_eps=1e-12, layer_norm_eps=1e-12,
**kwargs **kwargs
): ):
"""Constructs AlbertConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
embedding_size: size of voc embeddings.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_hidden_groups: Number of group for the hidden layers, parameters in
the same group are shared.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
inner_group_num: int, number of inner repetition of attention and ffn.
down_scale_factor: float, the scale to apply
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`AlbertModel`.
initializer_range: The stdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(AlbertConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
...
@@ -57,29 +57,13 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
class AutoConfig(object):
r"""
:class:`~transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library
when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct configuration class instance
using pattern matching on the `pretrained_model_name_or_path` string argument.
The base model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `distilbert`: DistilBertConfig (DistilBERT model)
- contains `albert`: AlbertConfig (ALBERT model)
- contains `camembert`: CamembertConfig (CamemBERT model)
- contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
- contains `roberta`: RobertaConfig (RoBERTa model)
- contains `bert`: BertConfig (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model)
- contains `xlm`: XLMConfig (XLM model)
- contains `ctrl` : CTRLConfig (CTRL model)
This class cannot be instantiated using `__init__()` (throws an error).
""" """
def __init__(self): def __init__(self):
...@@ -94,6 +78,8 @@ class AutoConfig(object): ...@@ -94,6 +78,8 @@ class AutoConfig(object):
return DistilBertConfig(*args, **kwargs) return DistilBertConfig(*args, **kwargs)
elif "roberta" in model_type: elif "roberta" in model_type:
return RobertaConfig(*args, **kwargs) return RobertaConfig(*args, **kwargs)
elif "albert" in model_type:
return AlbertConfig(*args, **kwargs)
elif "bert" in model_type: elif "bert" in model_type:
return BertConfig(*args, **kwargs) return BertConfig(*args, **kwargs)
elif "openai-gpt" in model_type: elif "openai-gpt" in model_type:
@@ -108,8 +94,6 @@ class AutoConfig(object):
return XLMConfig(*args, **kwargs)
elif "ctrl" in model_type:
return CTRLConfig(*args, **kwargs)
elif "albert" in model_type:
return AlbertConfig(*args, **kwargs)
elif "camembert" in model_type: elif "camembert" in model_type:
return CamembertConfig(*args, **kwargs) return CamembertConfig(*args, **kwargs)
raise ValueError( raise ValueError(
@@ -120,59 +104,60 @@ class AutoConfig(object):
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r""" Instantiates one of the configuration classes of the library
from a pre-trained model configuration.
The configuration class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `t5`: :class:`~transformers.T5Config` (T5 model)
- contains `distilbert`: :class:`~transformers.DistilBertConfig` (DistilBERT model)
- contains `albert`: :class:`~transformers.AlbertConfig` (ALBERT model)
- contains `camembert`: :class:`~transformers.CamembertConfig` (CamemBERT model)
- contains `xlm-roberta`: :class:`~transformers.XLMRobertaConfig` (XLM-RoBERTa model)
- contains `roberta`: :class:`~transformers.RobertaConfig` (RoBERTa model)
- contains `bert`: :class:`~transformers.BertConfig` (Bert model)
- contains `openai-gpt`: :class:`~transformers.OpenAIGPTConfig` (OpenAI GPT model)
- contains `gpt2`: :class:`~transformers.GPT2Config` (OpenAI GPT-2 model)
- contains `transfo-xl`: :class:`~transformers.TransfoXLConfig` (Transformer-XL model)
- contains `xlnet`: :class:`~transformers.XLNetConfig` (XLNet model)
- contains `xlm`: :class:`~transformers.XLMConfig` (XLM model)
- contains `ctrl`: :class:`~transformers.CTRLConfig` (CTRL model)
Args:
pretrained_model_name_or_path (:obj:`string`):
Is either: \
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
cache_dir (:obj:`string`, optional, defaults to `None`):
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
force_download (:obj:`boolean`, optional, defaults to `False`):
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
resume_download (:obj:`boolean`, optional, defaults to `False`):
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
proxies (:obj:`Dict[str, str]`, optional, defaults to `None`):
A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`.
The proxies are used on each request. See `the requests documentation <https://requests.readthedocs.io/en/master/user/advanced/#proxies>`__ for usage.
return_unused_kwargs (:obj:`boolean`, optional, defaults to `False`):
- If False, then this function returns just the final configuration object.
- If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): key/value pairs with which to update the configuration object after loading.
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
Examples::
config = AutoConfig.from_pretrained('bert-base-uncased')  # Download configuration from S3 and cache.
config = AutoConfig.from_pretrained('./test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
...
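AutoConfig dispatches on plain substring matches of the model identifier, which is why the `albert` branch above had to move ahead of the `bert` branch: "albert" contains "bert", so the more specific pattern must be tested first. A minimal sketch of that ordering concern (illustrative only, not the library's actual code)::

    def pick_config_name(model_type):
        # Order matters: more specific patterns must come before the bare "bert".
        for pattern, name in [("distilbert", "DistilBertConfig"),
                              ("roberta", "RobertaConfig"),
                              ("albert", "AlbertConfig"),
                              ("bert", "BertConfig")]:
            if pattern in model_type:
                return name
        raise ValueError("Unrecognized model identifier: {}".format(model_type))

    assert pick_config_name("albert-base-v1") == "AlbertConfig"
    assert pick_config_name("bert-base-uncased") == "BertConfig"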
@@ -50,32 +50,44 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
It is used to instantiate a BERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT bert-base-uncased architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the BERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
hidden_size (:obj:`int`, optional, defaults to 768):
Size of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 3072):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
"""
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -96,6 +108,7 @@ class BertConfig(PretrainedConfig):
**kwargs
):
super(BertConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
...
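A minimal usage sketch for the configuration documented above, assuming :class:`~transformers.BertConfig` and :class:`~transformers.BertModel` are importable with the defaults listed here::

    from transformers import BertConfig, BertModel

    # The documented defaults correspond to the bert-base-uncased architecture.
    configuration = BertConfig()
    # Override a few hyper-parameters for a BERT-large style setup.
    large_configuration = BertConfig(hidden_size=1024, num_hidden_layers=24,
                                     num_attention_heads=16, intermediate_size=4096)
    model = BertModel(large_configuration)
    assert model.config.hidden_size == 1024  # the model keeps its configuration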
@@ -29,4 +29,17 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class CamembertConfig(RobertaConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.CamembertModel`.
It is used to instantiate a CamemBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT bert-base-uncased architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
The :class:`~transformers.CamembertConfig` class directly inherits :class:`~transformers.RobertaConfig`.
It reuses the same defaults. Please check the parent class for more information.
"""
pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
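Since :class:`~transformers.CamembertConfig` only overrides the pretrained archive map, a usage sketch looks the same as for RoBERTa (the ``camembert-base`` shortcut name is an assumption about the checkpoints available in this version)::

    from transformers import CamembertConfig

    # Same argument names and defaults as RobertaConfig / BertConfig.
    configuration = CamembertConfig(vocab_size=32005)
    # Or load the hyper-parameters shipped with a pretrained checkpoint.
    configuration = CamembertConfig.from_pretrained("camembert-base")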
@@ -26,25 +26,43 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf
class CTRLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of an :class:`~transformers.CTRLModel`.
It is used to instantiate a CTRL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the CTRL architecture from Salesforce.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 246534):
Vocabulary size of the CTRL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
n_positions (:obj:`int`, optional, defaults to 256):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 256):
Size of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 1280):
Dimensionality of the embeddings and hidden states.
dff (:obj:`int`, optional, defaults to 8192):
Size of the inner dimension of the FFN.
n_layer (:obj:`int`, optional, defaults to 48):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
The epsilon to use in the layer normalization layers.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
"""
pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -70,26 +88,6 @@ class CTRLConfig(PretrainedConfig):
summary_first_dropout=0.1,
**kwargs
):
"""Constructs CTRLConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
dff: Size of the inner dimension of the FFN.
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The sttdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(CTRLConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.n_ctx = n_ctx
...
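A usage sketch for the defaults listed above, assuming :class:`~transformers.CTRLConfig` is importable as in this commit::

    from transformers import CTRLConfig

    # The documented defaults describe the released CTRL model.
    configuration = CTRLConfig()
    assert configuration.n_embd == 1280 and configuration.n_layer == 48
    # A much smaller variant for experimentation.
    small_configuration = CTRLConfig(n_embd=256, n_layer=6, n_head=8)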
@@ -31,6 +31,50 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DistilBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the DistilBERT distilbert-base-uncased architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the DistilBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.DistilBertModel`.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings.
n_layers (:obj:`int`, optional, defaults to 6):
Number of hidden layers in the Transformer encoder.
n_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
dim (:obj:`int`, optional, defaults to 768):
Size of the encoder layers and the pooler layer.
hidden_dim (:obj:`int`, optional, defaults to 3072):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
qa_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability used in the question answering model
:class:`~transformers.DistilBertForQuestionAnswering`.
seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
The dropout probability used in the sequence classification model
:class:`~transformers.DistilBertForSequenceClassification`.
"""
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(
@@ -46,7 +90,6 @@ class DistilBertConfig(PretrainedConfig):
attention_dropout=0.1,
activation="gelu",
initializer_range=0.02,
tie_weights_=True,
qa_dropout=0.1,
seq_classif_dropout=0.2,
**kwargs
@@ -63,7 +106,6 @@ class DistilBertConfig(PretrainedConfig):
self.attention_dropout = attention_dropout
self.activation = activation
self.initializer_range = initializer_range
self.tie_weights_ = tie_weights_
self.qa_dropout = qa_dropout
self.seq_classif_dropout = seq_classif_dropout
...
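A short sketch of how the arguments above are used, assuming :class:`~transformers.DistilBertConfig` is importable as in this commit (only argument names taken from the docstring are used)::

    from transformers import DistilBertConfig

    # The documented defaults correspond to distilbert-base-uncased.
    configuration = DistilBertConfig()
    # DistilBERT uses its own argument names (dim, n_layers, n_heads) rather than
    # BERT's hidden_size / num_hidden_layers / num_attention_heads.
    custom_configuration = DistilBertConfig(dim=512, n_layers=4, n_heads=8)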
@@ -33,24 +33,42 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class GPT2Config(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT-2 small architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 50257):
Vocabulary size of the GPT-2 model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
n_positions (:obj:`int`, optional, defaults to 1024):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 1024):
Size of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
"""
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -75,26 +93,8 @@ class GPT2Config(PretrainedConfig):
summary_first_dropout=0.1,
**kwargs
):
"""Constructs GPT2Config.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The sttdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(GPT2Config, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.n_ctx = n_ctx
self.n_positions = n_positions
...
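A usage sketch for the defaults above, assuming :class:`~transformers.GPT2Config` and :class:`~transformers.GPT2Model` are importable as in this commit::

    from transformers import GPT2Config, GPT2Model

    # The documented defaults describe the GPT-2 small architecture.
    configuration = GPT2Config()
    model = GPT2Model(configuration)
    # A configuration closer to GPT-2 medium, overriding the documented defaults.
    medium_configuration = GPT2Config(n_embd=1024, n_layer=24, n_head=16)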
@@ -26,9 +26,13 @@ class MMBTConfig(object):
"""Configuration class to store the configuration of a `MMBT Model`.
Args:
config (:obj:`~transformers.PretrainedConfig`):
Config of the underlying Transformer models. Its values are
copied over to use a single config.
num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`):
Size of final Linear layer for classification.
modal_hidden_size (:obj:`int`, optional, defaults to 2048):
Embedding dimension of the non-text modality encoder.
"""
def __init__(self, config, num_labels=None, modal_hidden_size=2048):
...
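Because :class:`MMBTConfig` wraps an existing transformer configuration, a sketch of its intended use might look as follows (the top-level import of ``MMBTConfig`` and the choice of :class:`~transformers.BertConfig` as the text encoder are assumptions)::

    from transformers import BertConfig, MMBTConfig

    text_config = BertConfig()
    # Copy the text encoder's settings and attach the multimodal specifics.
    mmbt_config = MMBTConfig(text_config, num_labels=2, modal_hidden_size=2048)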
@@ -30,27 +30,45 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class OpenAIGPTConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of an :class:`~transformers.OpenAIGPTModel`.
It is used to instantiate a GPT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT architecture from OpenAI.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 40478):
Vocabulary size of the GPT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
n_positions (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 512):
Size of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether special tokens should be predicted when the model has a language modeling head.
"""
pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -77,9 +95,8 @@ class OpenAIGPTConfig(PretrainedConfig):
summary_first_dropout=0.1,
**kwargs
):
"""Constructs OpenAIGPTConfig.
"""
super(OpenAIGPTConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.n_ctx = n_ctx
self.n_positions = n_positions
...
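A usage sketch for the defaults above, assuming :class:`~transformers.OpenAIGPTConfig` is importable as in this commit::

    from transformers import OpenAIGPTConfig

    # The documented defaults describe the original GPT architecture from OpenAI.
    configuration = OpenAIGPTConfig()
    # Disable special-token prediction on the language modeling head, as documented above.
    configuration = OpenAIGPTConfig(predict_special_tokens=False)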
@@ -34,4 +34,17 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RobertaConfig(BertConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`.
It is used to instantiate a RoBERTa model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT bert-base-uncased architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
It reuses the same defaults. Please check the parent class for more information.
"""
pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
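Since :class:`~transformers.RobertaConfig` reuses the arguments of :class:`~transformers.BertConfig`, a usage sketch (the ``roberta-base`` shortcut name is an assumption about the available checkpoints)::

    from transformers import RobertaConfig

    # Same argument names as BertConfig; RoBERTa checkpoints use a larger vocabulary.
    configuration = RobertaConfig(vocab_size=50265, max_position_embeddings=514)
    # Or pull the hyper-parameters of a released checkpoint.
    configuration = RobertaConfig.from_pretrained("roberta-base")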
@@ -29,39 +29,74 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class TransfoXLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of an :class:`~transformers.TransfoXLModel`.
It is used to instantiate a Transformer-XL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the Transformer-XL architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 267735):
Vocabulary size of the Transformer-XL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
Cutoffs for the adaptive softmax.
d_model (:obj:`int`, optional, defaults to 1024):
Dimensionality of the model's hidden states.
d_embed (:obj:`int`, optional, defaults to 1024):
Dimensionality of the embeddings.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_head (:obj:`int`, optional, defaults to 64):
Dimensionality of the model's heads.
d_inner (:obj:`int`, optional, defaults to 4096):
Inner dimension in the feed-forward (FF) layers.
div_val (:obj:`int`, optional, defaults to 4):
Divisor value for the adaptive input and softmax.
pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to apply LayerNorm to the input instead of the output.
n_layer (:obj:`int`, optional, defaults to 18):
Number of hidden layers in the Transformer encoder.
tgt_len (:obj:`int`, optional, defaults to 128):
Number of tokens to predict.
ext_len (:obj:`int`, optional, defaults to 0):
Length of the extended context.
mem_len (:obj:`int`, optional, defaults to 1600):
Length of the retained previous hidden states.
clamp_len (:obj:`int`, optional, defaults to 1000):
Use the same positional embeddings after clamp_len.
same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use the same attention length for all tokens.
proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
True to share all but first projs, False not to share.
attn_type (:obj:`int`, optional, defaults to 0):
Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
sample_softmax (:obj:`int`, optional, defaults to -1):
Number of samples in the sampled softmax.
adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use adaptive softmax.
tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to tie the word embedding and softmax weights.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
dropatt (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to untie relative position biases.
init (:obj:`string`, optional, defaults to `normal`):
Parameter initializer to use.
init_range (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by U(-init_range, init_range).
proj_init_std (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by N(0, proj_init_std).
init_std (:obj:`float`, optional, defaults to 0.02):
Parameters initialized by N(0, init_std).
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers.
"""
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -98,9 +133,8 @@ class TransfoXLConfig(PretrainedConfig):
layer_norm_epsilon=1e-5,
**kwargs
):
"""Constructs TransfoXLConfig.
"""
super(TransfoXLConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.cutoffs = []
self.cutoffs.extend(cutoffs)
...
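A usage sketch for the defaults above, assuming :class:`~transformers.TransfoXLConfig` is importable as in this commit::

    from transformers import TransfoXLConfig

    # The documented defaults describe the released Transformer-XL architecture.
    configuration = TransfoXLConfig()
    # Shorter memory and prediction length for faster experimentation.
    small_memory_configuration = TransfoXLConfig(mem_len=256, tgt_len=64)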
@@ -37,44 +37,81 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLMConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the XLM model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism.
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use "gelu" instead of "relu" as the non-linear activation
function in the encoder and pooler.
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to only attend to the left-side context instead
of a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
TODO
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
start_n_top (:obj:`int`, optional, defaults to 5):
TODO
end_n_top (:obj:`int`, optional, defaults to 5):
TODO
mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
"""
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
...
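A usage sketch for the defaults above, assuming :class:`~transformers.XLMConfig` is importable as in this commit::

    from transformers import XLMConfig

    # The documented defaults describe the xlm-mlm-en-2048 architecture.
    configuration = XLMConfig()
    # A multilingual-style variant: several languages with language embeddings enabled.
    multilingual_configuration = XLMConfig(n_langs=15, use_lang_emb=True, causal=False)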
@@ -30,42 +30,60 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLNetConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
It is used to instantiate an XLNet model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 32000):
Vocabulary size of the XLNet model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
d_model (:obj:`int`, optional, defaults to 1024):
Size of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 24):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_inner (:obj:`int`, optional, defaults to 4096):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
ff_activation (:obj:`string`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to untie relative position biases.
attn_type (:obj:`string`, optional, defaults to "bi"):
The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens to cache. The key/value pairs that have already been pre-computed
in a previous forward pass won't be re-computed. See the
`quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
for more information.
reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens in the current batch to be cached and reused in the future.
bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use a bidirectional input pipeline. Usually set to `True` during
pretraining and `False` during finetuning.
clamp_len (:obj:`int`, optional, defaults to -1):
Clamp all relative distances larger than clamp_len.
Setting this attribute to -1 means no clamping.
same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use the same attention length for each token.
start_n_top (:obj:`int`, optional, defaults to 5):
TODO
end_n_top (:obj:`int`, optional, defaults to 5):
TODO
"""
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
...
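A usage sketch for the defaults above, assuming :class:`~transformers.XLNetConfig` and :class:`~transformers.XLNetModel` are importable as in this commit::

    from transformers import XLNetConfig, XLNetModel

    # The documented defaults describe the xlnet-large-cased architecture.
    configuration = XLNetConfig()
    # Cache 512 tokens of memory across forward passes, as described for mem_len above.
    configuration = XLNetConfig(mem_len=512, same_length=False)
    model = XLNetModel(configuration)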