Commit 715fa638 authored by Julien Chaumond

Merge branch 'master' into from_scratch_training

parents 764f836d 100e3b6f
ALBERT ALBERT
---------------------------------------------------- ----------------------------------------------------
``AlbrtConfig`` ``AlbertConfig``
~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertConfig .. autoclass:: transformers.AlbertConfig
......
...@@ -34,6 +34,13 @@ XLM ...@@ -34,6 +34,13 @@ XLM
:members: :members:
``XLMForQuestionAnsweringSimple``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMForQuestionAnsweringSimple
:members:
``XLMForQuestionAnswering`` ``XLMForQuestionAnswering``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
...@@ -36,6 +36,27 @@ XLNet ...@@ -36,6 +36,27 @@ XLNet
:members: :members:
``XLNetForTokenClassification``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForTokenClassification
:members:
``XLNetForMultipleChoice``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForMultipleChoice
:members:
``XLNetForQuestionAnsweringSimple``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForQuestionAnsweringSimple
:members:
``XLNetForQuestionAnswering`` ``XLNetForQuestionAnswering``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
...@@ -42,6 +42,7 @@ class LmSeqsDataset(Dataset): ...@@ -42,6 +42,7 @@ class LmSeqsDataset(Dataset):
self.check() self.check()
self.remove_long_sequences() self.remove_long_sequences()
self.remove_empty_sequences() self.remove_empty_sequences()
self.remove_unknown_sequences()
self.check() self.check()
self.print_statistics() self.print_statistics()
...@@ -109,6 +110,22 @@ class LmSeqsDataset(Dataset): ...@@ -109,6 +110,22 @@ class LmSeqsDataset(Dataset):
new_size = len(self) new_size = len(self)
logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.") logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
def remove_unknown_sequences(self):
"""
Remove sequences with a (too) high level of unknown tokens.
"""
if "unk_token" not in self.params.special_tok_ids:
return
else:
unk_token_id = self.params.special_tok_ids["unk_token"]
init_size = len(self)
unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
indices = (unk_occs / self.lengths) < 0.5
self.token_ids = self.token_ids[indices]
self.lengths = self.lengths[indices]
new_size = len(self)
logger.info(f"Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).")
def print_statistics(self): def print_statistics(self):
""" """
Print some statistics on the corpus. Only the master process. Print some statistics on the corpus. Only the master process.
......
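The new ``remove_unknown_sequences`` step above drops any sequence whose proportion of unknown tokens reaches 50%. A minimal standalone sketch of the same filter, assuming plain NumPy object arrays in place of the dataset's attributes (the variable names here are illustrative, not the distillation script's actual fields)::

    import numpy as np

    unk_token_id = 0  # assumed id of the unknown token
    token_ids = np.array(
        [np.array(s) for s in [[5, 0, 0, 7], [3, 4, 5, 6, 7], [0, 0, 0, 1]]],
        dtype=object,
    )
    lengths = np.array([len(s) for s in token_ids])

    # Count unk occurrences per sequence and keep those strictly below 50%.
    unk_occs = np.array([np.count_nonzero(s == unk_token_id) for s in token_ids])
    keep = (unk_occs / lengths) < 0.5
    token_ids, lengths = token_ids[keep], lengths[keep]

    print(f"Kept {keep.sum()} of {len(keep)} sequences")  # Kept 1 of 3 sequences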
{
"activation": "gelu",
"attention_dropout": 0.1,
"dim": 768,
"dropout": 0.1,
"hidden_dim": 3072,
"initializer_range": 0.02,
"max_position_embeddings": 512,
"n_heads": 12,
"n_layers": 6,
"sinusoidal_pos_embds": true,
"tie_weights_": true,
"vocab_size": 119547
}
\ No newline at end of file
{
"vocab_size": 50265,
"hidden_size": 768,
"num_hidden_layers": 6,
"num_attention_heads": 12,
"intermediate_size": 3072,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"attention_probs_dropout_prob": 0.1,
"max_position_embeddings": 514,
"type_vocab_size": 1,
"initializer_range": 0.02,
"layer_norm_eps": 0.00001
}
\ No newline at end of file
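The two JSON blobs above are model configuration files added by this commit (a DistilBERT-style and a RoBERTa-style config). A brief sketch of round-tripping such a file through the library's config classes; the local path is hypothetical, while ``from_json_file``, ``save_pretrained`` and ``from_pretrained`` are the generic ``PretrainedConfig`` helpers::

    from transformers import RobertaConfig

    # Load the RoBERTa-style JSON shown above from a hypothetical local path.
    config = RobertaConfig.from_json_file("./my_model_directory/config.json")
    print(config.num_hidden_layers)  # 6

    # Saving writes config.json back out, so the directory works with from_pretrained.
    config.save_pretrained("./my_model_directory")
    reloaded = RobertaConfig.from_pretrained("./my_model_directory")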
...@@ -344,6 +344,7 @@ def full_text_generation( ...@@ -344,6 +344,7 @@ def full_text_generation(
gamma=1.5, gamma=1.5,
gm_scale=0.9, gm_scale=0.9,
kl_scale=0.01, kl_scale=0.01,
repetition_penalty=1.0,
**kwargs **kwargs
): ):
classifier, class_id = get_classifier(discrim, class_label, device) classifier, class_id = get_classifier(discrim, class_label, device)
...@@ -368,7 +369,14 @@ def full_text_generation( ...@@ -368,7 +369,14 @@ def full_text_generation(
raise Exception("Specify either a bag of words or a discriminator") raise Exception("Specify either a bag of words or a discriminator")
unpert_gen_tok_text, _, _ = generate_text_pplm( unpert_gen_tok_text, _, _ = generate_text_pplm(
model=model, tokenizer=tokenizer, context=context, device=device, length=length, sample=sample, perturb=False model=model,
tokenizer=tokenizer,
context=context,
device=device,
length=length,
sample=sample,
perturb=False,
repetition_penalty=repetition_penalty,
) )
if device == "cuda": if device == "cuda":
torch.cuda.empty_cache() torch.cuda.empty_cache()
...@@ -401,6 +409,7 @@ def full_text_generation( ...@@ -401,6 +409,7 @@ def full_text_generation(
gamma=gamma, gamma=gamma,
gm_scale=gm_scale, gm_scale=gm_scale,
kl_scale=kl_scale, kl_scale=kl_scale,
repetition_penalty=repetition_penalty,
) )
pert_gen_tok_texts.append(pert_gen_tok_text) pert_gen_tok_texts.append(pert_gen_tok_text)
if classifier is not None: if classifier is not None:
...@@ -437,6 +446,7 @@ def generate_text_pplm( ...@@ -437,6 +446,7 @@ def generate_text_pplm(
gamma=1.5, gamma=1.5,
gm_scale=0.9, gm_scale=0.9,
kl_scale=0.01, kl_scale=0.01,
repetition_penalty=1.0,
): ):
output_so_far = None output_so_far = None
if context: if context:
...@@ -508,6 +518,13 @@ def generate_text_pplm( ...@@ -508,6 +518,13 @@ def generate_text_pplm(
pert_logits, past, pert_all_hidden = model(last, past=pert_past) pert_logits, past, pert_all_hidden = model(last, past=pert_past)
pert_logits = pert_logits[:, -1, :] / temperature # + SMALL_CONST pert_logits = pert_logits[:, -1, :] / temperature # + SMALL_CONST
for token_idx in set(output_so_far[0].tolist()):
if pert_logits[0, token_idx] < 0:
pert_logits[0, token_idx] *= repetition_penalty
else:
pert_logits[0, token_idx] /= repetition_penalty
pert_probs = F.softmax(pert_logits, dim=-1) pert_probs = F.softmax(pert_logits, dim=-1)
if classifier is not None: if classifier is not None:
...@@ -588,6 +605,7 @@ def run_pplm_example( ...@@ -588,6 +605,7 @@ def run_pplm_example(
seed=0, seed=0,
no_cuda=False, no_cuda=False,
colorama=False, colorama=False,
repetition_penalty=1.0,
): ):
# set Random seed # set Random seed
torch.manual_seed(seed) torch.manual_seed(seed)
...@@ -655,6 +673,7 @@ def run_pplm_example( ...@@ -655,6 +673,7 @@ def run_pplm_example(
gamma=gamma, gamma=gamma,
gm_scale=gm_scale, gm_scale=gm_scale,
kl_scale=kl_scale, kl_scale=kl_scale,
repetition_penalty=repetition_penalty,
) )
# untokenize unperturbed text # untokenize unperturbed text
...@@ -767,6 +786,9 @@ if __name__ == "__main__": ...@@ -767,6 +786,9 @@ if __name__ == "__main__":
parser.add_argument("--seed", type=int, default=0) parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--no_cuda", action="store_true", help="no cuda") parser.add_argument("--no_cuda", action="store_true", help="no cuda")
parser.add_argument("--colorama", action="store_true", help="colors keywords") parser.add_argument("--colorama", action="store_true", help="colors keywords")
parser.add_argument(
"--repetition_penalty", type=float, default=1.0, help="Penalize repetition. More than 1.0 -> less repetition",
)
args = parser.parse_args() args = parser.parse_args()
run_pplm_example(**vars(args)) run_pplm_example(**vars(args))
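The ``repetition_penalty`` argument threaded through ``run_pplm.py`` above applies the CTRL-style penalty: logits of tokens already generated are divided by the penalty when positive and multiplied by it when negative, so values above 1.0 discourage repetition. A minimal sketch of that rescaling on a single logits row (the tensor values are made up)::

    import torch
    import torch.nn.functional as F

    repetition_penalty = 1.3
    logits = torch.tensor([[2.0, -1.0, 0.5, 3.0]])  # scores over a toy 4-token vocabulary
    generated = [0, 1]                              # token ids produced so far

    # Shrink positive scores and push negative scores further down for seen tokens.
    for token_idx in set(generated):
        if logits[0, token_idx] < 0:
            logits[0, token_idx] *= repetition_penalty
        else:
            logits[0, token_idx] /= repetition_penalty

    probs = F.softmax(logits, dim=-1)  # tokens 0 and 1 are now less likely to repeat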
...@@ -31,9 +31,73 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -31,9 +31,73 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class AlbertConfig(PretrainedConfig): class AlbertConfig(PretrainedConfig):
"""Configuration for `AlbertModel`. r"""
This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`.
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
The default settings match the configuration of model `albert_xxlarge`. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30000):
Vocabulary size of the ALBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of vocabulary embeddings.
hidden_size (:obj:`int`, optional, defaults to 4096):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_hidden_groups (:obj:`int`, optional, defaults to 1):
Number of groups for the hidden layers, parameters in the same group are shared.
num_attention_heads (:obj:`int`, optional, defaults to 64):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 16384):
The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
inner_group_num (:obj:`int`, optional, defaults to 1):
The number of inner repetitions of attention and ffn.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something
large (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example::
# Initializing an ALBERT-xxlarge style configuration
albert_xxlarge_configuration = AlbertConfig()
# Initializing an ALBERT-base style configuration
albert_base_configuration = AlbertConfig(
hidden_size=768,
num_attention_heads=12,
intermediate_size=3072,
)
# Initializing a model from the ALBERT-base style configuration
model = AlbertModel(albert_base_configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
...@@ -58,35 +122,6 @@ class AlbertConfig(PretrainedConfig): ...@@ -58,35 +122,6 @@ class AlbertConfig(PretrainedConfig):
layer_norm_eps=1e-12, layer_norm_eps=1e-12,
**kwargs **kwargs
): ):
"""Constructs AlbertConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
embedding_size: size of voc embeddings.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_hidden_groups: Number of group for the hidden layers, parameters in
the same group are shared.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
inner_group_num: int, number of inner repetition of attention and ffn.
down_scale_factor: float, the scale to apply
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`AlbertModel`.
initializer_range: The stdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(AlbertConfig, self).__init__(**kwargs) super(AlbertConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
......
...@@ -77,32 +77,15 @@ CONFIG_MAPPING = OrderedDict( ...@@ -77,32 +77,15 @@ CONFIG_MAPPING = OrderedDict(
) )
class AutoConfig: class AutoConfig(object):
r""":class:`~transformers.AutoConfig` is a generic configuration class r"""
:class:`~transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library that will be instantiated as one of the configuration classes of the library
when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
class method.
The `from_pretrained()` method take care of returning the correct model class instance The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
based on the `model_type` property of the config object, or when it's missing, based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string. falling back to using pattern matching on the `pretrained_model_name_or_path` string.
When using string matching, the configuration class is matched on
the `pretrained_model_name_or_path` string in the following order:
- contains `t5`: T5Config (T5 model)
- contains `distilbert`: DistilBertConfig (DistilBERT model)
- contains `albert`: AlbertConfig (ALBERT model)
- contains `camembert`: CamembertConfig (CamemBERT model)
- contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
- contains `roberta`: RobertaConfig (RoBERTa model)
- contains `bert`: BertConfig (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model)
- contains `xlm`: XLMConfig (XLM model)
- contains `ctrl` : CTRLConfig (CTRL model)
This class cannot be instantiated using `__init__()` (throw an error).
""" """
def __init__(self): def __init__(self):
...@@ -124,57 +107,58 @@ class AutoConfig: ...@@ -124,57 +107,58 @@ class AutoConfig:
@classmethod @classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r""" Instantiate one of the configuration classes of the library r""" Instantiates one of the configuration classes of the library
from a pre-trained model configuration. from a pre-trained model configuration.
The configuration class to instantiate is selected The configuration class to instantiate is selected
based on the `model_type` property of the config object, or when it's missing, based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string. falling back to using pattern matching on the `pretrained_model_name_or_path` string.
- contains `t5`: T5Config (T5 model) - contains `t5`: :class:`~transformers.T5Config` (T5 model)
- contains `distilbert`: DistilBertConfig (DistilBERT model) - contains `distilbert`: :class:`~transformers.DistilBertConfig` (DistilBERT model)
- contains `albert`: AlbertConfig (ALBERT model) - contains `albert`: :class:`~transformers.AlbertConfig` (ALBERT model)
- contains `camembert`: CamembertConfig (CamemBERT model) - contains `camembert`: :class:`~transformers.CamembertConfig` (CamemBERT model)
- contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model) - contains `xlm-roberta`: :class:`~transformers.XLMRobertaConfig` (XLM-RoBERTa model)
- contains `roberta`: RobertaConfig (RoBERTa model) - contains `roberta`: :class:`~transformers.RobertaConfig` (RoBERTa model)
- contains `bert`: BertConfig (Bert model) - contains `bert`: :class:`~transformers.BertConfig` (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - contains `openai-gpt`: :class:`~transformers.OpenAIGPTConfig` (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model) - contains `gpt2`: :class:`~transformers.GPT2Config` (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) - contains `transfo-xl`: :class:`~transformers.TransfoXLConfig` (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model) - contains `xlnet`: :class:`~transformers.XLNetConfig` (XLNet model)
- contains `xlm`: XLMConfig (XLM model) - contains `xlm`: :class:`~transformers.XLMConfig` (XLM model)
- contains `ctrl` : CTRLConfig (CTRL model) - contains `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model)
Params:
pretrained_model_name_or_path: either:
Args:
pretrained_model_name_or_path (:obj:`string`):
Is either: \
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
cache_dir: (`optional`) string: cache_dir (:obj:`string`, optional, defaults to `None`):
Path to a directory in which a downloaded pre-trained model Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used. configuration should be cached if the standard cache should not be used.
kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. force_download (:obj:`boolean`, optional, defaults to `False`):
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
force_download: (`optional`) boolean, default False:
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
resume_download: (`optional`) boolean, default False: resume_download (:obj:`boolean`, optional, defaults to `False`):
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. Do not delete incompletely received file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None: proxies (:obj:`Dict[str, str]`, optional, defaults to `None`):
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`.
The proxies are used on each request. The proxies are used on each request. See `the requests documentation <https://requests.readthedocs.io/en/master/user/advanced/#proxies>`__ for usage.
return_unused_kwargs: (`optional`) bool:
return_unused_kwargs (:obj:`boolean`, optional, defaults to `False`):
- If False, then this function returns just the final configuration object. - If False, then this function returns just the final configuration object.
- If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored. - If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): key/value pairs with which to update the configuration object after loading.
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
Examples:: Examples::
config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
......
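The ``return_unused_kwargs`` behaviour documented above can be shown in a short snippet; a sketch assuming the ``bert-base-uncased`` configuration can be downloaded::

    from transformers import AutoConfig

    # Keys that are configuration attributes override the loaded values;
    # anything else is handed back separately when return_unused_kwargs=True.
    config, unused_kwargs = AutoConfig.from_pretrained(
        "bert-base-uncased",
        output_attentions=True,
        foo=False,
        return_unused_kwargs=True,
    )
    print(config.output_attentions)  # True
    print(unused_kwargs)             # {'foo': False}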
...@@ -50,32 +50,61 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -50,32 +50,61 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BertConfig(PretrainedConfig): class BertConfig(PretrainedConfig):
r""" r"""
:class:`~transformers.BertConfig` is the configuration class to store the configuration of a This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
`BertModel`. It is used to instantiate a BERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Arguments:
vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
hidden_size: Size of the encoder layers and the pooler layer. to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
num_hidden_layers: Number of hidden layers in the Transformer encoder. for more information.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward) Args:
layer in the Transformer encoder. vocab_size (:obj:`int`, optional, defaults to 30522):
hidden_act: The non-linear activation function (function or string) in the Vocabulary size of the BERT model. Defines the different tokens that
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
hidden_dropout_prob: The dropout probabilitiy for all fully connected hidden_size (:obj:`int`, optional, defaults to 768):
layers in the embeddings, encoder, and pooler. Dimensionality of the encoder layers and the pooler layer.
attention_probs_dropout_prob: The dropout ratio for the attention num_hidden_layers (:obj:`int`, optional, defaults to 12):
probabilities. Number of hidden layers in the Transformer encoder.
max_position_embeddings: The maximum sequence length that this model might num_attention_heads (:obj:`int`, optional, defaults to 12):
ever be used with. Typically set this to something large just in case Number of attention heads for each attention layer in the Transformer encoder.
(e.g., 512 or 1024 or 2048). intermediate_size (:obj:`int`, optional, defaults to 3072):
type_vocab_size: The vocabulary size of the `token_type_ids` passed into Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
`BertModel`. hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
initializer_range: The sttdev of the truncated_normal_initializer for The non-linear activation function (function or string) in the encoder and pooler.
initializing all weight matrices. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
layer_norm_eps: The epsilon used by LayerNorm. hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example::
from transformers import BertModel, BertConfig
# Initializing a BERT bert-base-uncased style configuration
configuration = BertConfig()
# Initializing a model from the bert-base-uncased style configuration
model = BertModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "bert" model_type = "bert"
...@@ -97,6 +126,7 @@ class BertConfig(PretrainedConfig): ...@@ -97,6 +126,7 @@ class BertConfig(PretrainedConfig):
**kwargs **kwargs
): ):
super(BertConfig, self).__init__(**kwargs) super(BertConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers self.num_hidden_layers = num_hidden_layers
......
...@@ -29,5 +29,35 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -29,5 +29,35 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class CamembertConfig(RobertaConfig): class CamembertConfig(RobertaConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.CamembertModel`.
It is used to instantiate a CamemBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
The :class:`~transformers.CamembertConfig` class directly inherits :class:`~transformers.RobertaConfig`.
It reuses the same defaults. Please check the parent class for more information.
Example::
from transformers import CamembertModel, CamembertConfig
# Initializing a CamemBERT configuration
configuration = CamembertConfig()
# Initializing a model from the configuration
model = CamembertModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "camembert" model_type = "camembert"
...@@ -26,25 +26,60 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf ...@@ -26,25 +26,60 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf
class CTRLConfig(PretrainedConfig): class CTRLConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `CTRLModel`. """
This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
It is used to instantiate a CTRL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args: Args:
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. vocab_size (:obj:`int`, optional, defaults to 246534):
n_positions: Number of positional embeddings. Vocabulary size of the CTRL model. Defines the different tokens that
n_ctx: Size of the causal mask (usually same as n_positions). can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
dff: Size of the inner dimension of the FFN. n_positions (:obj:`int`, optional, defaults to 256):
n_embd: Dimensionality of the embeddings and hidden states. The maximum sequence length that this model might ever be used with.
n_layer: Number of hidden layers in the Transformer encoder. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_head: Number of attention heads for each attention layer in n_ctx (:obj:`int`, optional, defaults to 256):
the Transformer encoder. Dimensionality of the causal mask (usually same as n_positions).
layer_norm_epsilon: epsilon to use in the layer norm layers n_embd (:obj:`int`, optional, defaults to 1280):
resid_pdrop: The dropout probabilitiy for all fully connected Dimensionality of the embeddings and hidden states.
layers in the embeddings, encoder, and pooler. dff (:obj:`int`, optional, defaults to 8192):
attn_pdrop: The dropout ratio for the attention Dimensionality of the inner dimension of the FFN.
probabilities. n_layer (:obj:`int`, optional, defaults to 48):
embd_pdrop: The dropout ratio for the embeddings. Number of hidden layers in the Transformer encoder.
initializer_range: The sttdev of the truncated_normal_initializer for n_head (:obj:`int`, optional, defaults to 16):
initializing all weight matrices. Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Example::
from transformers import CTRLModel, CTRLConfig
# Initializing a CTRL configuration
configuration = CTRLConfig()
# Initializing a model from the configuration
model = CTRLModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
...@@ -71,26 +106,6 @@ class CTRLConfig(PretrainedConfig): ...@@ -71,26 +106,6 @@ class CTRLConfig(PretrainedConfig):
summary_first_dropout=0.1, summary_first_dropout=0.1,
**kwargs **kwargs
): ):
"""Constructs CTRLConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
dff: Size of the inner dimension of the FFN.
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The sttdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(CTRLConfig, self).__init__(**kwargs) super(CTRLConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.n_ctx = n_ctx self.n_ctx = n_ctx
......
...@@ -31,6 +31,67 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -31,6 +31,67 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DistilBertConfig(PretrainedConfig): class DistilBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the DistilBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.DistilBertModel`.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings.
n_layers (:obj:`int`, optional, defaults to 6):
Number of hidden layers in the Transformer encoder.
n_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
dim (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (:obj:`int`, optional, defaults to 3072):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
qa_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probabilities used in the question answering model
:class:`~transformers.DistilBertForQuestionAnswering`.
seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
The dropout probabilities used in the sequence classification model
:class:`~transformers.DistilBertForSequenceClassification`.
Example::
from transformers import DistilBertModel, DistilBertConfig
# Initializing a DistilBERT configuration
configuration = DistilBertConfig()
# Initializing a model from the configuration
model = DistilBertModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "distilbert" model_type = "distilbert"
...@@ -47,7 +108,6 @@ class DistilBertConfig(PretrainedConfig): ...@@ -47,7 +108,6 @@ class DistilBertConfig(PretrainedConfig):
attention_dropout=0.1, attention_dropout=0.1,
activation="gelu", activation="gelu",
initializer_range=0.02, initializer_range=0.02,
tie_weights_=True,
qa_dropout=0.1, qa_dropout=0.1,
seq_classif_dropout=0.2, seq_classif_dropout=0.2,
**kwargs **kwargs
...@@ -64,7 +124,6 @@ class DistilBertConfig(PretrainedConfig): ...@@ -64,7 +124,6 @@ class DistilBertConfig(PretrainedConfig):
self.attention_dropout = attention_dropout self.attention_dropout = attention_dropout
self.activation = activation self.activation = activation
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.tie_weights_ = tie_weights_
self.qa_dropout = qa_dropout self.qa_dropout = qa_dropout
self.seq_classif_dropout = seq_classif_dropout self.seq_classif_dropout = seq_classif_dropout
......
...@@ -33,24 +33,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -33,24 +33,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class GPT2Config(PretrainedConfig): class GPT2Config(PretrainedConfig):
"""Configuration class to store the configuration of a `GPT2Model`. """
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args: Args:
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. vocab_size (:obj:`int`, optional, defaults to 50257):
n_positions: Number of positional embeddings. Vocabulary size of the GPT-2 model. Defines the different tokens that
n_ctx: Size of the causal mask (usually same as n_positions). can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
n_embd: Dimensionality of the embeddings and hidden states. n_positions (:obj:`int`, optional, defaults to 1024):
n_layer: Number of hidden layers in the Transformer encoder. The maximum sequence length that this model might ever be used with.
n_head: Number of attention heads for each attention layer in Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
the Transformer encoder. n_ctx (:obj:`int`, optional, defaults to 1024):
layer_norm_epsilon: epsilon to use in the layer norm layers Dimensionality of the causal mask (usually same as n_positions).
resid_pdrop: The dropout probabilitiy for all fully connected n_embd (:obj:`int`, optional, defaults to 768):
layers in the embeddings, encoder, and pooler. Dimensionality of the embeddings and hidden states.
attn_pdrop: The dropout ratio for the attention n_layer (:obj:`int`, optional, defaults to 12):
probabilities. Number of hidden layers in the Transformer encoder.
embd_pdrop: The dropout ratio for the embeddings. n_head (:obj:`int`, optional, defaults to 12):
initializer_range: The sttdev of the truncated_normal_initializer for Number of attention heads for each attention layer in the Transformer encoder.
initializing all weight matrices. resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
from transformers import GPT2Model, GPT2Config
# Initializing a GPT2 configuration
configuration = GPT2Config()
# Initializing a model from the configuration
model = GPT2Model(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
...@@ -76,26 +136,8 @@ class GPT2Config(PretrainedConfig): ...@@ -76,26 +136,8 @@ class GPT2Config(PretrainedConfig):
summary_first_dropout=0.1, summary_first_dropout=0.1,
**kwargs **kwargs
): ):
"""Constructs GPT2Config.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The sttdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(GPT2Config, self).__init__(**kwargs) super(GPT2Config, self).__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.n_ctx = n_ctx self.n_ctx = n_ctx
self.n_positions = n_positions self.n_positions = n_positions
......
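The ``summary_*`` arguments documented in the new ``GPT2Config`` docstring above configure the sequence-summary (multiple-choice) head used by ``GPT2DoubleHeadsModel``. A brief sketch of setting them explicitly; the values shown are simply the documented defaults::

    from transformers import GPT2Config

    config = GPT2Config(
        summary_type="cls_index",     # summarize at the supplied classification token position
        summary_use_proj=True,        # add a projection after extracting the summary vector
        summary_activation=None,      # no activation on the projected summary
        summary_proj_to_labels=True,  # project to num_labels rather than hidden_size
        summary_first_dropout=0.1,    # dropout applied before the projection
    )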
...@@ -26,9 +26,13 @@ class MMBTConfig(object): ...@@ -26,9 +26,13 @@ class MMBTConfig(object):
"""Configuration class to store the configuration of a `MMBT Model`. """Configuration class to store the configuration of a `MMBT Model`.
Args: Args:
config: config of the underlying Transformer models. It's values are copied over to use a single config. config (:obj:`~transformers.PretrainedConfig`):
num_labels: Size of final Linear layer for classification. Config of the underlying Transformer models. Its values are
modal_hidden_size: Embedding dimension of the non-text modality encoder. copied over to use a single config.
num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`):
Size of final Linear layer for classification.
modal_hidden_size (:obj:`int`, optional, defaults to 2048):
Embedding dimension of the non-text modality encoder.
""" """
def __init__(self, config, num_labels=None, modal_hidden_size=2048): def __init__(self, config, num_labels=None, modal_hidden_size=2048):
......
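Unlike the other configs touched in this commit, ``MMBTConfig`` wraps an existing transformer config rather than defining its own hyper-parameters. A short sketch of that pattern, assuming a BERT text backbone and that ``MMBTConfig`` is importable from the package root like the other configuration classes::

    from transformers import BertConfig, MMBTConfig

    # Reuse the text encoder's config and add the multimodal-specific fields.
    transformer_config = BertConfig.from_pretrained("bert-base-uncased")
    mmbt_config = MMBTConfig(transformer_config, num_labels=2, modal_hidden_size=2048)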
...@@ -30,27 +30,87 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -30,27 +30,87 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class OpenAIGPTConfig(PretrainedConfig): class OpenAIGPTConfig(PretrainedConfig):
""" """
Configuration class to store the configuration of a `OpenAIGPTModel`. This is the configuration class to store the configuration of an :class:`~transformers.OpenAIGPTModel`.
It is used to instantiate a GPT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args: Args:
vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. vocab_size (:obj:`int`, optional, defaults to 40478):
n_positions: Number of positional embeddings. Vocabulary size of the GPT model. Defines the different tokens that
n_ctx: Size of the causal mask (usually same as n_positions). can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
n_embd: Dimensionality of the embeddings and hidden states. n_positions (:obj:`int`, optional, defaults to 512):
n_layer: Number of hidden layers in the Transformer encoder. The maximum sequence length that this model might ever be used with.
n_head: Number of attention heads for each attention layer in Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
the Transformer encoder. n_ctx (:obj:`int`, optional, defaults to 512):
afn: The non-linear activation function (function or string) in the Dimensionality of the causal mask (usually same as n_positions).
encoder and pooler. If string, "gelu", "relu" and "swish" are supported. n_embd (:obj:`int`, optional, defaults to 768):
resid_pdrop: The dropout probabilitiy for all fully connected Dimensionality of the embeddings and hidden states.
layers in the embeddings, encoder, and pooler. n_layer (:obj:`int`, optional, defaults to 12):
attn_pdrop: The dropout ratio for the attention Number of hidden layers in the Transformer encoder.
probabilities. n_head (:obj:`int`, optional, defaults to 12):
embd_pdrop: The dropout ratio for the embeddings. Number of attention heads for each attention layer in the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
initializer_range: The sttdev of the truncated_normal_initializer for The non-linear activation function (function or string) in the encoder and pooler.
initializing all weight matrices. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
predict_special_tokens: should we predict special tokens (when the model has a LM head) resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether special tokens should be predicted when the model has a language modeling head.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
from transformers import OpenAIGPTConfig, OpenAIGPTModel
# Initializing a GPT configuration
configuration = OpenAIGPTConfig()
# Initializing a model from the configuration
model = OpenAIGPTModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
...@@ -78,9 +138,8 @@ class OpenAIGPTConfig(PretrainedConfig): ...@@ -78,9 +138,8 @@ class OpenAIGPTConfig(PretrainedConfig):
summary_first_dropout=0.1, summary_first_dropout=0.1,
**kwargs **kwargs
): ):
"""Constructs OpenAIGPTConfig.
"""
super(OpenAIGPTConfig, self).__init__(**kwargs) super(OpenAIGPTConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.n_ctx = n_ctx self.n_ctx = n_ctx
self.n_positions = n_positions self.n_positions = n_positions
......
...@@ -34,5 +34,35 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -34,5 +34,35 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RobertaConfig(BertConfig): class RobertaConfig(BertConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`.
It is used to instantiate a RoBERTa model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
It reuses the same defaults. Please check the parent class for more information.
Example::
from transformers import RobertaConfig, RobertaModel
# Initializing a RoBERTa configuration
configuration = RobertaConfig()
# Initializing a model from the configuration
model = RobertaModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "roberta" model_type = "roberta"
...@@ -29,39 +29,91 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -29,39 +29,91 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class TransfoXLConfig(PretrainedConfig): class TransfoXLConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `TransfoXLModel`. """
This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel`.
It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args: Args:
vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. vocab_size (:obj:`int`, optional, defaults to 267735):
cutoffs: cutoffs for the adaptive softmax Vocabulary size of the Transformer XL model. Defines the different tokens that
d_model: Dimensionality of the model's hidden states. can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
d_embed: Dimensionality of the embeddings cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
d_head: Dimensionality of the model's heads. Cutoffs for the adaptive softmax
div_val: divident value for adapative input and softmax d_model (:obj:`int`, optional, defaults to 1024):
pre_lnorm: apply LayerNorm to the input instead of the output Dimensionality of the model's hidden states.
d_inner: Inner dimension in FF d_embed (:obj:`int`, optional, defaults to 1024):
n_layer: Number of hidden layers in the Transformer encoder. Dimensionality of the embeddings
n_head: Number of attention heads for each attention layer in n_head (:obj:`int`, optional, defaults to 16):
the Transformer encoder. Number of attention heads for each attention layer in the Transformer encoder.
tgt_len: number of tokens to predict d_head (:obj:`int`, optional, defaults to 64):
ext_len: length of the extended context Dimensionality of the model's heads.
mem_len: length of the retained previous heads d_inner (:obj:`int`, optional, defaults to 4096):
same_length: use the same attn length for all tokens Inner dimension in FF
proj_share_all_but_first: True to share all but first projs, False not to share. div_val (:obj:`int`, optional, defaults to 4):
attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. Dividend value for adaptive input and softmax
clamp_len: use the same pos embeddings after clamp_len pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
sample_softmax: number of samples in sampled softmax Apply LayerNorm to the input instead of the output
adaptive: use adaptive softmax n_layer (:obj:`int`, optional, defaults to 18):
tie_weight: tie the word embedding and softmax weights Number of hidden layers in the Transformer encoder.
dropout: The dropout probabilitiy for all fully connected tgt_len (:obj:`int`, optional, defaults to 128):
layers in the embeddings, encoder, and pooler. Number of tokens to predict
dropatt: The dropout ratio for the attention probabilities. ext_len (:obj:`int`, optional, defaults to 0):
untie_r: untie relative position biases Length of the extended context
embd_pdrop: The dropout ratio for the embeddings. mem_len (:obj:`int`, optional, defaults to 1600):
init: parameter initializer to use Length of the retained previous heads
init_range: parameters initialized by U(-init_range, init_range). clamp_len (:obj:`int`, optional, defaults to 1000):
proj_init_std: parameters initialized by N(0, init_std) use the same pos embeddings after clamp_len
init_std: parameters initialized by N(0, init_std) same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
Use the same attn length for all tokens
proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
True to share all but first projs, False not to share.
attn_type (:obj:`int`, optional, defaults to 0):
Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
sample_softmax (:obj:`int`, optional, defaults to -1):
number of samples in sampled softmax
adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
use adaptive softmax
tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`):
tie the word embedding and softmax weights
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
dropatt (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Untie relative position biases
init (:obj:`string`, optional, defaults to `normal`):
Parameter initializer to use
init_range (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by U(-init_range, init_range).
proj_init_std (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by N(0, init_std)
init_std (:obj:`float`, optional, defaults to 0.02):
Parameters initialized by N(0, init_std)
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
Example::
from transformers import TransfoXLConfig, TransfoXLModel
# Initializing a Transformer XL configuration
configuration = TransfoXLConfig()
# Initializing a model from the configuration
model = TransfoXLModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
    pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP

...@@ -99,9 +151,8 @@ class TransfoXLConfig(PretrainedConfig):

        layer_norm_epsilon=1e-5,
        **kwargs
    ):
        super(TransfoXLConfig, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.cutoffs = []
        self.cutoffs.extend(cutoffs)
...
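To make the argument list above concrete, here is an illustrative configuration that is not part of the diff; it only uses keyword arguments documented in the docstring, and the values are arbitrary:

    # Illustrative sketch, not part of this commit: a smaller Transformer XL
    # configuration built from the documented arguments (values are arbitrary).
    from transformers import TransfoXLConfig, TransfoXLModel

    configuration = TransfoXLConfig(
        cutoffs=[20000, 40000, 200000],  # adaptive softmax cutoffs
        d_model=512,
        d_embed=512,
        n_layer=6,
        n_head=8,
        d_head=64,
        mem_len=600,                     # tokens retained as memory between segments
    )
    model = TransfoXLModel(configuration)
    assert model.config.mem_len == 600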
...@@ -37,44 +37,124 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class XLMConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `XLMModel`. """
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args: Args:
vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`. vocab_size (:obj:`int`, optional, defaults to 30145):
d_model: Size of the encoder layers and the pooler layer. Vocabulary size of the XLM model. Defines the different tokens that
n_layer: Number of hidden layers in the Transformer encoder. can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
n_head: Number of attention heads for each attention layer in emb_dim (:obj:`int`, optional, defaults to 2048):
the Transformer encoder. Dimensionality of the encoder layers and the pooler layer.
d_inner: The size of the "intermediate" (i.e., feed-forward) n_layer (:obj:`int`, optional, defaults to 12):
layer in the Transformer encoder. Number of hidden layers in the Transformer encoder.
ff_activation: The non-linear activation function (function or string) in the n_head (:obj:`int`, optional, defaults to 16):
encoder and pooler. If string, "gelu", "relu" and "swish" are supported. Number of attention heads for each attention layer in the Transformer encoder.
untie_r: untie relative position biases dropout (:obj:`float`, optional, defaults to 0.1):
attn_type: 'bi' for XLM, 'uni' for Transformer-XL The dropout probability for all fully connected
dropout: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler. layers in the embeddings, encoder, and pooler.
max_position_embeddings: The maximum sequence length that this model might attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
The non-linear activation function (function or string) in the
encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to only attend to the left-side context instead
if a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`)
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048). (e.g., 512 or 1024 or 2048).
initializer_range: The sttdev of the truncated_normal_initializer for embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
initializing all weight matrices. The standard deviation of the truncated_normal_initializer for
layer_norm_eps: The epsilon used by LayerNorm. initializing the embedding matrices.
init_std (:obj:`int`, optional, defaults to 50257):
dropout: float, dropout rate. The standard deviation of the truncated_normal_initializer for
init: str, the initialization scheme, either "normal" or "uniform". initializing all weight matrices except the embedding matrices.
init_range: float, initialize the parameters with a uniform distribution layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
in [-init_range, init_range]. Only effective when init="uniform". The epsilon used by the layer normalization layers.
init_std: float, initialize the parameters with a normal distribution bos_index (:obj:`int`, optional, defaults to 0):
with mean 0 and stddev init_std. Only effective when init="normal". The index of the beginning of sentence token in the vocabulary.
mem_len: int, the number of tokens to cache. eos_index (:obj:`int`, optional, defaults to 1):
reuse_len: int, the number of tokens in the currect batch to be cached The index of the end of sentence token in the vocabulary.
and reused in the future. pad_index (:obj:`int`, optional, defaults to 2):
bi_data: bool, whether to use bidirectional input pipeline. The index of the padding token in the vocabulary.
Usually set to True during pretraining and False during finetuning. unk_index (:obj:`int`, optional, defaults to 3):
clamp_len: int, clamp all relative distances larger than clamp_len. The index of the unknown token in the vocabulary.
-1 means no clamping. mask_index (:obj:`int`, optional, defaults to 5):
same_length: bool, whether to use the same attention length for each token. The index of the masking token in the vocabulary.
is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
Example::
from transformers import XLMConfig, XLMModel
# Initializing a XLM configuration
configuration = XLMConfig()
# Initializing a model from the configuration
model = XLMModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
    pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
...
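As an illustration of how the language-related and sequence-summary arguments documented above fit together, the following sketch (not part of this commit, values arbitrary) builds a causal, bilingual configuration:

    # Illustrative sketch, not part of this commit: a causal, bilingual XLM
    # configuration using only arguments documented above.
    from transformers import XLMConfig, XLMModel

    configuration = XLMConfig(
        emb_dim=1024,
        causal=True,           # triangular attention mask, left-side context only
        n_langs=2,             # two languages handled by the model
        use_lang_emb=True,     # add language embeddings on top of token embeddings
        summary_type="first",  # sequence summary takes the first token hidden state
    )
    model = XLMModel(configuration)
    assert model.config.n_langs == 2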
...@@ -30,42 +30,102 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class XLNetConfig(PretrainedConfig):
"""Configuration class to store the configuration of a ``XLNetModel``. """
This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
It is used to instantiate an XLNet model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args: Args:
vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. vocab_size (:obj:`int`, optional, defaults to 32000):
d_model: Size of the encoder layers and the pooler layer. Vocabulary size of the XLNet model. Defines the different tokens that
n_layer: Number of hidden layers in the Transformer encoder. can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
n_head: Number of attention heads for each attention layer in d_model (:obj:`int`, optional, defaults to 1024):
the Transformer encoder. Dimensionality of the encoder layers and the pooler layer.
d_inner: The size of the "intermediate" (i.e., feed-forward) n_layer (:obj:`int`, optional, defaults to 24):
layer in the Transformer encoder. Number of hidden layers in the Transformer encoder.
ff_activation: The non-linear activation function (function or string) in the n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_inner (:obj:`int`, optional, defaults to 4096):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
ff_activation (:obj:`string`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported. encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
untie_r: untie relative position biases untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
attn_type: 'bi' for XLNet, 'uni' for Transformer-XL Untie relative position biases
attn_type (:obj:`string`, optional, defaults to "bi"):
dropout: The dropout probabilitiy for all fully connected The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
layers in the embeddings, encoder, and pooler. initializer_range (:obj:`float`, optional, defaults to 0.02):
initializer_range: The sttdev of the truncated_normal_initializer for The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializing all weight matrices. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
layer_norm_eps: The epsilon used by LayerNorm. The epsilon used by the layer normalization layers.
dropout (:obj:`float`, optional, defaults to 0.1):
dropout: float, dropout rate. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
init: str, the initialization scheme, either "normal" or "uniform". mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
init_range: float, initialize the parameters with a uniform distribution The number of tokens to cache. The key/value pairs that have already been pre-computed
in [-init_range, init_range]. Only effective when init="uniform". in a previous forward pass won't be re-computed. See the
init_std: float, initialize the parameters with a normal distribution `quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
with mean 0 and stddev init_std. Only effective when init="normal". for more information.
mem_len: int, the number of tokens to cache. reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
reuse_len: int, the number of tokens in the currect batch to be cached The number of tokens in the current batch to be cached and reused in the future.
and reused in the future. bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
bi_data: bool, whether to use bidirectional input pipeline. Whether to use bidirectional input pipeline. Usually set to `True` during
Usually set to True during pretraining and False during finetuning. pretraining and `False` during finetuning.
clamp_len: int, clamp all relative distances larger than clamp_len. clamp_len (:obj:`int`, optional, defaults to -1):
-1 means no clamping. Clamp all relative distances larger than clamp_len.
same_length: bool, whether to use the same attention length for each token. Setting this attribute to -1 means no clamping.
finetuning_task: name of the glue task on which the model was fine-tuned if any same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use the same attention length for each token.
summary_type (:obj:`string`, optional, defaults to "last"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a dropout after the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
Example::
from transformers import XLNetConfig, XLNetModel
# Initializing a XLNet configuration
configuration = XLNetConfig()
# Initializing a model from the configuration
model = XLNetModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
    pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
...
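For the memory-related arguments documented above (mem_len, reuse_len, clamp_len), a minimal illustrative configuration, not part of this commit and with arbitrary values, could look as follows:

    # Illustrative sketch, not part of this commit: an XLNet configuration with
    # memory caching enabled, using only documented arguments (values arbitrary).
    from transformers import XLNetConfig, XLNetModel

    configuration = XLNetConfig(
        d_model=768,
        n_layer=12,
        n_head=12,
        d_inner=3072,
        mem_len=512,      # cache up to 512 tokens of key/value pairs across forward passes
        clamp_len=-1,     # no clamping of relative distances
        attn_type="bi",   # bidirectional attention (the XLNet setting)
    )
    model = XLNetModel(configuration)
    assert model.config.mem_len == 512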