"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "d0efbd3cd1fa268a3f5c5235237ceae0bde69776"
Commit 715fa638 authored by Julien Chaumond's avatar Julien Chaumond
Browse files

Merge branch 'master' into from_scratch_training

parents 764f836d 100e3b6f
ALBERT
----------------------------------------------------
``AlbertConfig``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertConfig
......
@@ -34,6 +34,13 @@ XLM
:members:
``XLMForQuestionAnsweringSimple``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMForQuestionAnsweringSimple
:members:
``XLMForQuestionAnswering``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
@@ -36,6 +36,27 @@ XLNet
:members:
``XLNetForTokenClassification``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForTokenClassification
:members:
``XLNetForMultipleChoice``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForMultipleChoice
:members:
``XLNetForQuestionAnsweringSimple``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForQuestionAnsweringSimple
:members:
``XLNetForQuestionAnswering``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
@@ -42,6 +42,7 @@ class LmSeqsDataset(Dataset):
self.check()
self.remove_long_sequences()
self.remove_empty_sequences()
self.remove_unknown_sequences()
self.check()
self.print_statistics()
@@ -109,6 +110,22 @@ class LmSeqsDataset(Dataset):
new_size = len(self)
logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
def remove_unknown_sequences(self):
"""
Remove sequences with a (too) high level of unknown tokens.
"""
if "unk_token" not in self.params.special_tok_ids:
return
else:
unk_token_id = self.params.special_tok_ids["unk_token"]
init_size = len(self)
unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
indices = (unk_occs / self.lengths) < 0.5
self.token_ids = self.token_ids[indices]
self.lengths = self.lengths[indices]
new_size = len(self)
logger.info(f"Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).")
def print_statistics(self):
"""
Print some statistics on the corpus. Only the master process.
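A minimal standalone sketch of the 50% unknown-token filter introduced by `remove_unknown_sequences`, using toy data (array names mirror the patch, values are illustrative)::

    import numpy as np

    # Toy corpus: three token-id sequences, with 0 playing the role of the unk token id.
    token_ids = np.array([np.array([5, 0, 0, 0]), np.array([7, 8, 9]), np.array([0, 0, 1, 2])], dtype=object)
    lengths = np.array([len(a) for a in token_ids])

    unk_token_id = 0
    unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in token_ids])
    keep = (unk_occs / lengths) < 0.5  # keep sequences that are less than 50% unknown tokens
    token_ids, lengths = token_ids[keep], lengths[keep]
    # Only [7, 8, 9] survives: the other two sequences are at least 50% unknown tokens.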
......
{
"activation": "gelu",
"attention_dropout": 0.1,
"dim": 768,
"dropout": 0.1,
"hidden_dim": 3072,
"initializer_range": 0.02,
"max_position_embeddings": 512,
"n_heads": 12,
"n_layers": 6,
"sinusoidal_pos_embds": true,
"tie_weights_": true,
"vocab_size": 119547
}
\ No newline at end of file
{
"vocab_size": 50265,
"hidden_size": 768,
"num_hidden_layers": 6,
"num_attention_heads": 12,
"intermediate_size": 3072,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"attention_probs_dropout_prob": 0.1,
"max_position_embeddings": 514,
"type_vocab_size": 1,
"initializer_range": 0.02,
"layer_norm_eps": 0.00001
}
\ No newline at end of file
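Either JSON above can be loaded through the `from_json_file` helper inherited from `PretrainedConfig`; a brief sketch, assuming the first file has been saved locally under a placeholder path::

    from transformers import DistilBertConfig

    # "student_config.json" is a placeholder path for the first serialization config above.
    config = DistilBertConfig.from_json_file("student_config.json")
    print(config.n_layers, config.vocab_size)  # 6 119547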
@@ -344,6 +344,7 @@ def full_text_generation(
gamma=1.5,
gm_scale=0.9,
kl_scale=0.01,
repetition_penalty=1.0,
**kwargs
):
classifier, class_id = get_classifier(discrim, class_label, device)
@@ -368,7 +369,14 @@ def full_text_generation(
raise Exception("Specify either a bag of words or a discriminator")
unpert_gen_tok_text, _, _ = generate_text_pplm(
model=model,
tokenizer=tokenizer,
context=context,
device=device,
length=length,
sample=sample,
perturb=False,
repetition_penalty=repetition_penalty,
)
if device == "cuda":
torch.cuda.empty_cache()
@@ -401,6 +409,7 @@ def full_text_generation(
gamma=gamma,
gm_scale=gm_scale,
kl_scale=kl_scale,
repetition_penalty=repetition_penalty,
)
pert_gen_tok_texts.append(pert_gen_tok_text)
if classifier is not None:
@@ -437,6 +446,7 @@ def generate_text_pplm(
gamma=1.5,
gm_scale=0.9,
kl_scale=0.01,
repetition_penalty=1.0,
):
output_so_far = None
if context:
@@ -508,6 +518,13 @@ def generate_text_pplm(
pert_logits, past, pert_all_hidden = model(last, past=pert_past)
pert_logits = pert_logits[:, -1, :] / temperature # + SMALL_CONST
for token_idx in set(output_so_far[0].tolist()):
if pert_logits[0, token_idx] < 0:
pert_logits[0, token_idx] *= repetition_penalty
else:
pert_logits[0, token_idx] /= repetition_penalty
pert_probs = F.softmax(pert_logits, dim=-1)
if classifier is not None:
@@ -588,6 +605,7 @@ def run_pplm_example(
seed=0,
no_cuda=False,
colorama=False,
repetition_penalty=1.0,
):
# set Random seed
torch.manual_seed(seed)
@@ -655,6 +673,7 @@ def run_pplm_example(
gamma=gamma,
gm_scale=gm_scale,
kl_scale=kl_scale,
repetition_penalty=repetition_penalty,
)
# untokenize unperturbed text
@@ -767,6 +786,9 @@ if __name__ == "__main__":
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--no_cuda", action="store_true", help="no cuda")
parser.add_argument("--colorama", action="store_true", help="colors keywords")
parser.add_argument(
"--repetition_penalty", type=float, default=1.0, help="Penalize repetition. More than 1.0 -> less repetition",
)
args = parser.parse_args()
run_pplm_example(**vars(args))
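A minimal sketch of what the new repetition-penalty block does to the next-token logits, with toy values (not taken from the script)::

    import torch

    logits = torch.tensor([[2.0, -1.0, 0.5]])  # toy next-token logits for a 3-token vocabulary
    generated = [0, 2]                         # token ids already present in the output
    repetition_penalty = 1.2

    for token_idx in set(generated):
        if logits[0, token_idx] < 0:
            logits[0, token_idx] *= repetition_penalty  # push negative logits further down
        else:
            logits[0, token_idx] /= repetition_penalty  # shrink positive logits
    # logits -> tensor([[1.6667, -1.0000, 0.4167]]): already-generated tokens become less likely.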
@@ -31,9 +31,73 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class AlbertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`.
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30000):
Vocabulary size of the ALBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of vocabulary embeddings.
hidden_size (:obj:`int`, optional, defaults to 4096):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_hidden_groups (:obj:`int`, optional, defaults to 1):
Number of groups for the hidden layers, parameters in the same group are shared.
num_attention_heads (:obj:`int`, optional, defaults to 64):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 16384):
The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
inner_group_num (:obj:`int`, optional, defaults to 1):
The number of inner repetition of attention and ffn.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something
large (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example::
# Initializing an ALBERT-xxlarge style configuration
albert_xxlarge_configuration = AlbertConfig()
# Initializing an ALBERT-base style configuration
albert_base_configuration = AlbertConfig(
hidden_size=768,
num_attention_heads=12,
intermediate_size=3072,
)
# Initializing a model from the ALBERT-base style configuration
model = AlbertModel(albert_base_configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -58,35 +122,6 @@ class AlbertConfig(PretrainedConfig):
layer_norm_eps=1e-12,
**kwargs
):
"""Constructs AlbertConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
embedding_size: size of voc embeddings.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_hidden_groups: Number of group for the hidden layers, parameters in
the same group are shared.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
inner_group_num: int, number of inner repetition of attention and ffn.
down_scale_factor: float, the scale to apply
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`AlbertModel`.
initializer_range: The stdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(AlbertConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
......
@@ -77,32 +77,15 @@ CONFIG_MAPPING = OrderedDict(
)
class AutoConfig(object):
r"""
:class:`~transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library
when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
When using string matching, the configuration class is matched on
the `pretrained_model_name_or_path` string in the following order:
- contains `t5`: T5Config (T5 model)
- contains `distilbert`: DistilBertConfig (DistilBERT model)
- contains `albert`: AlbertConfig (ALBERT model)
- contains `camembert`: CamembertConfig (CamemBERT model)
- contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
- contains `roberta`: RobertaConfig (RoBERTa model)
- contains `bert`: BertConfig (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model)
- contains `xlm`: XLMConfig (XLM model)
- contains `ctrl` : CTRLConfig (CTRL model)
This class cannot be instantiated using `__init__()` (throw an error).
""" """
def __init__(self): def __init__(self):
...@@ -124,60 +107,61 @@ class AutoConfig: ...@@ -124,60 +107,61 @@ class AutoConfig:
@classmethod @classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r""" Instantiate one of the configuration classes of the library r""" Instantiates one of the configuration classes of the library
from a pre-trained model configuration. from a pre-trained model configuration.
The configuration class to instantiate is selected The configuration class to instantiate is selected
based on the `model_type` property of the config object, or when it's missing, based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string. falling back to using pattern matching on the `pretrained_model_name_or_path` string.
- contains `t5`: T5Config (T5 model) - contains `t5`: :class:`~transformers.T5Config` (T5 model)
- contains `distilbert`: DistilBertConfig (DistilBERT model) - contains `distilbert`: :class:`~transformers.DistilBertConfig` (DistilBERT model)
- contains `albert`: AlbertConfig (ALBERT model) - contains `albert`: :class:`~transformers.AlbertConfig` (ALBERT model)
- contains `camembert`: CamembertConfig (CamemBERT model) - contains `camembert`: :class:`~transformers.CamembertConfig` (CamemBERT model)
- contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model) - contains `xlm-roberta`: :class:`~transformers.XLMRobertaConfig` (XLM-RoBERTa model)
- contains `roberta`: RobertaConfig (RoBERTa model) - contains `roberta`: :class:`~transformers.RobertaConfig` (RoBERTa model)
- contains `bert`: BertConfig (Bert model) - contains `bert`: :class:`~transformers.BertConfig` (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) - contains `openai-gpt`: :class:`~transformers.OpenAIGPTConfig` (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model) - contains `gpt2`: :class:`~transformers.GPT2Config` (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model) - contains `transfo-xl`: :class:`~transformers.TransfoXLConfig` (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model) - contains `xlnet`: :class:`~transformers.XLNetConfig` (XLNet model)
- contains `xlm`: XLMConfig (XLM model) - contains `xlm`: :class:`~transformers.XLMConfig` (XLM model)
- contains `ctrl` : CTRLConfig (CTRL model) - contains `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model)
Args:
pretrained_model_name_or_path (:obj:`string`):
Is either: \
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
cache_dir (:obj:`string`, optional, defaults to `None`):
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
force_download (:obj:`boolean`, optional, defaults to `False`):
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
resume_download (:obj:`boolean`, optional, defaults to `False`):
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
proxies (:obj:`Dict[str, str]`, optional, defaults to `None`):
A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`.
The proxies are used on each request. See `the requests documentation <https://requests.readthedocs.io/en/master/user/advanced/#proxies>`__ for usage.
return_unused_kwargs (:obj:`boolean`, optional, defaults to `False`):
- If False, then this function returns just the final configuration object.
- If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): key/value pairs with which to update the configuration object after loading.
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
Examples::
config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
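For the `return_unused_kwargs` behaviour described above, a short sketch (fetching the configuration requires network access)::

    from transformers import AutoConfig

    # `foo` is not a configuration attribute, so it comes back in `unused` instead of being set on the config.
    config, unused = AutoConfig.from_pretrained("bert-base-uncased", foo=False, return_unused_kwargs=True)
    print(type(config).__name__)  # BertConfig
    print(unused)                 # {'foo': False}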
......
@@ -50,32 +50,61 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
It is used to instantiate a BERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the BERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
hidden_size (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example::
from transformers import BertModel, BertConfig
# Initializing a BERT bert-base-uncased style configuration
configuration = BertConfig()
# Initializing a model from the bert-base-uncased style configuration
model = BertModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "bert" model_type = "bert"
...@@ -97,6 +126,7 @@ class BertConfig(PretrainedConfig): ...@@ -97,6 +126,7 @@ class BertConfig(PretrainedConfig):
**kwargs **kwargs
): ):
super(BertConfig, self).__init__(**kwargs) super(BertConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size self.vocab_size = vocab_size
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers self.num_hidden_layers = num_hidden_layers
......
@@ -29,5 +29,35 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class CamembertConfig(RobertaConfig):
r"""
This is the configuration class to store the configuration of an :class:`~transformers.CamembertModel`.
It is used to instantiate a CamemBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
The :class:`~transformers.CamembertConfig` class directly inherits :class:`~transformers.RobertaConfig`.
It reuses the same defaults. Please check the parent class for more information.
Example::
from transformers import CamembertModel, CamembertConfig
# Initializing a CamemBERT configuration
configuration = CamembertConfig()
# Initializing a model from the configuration
model = CamembertModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "camembert"
@@ -26,25 +26,60 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf
class CTRLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of an :class:`~transformers.CTRLModel`.
It is used to instantiate a CTRL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 246534):
Vocabulary size of the CTRL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
n_positions (:obj:`int`, optional, defaults to 256):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 256):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 1280):
Dimensionality of the embeddings and hidden states.
dff (:obj:`int`, optional, defaults to 8192):
Dimensionality of the inner dimension of the FFN.
n_layer (:obj:`int`, optional, defaults to 48):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Example::
from transformers import CTRLModel, CTRLConfig
# Initializing a CTRL configuration
configuration = CTRLConfig()
# Initializing a model from the configuration
model = CTRLModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
...@@ -71,26 +106,6 @@ class CTRLConfig(PretrainedConfig): ...@@ -71,26 +106,6 @@ class CTRLConfig(PretrainedConfig):
summary_first_dropout=0.1, summary_first_dropout=0.1,
**kwargs **kwargs
): ):
"""Constructs CTRLConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
dff: Size of the inner dimension of the FFN.
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The sttdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(CTRLConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.n_ctx = n_ctx
......
@@ -31,6 +31,67 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DistilBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the DistilBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings.
n_layers (:obj:`int`, optional, defaults to 6):
Number of hidden layers in the Transformer encoder.
n_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
dim (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (:obj:`int`, optional, defaults to 3072):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
qa_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probabilities used in the question answering model
:class:`~transformers.DistilBertForQuestionAnswering`.
seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
The dropout probabilities used in the sequence classification model
:class:`~transformers.DistilBertForSequenceClassification`.
Example::
from transformers import DistilBertModel, DistilBertConfig
# Initializing a DistilBERT configuration
configuration = DistilBertConfig()
# Initializing a model from the configuration
model = DistilBertModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "distilbert"
@@ -47,7 +108,6 @@ class DistilBertConfig(PretrainedConfig):
attention_dropout=0.1,
activation="gelu",
initializer_range=0.02,
tie_weights_=True,
qa_dropout=0.1,
seq_classif_dropout=0.2,
**kwargs
@@ -64,7 +124,6 @@ class DistilBertConfig(PretrainedConfig):
self.attention_dropout = attention_dropout
self.activation = activation
self.initializer_range = initializer_range
self.tie_weights_ = tie_weights_
self.qa_dropout = qa_dropout
self.seq_classif_dropout = seq_classif_dropout
......
@@ -33,24 +33,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class GPT2Config(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 50257):
Vocabulary size of the GPT-2 model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
n_positions (:obj:`int`, optional, defaults to 1024):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 1024):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
from transformers import GPT2Model, GPT2Config
# Initializing a GPT2 configuration
configuration = GPT2Config()
# Initializing a model from the configuration
model = GPT2Model(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
...@@ -76,26 +136,8 @@ class GPT2Config(PretrainedConfig): ...@@ -76,26 +136,8 @@ class GPT2Config(PretrainedConfig):
summary_first_dropout=0.1, summary_first_dropout=0.1,
**kwargs **kwargs
): ):
"""Constructs GPT2Config.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The sttdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(GPT2Config, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.n_ctx = n_ctx
self.n_positions = n_positions
......
@@ -26,9 +26,13 @@ class MMBTConfig(object):
"""Configuration class to store the configuration of a `MMBT Model`.
Args:
config (:obj:`~transformers.PretrainedConfig`):
Config of the underlying Transformer models. Its values are
copied over to use a single config.
num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`):
Size of final Linear layer for classification.
modal_hidden_size (:obj:`int`, optional, defaults to 2048):
Embedding dimension of the non-text modality encoder.
""" """
def __init__(self, config, num_labels=None, modal_hidden_size=2048): def __init__(self, config, num_labels=None, modal_hidden_size=2048):
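A brief usage sketch of the constructor shown above, assuming `MMBTConfig` is exported from the top-level package (it is defined in the `configuration_mmbt.py` file touched here)::

    from transformers import BertConfig, MMBTConfig

    # Wrap an existing transformer config; its values are copied over into the single MMBT config.
    transformer_config = BertConfig()
    mmbt_config = MMBTConfig(transformer_config, num_labels=2, modal_hidden_size=2048)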
......
@@ -30,27 +30,87 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class OpenAIGPTConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of an :class:`~transformers.OpenAIGPTModel`.
It is used to instantiate a GPT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 40478):
Vocabulary size of the GPT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
n_positions (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 512):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether special tokens should be predicted when the model has a language modeling head.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a dropout before the projection and activation
Example::
from transformers import OpenAIGPTConfig, OpenAIGPTModel
# Initializing a GPT configuration
configuration = OpenAIGPTConfig()
# Initializing a model from the configuration
model = OpenAIGPTModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
...@@ -78,9 +138,8 @@ class OpenAIGPTConfig(PretrainedConfig): ...@@ -78,9 +138,8 @@ class OpenAIGPTConfig(PretrainedConfig):
summary_first_dropout=0.1, summary_first_dropout=0.1,
**kwargs **kwargs
): ):
"""Constructs OpenAIGPTConfig.
"""
super(OpenAIGPTConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.n_ctx = n_ctx
self.n_positions = n_positions
......
@@ -34,5 +34,35 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RobertaConfig(BertConfig):
r"""
This is the configuration class to store the configuration of an :class:`~transformers.RobertaModel`.
It is used to instantiate a RoBERTa model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
It reuses the same defaults. Please check the parent class for more information.
Example::
from transformers import RobertaConfig, RobertaModel
# Initializing a RoBERTa configuration
configuration = RobertaConfig()
# Initializing a model from the configuration
model = RobertaModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "roberta"
@@ -29,39 +29,91 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class TransfoXLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of an :class:`~transformers.TransfoXLModel`.
It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 267735):
Vocabulary size of the Transformer XL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
Cutoffs for the adaptive softmax
d_model (:obj:`int`, optional, defaults to 1024):
Dimensionality of the model's hidden states.
d_embed (:obj:`int`, optional, defaults to 1024):
Dimensionality of the embeddings
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_head (:obj:`int`, optional, defaults to 64):
Dimensionality of the model's heads.
d_inner (:obj:`int`, optional, defaults to 4096):
Inner dimension in FF
div_val (:obj:`int`, optional, defaults to 4):
Dividend value for adaptive input and softmax
pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
Apply LayerNorm to the input instead of the output
n_layer (:obj:`int`, optional, defaults to 18):
Number of hidden layers in the Transformer encoder.
tgt_len (:obj:`int`, optional, defaults to 128):
Number of tokens to predict
ext_len (:obj:`int`, optional, defaults to 0):
Length of the extended context
mem_len (:obj:`int`, optional, defaults to 1600):
Length of the retained previous heads
clamp_len (:obj:`int`, optional, defaults to 1000):
Use the same pos embeddings after clamp_len
same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
Use the same attn length for all tokens
proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
True to share all but first projs, False not to share.
attn_type (:obj:`int`, optional, defaults to 0):
Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
sample_softmax (:obj:`int`, optional, defaults to -1):
number of samples in sampled softmax
adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
use adaptive softmax
tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`):
tie the word embedding and softmax weights
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
dropatt (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Untie relative position biases
init (:obj:`string`, optional, defaults to `normal`):
Parameter initializer to use
init_range (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by U(-init_range, init_range).
proj_init_std (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by N(0, init_std)
init_std (:obj:`float`, optional, defaults to 0.02):
Parameters initialized by N(0, init_std)
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
Example::
from transformers import TransfoXLConfig, TransfoXLModel
# Initializing a Transformer XL configuration
configuration = TransfoXLConfig()
# Initializing a model from the configuration
model = TransfoXLModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
...@@ -99,9 +151,8 @@ class TransfoXLConfig(PretrainedConfig):
layer_norm_epsilon=1e-5,
**kwargs
):
super(TransfoXLConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.cutoffs = []
self.cutoffs.extend(cutoffs)
...
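To make the memory-related arguments above concrete, here is a hedged sketch of a deliberately small Transformer XL configuration; every value is an illustrative choice rather than a recommended or pretrained setting, and `cutoffs` is shrunk to stay consistent with the reduced `vocab_size`::

    from transformers import TransfoXLConfig, TransfoXLModel

    # A small, illustrative Transformer XL configuration.
    configuration = TransfoXLConfig(
        vocab_size=10000,
        cutoffs=[2000, 4000, 8000],  # adaptive softmax cutoffs, kept below vocab_size
        d_model=256,
        d_embed=256,
        n_layer=4,
        n_head=4,
        d_head=64,
        d_inner=1024,
        mem_len=128,       # number of previous hidden states kept as memory
        clamp_len=400,     # reuse the same positional embedding beyond this distance
        same_length=True,  # same attention length for every token
    )

    # Build a randomly initialized model and check the stored settings.
    model = TransfoXLModel(configuration)
    print(configuration.to_json_string())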
...@@ -37,44 +37,124 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLMConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `XLMModel`. """
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
Args: It is used to instantiate an XLM model according to the specified arguments, defining the model
vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`. architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
d_model: Size of the encoder layers and the pooler layer. the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
the Transformer encoder. to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
d_inner: The size of the "intermediate" (i.e., feed-forward) for more information.
layer in the Transformer encoder.
ff_activation: The non-linear activation function (function or string) in the Args:
encoder and pooler. If string, "gelu", "relu" and "swish" are supported. vocab_size (:obj:`int`, optional, defaults to 30145):
untie_r: untie relative position biases Vocabulary size of the XLM model. Defines the different tokens that
attn_type: 'bi' for XLM, 'uni' for Transformer-XL can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
dropout: The dropout probabilitiy for all fully connected Dimensionality of the encoder layers and the pooler layer.
layers in the embeddings, encoder, and pooler. n_layer (:obj:`int`, optional, defaults to 12):
max_position_embeddings: The maximum sequence length that this model might Number of hidden layers in the Transformer encoder.
ever be used with. Typically set this to something large just in case n_head (:obj:`int`, optional, defaults to 16):
(e.g., 512 or 1024 or 2048). Number of attention heads for each attention layer in the Transformer encoder.
initializer_range: The sttdev of the truncated_normal_initializer for dropout (:obj:`float`, optional, defaults to 0.1):
initializing all weight matrices. The dropout probability for all fully connected
layer_norm_eps: The epsilon used by LayerNorm. layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
dropout: float, dropout rate. The dropout probability for the attention mechanism
init: str, the initialization scheme, either "normal" or "uniform". gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
init_range: float, initialize the parameters with a uniform distribution The non-linear activation function (function or string) in the
in [-init_range, init_range]. Only effective when init="uniform". encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
init_std: float, initialize the parameters with a normal distribution sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
with mean 0 and stddev init_std. Only effective when init="normal". Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
mem_len: int, the number of tokens to cache. causal (:obj:`boolean`, optional, defaults to :obj:`False`):
reuse_len: int, the number of tokens in the currect batch to be cached Set this to `True` for the model to behave in a causal manner.
and reused in the future. Causal models use a triangular attention mask in order to only attend to the left-side context instead
bi_data: bool, whether to use bidirectional input pipeline. if a bidirectional context.
Usually set to True during pretraining and False during finetuning. asm (:obj:`boolean`, optional, defaults to :obj:`False`):
clamp_len: int, clamp all relative distances larger than clamp_len. Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
-1 means no clamping. layer.
same_length: bool, whether to use the same attention length for each token. n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`)
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`int`, optional, defaults to 50257):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
Example::
from transformers import XLMConfig, XLMModel
# Initializing a XLM configuration
configuration = XLMConfig()
# Initializing a model from the configuration
model = XLMModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
...
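The language-related arguments (`n_langs`, `use_lang_emb`, `causal`) are usually the first ones to change when moving between monolingual and multilingual XLM setups. A minimal sketch follows, with illustrative values and keyword names assumed to match the XLMConfig constructor of this transformers version::

    from transformers import XLMConfig, XLMModel

    # A monolingual, causal XLM-style configuration; all values are illustrative.
    configuration = XLMConfig(
        vocab_size=30145,
        emb_dim=1024,        # must stay divisible by the number of attention heads
        n_langs=1,           # monolingual: a single language
        use_lang_emb=False,  # no additional language embeddings needed
        causal=True,         # triangular attention mask (left-side context only)
    )

    # Build a randomly initialized model from the configuration.
    model = XLMModel(configuration)
    print(configuration.to_json_string())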
...@@ -30,42 +30,102 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLNetConfig(PretrainedConfig):
"""Configuration class to store the configuration of a ``XLNetModel``. """
This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
Args: It is used to instantiate an XLNet model according to the specified arguments, defining the model
vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
d_model: Size of the encoder layers and the pooler layer. the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
the Transformer encoder. to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
d_inner: The size of the "intermediate" (i.e., feed-forward) for more information.
layer in the Transformer encoder.
ff_activation: The non-linear activation function (function or string) in the Args:
encoder and pooler. If string, "gelu", "relu" and "swish" are supported. vocab_size (:obj:`int`, optional, defaults to 32000):
untie_r: untie relative position biases Vocabulary size of the XLNet model. Defines the different tokens that
attn_type: 'bi' for XLNet, 'uni' for Transformer-XL can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
d_model (:obj:`int`, optional, defaults to 1024):
dropout: The dropout probabilitiy for all fully connected Dimensionality of the encoder layers and the pooler layer.
layers in the embeddings, encoder, and pooler. n_layer (:obj:`int`, optional, defaults to 24):
initializer_range: The sttdev of the truncated_normal_initializer for Number of hidden layers in the Transformer encoder.
initializing all weight matrices. n_head (:obj:`int`, optional, defaults to 16):
layer_norm_eps: The epsilon used by LayerNorm. Number of attention heads for each attention layer in the Transformer encoder.
d_inner (:obj:`int`, optional, defaults to 4096):
dropout: float, dropout rate. Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
init: str, the initialization scheme, either "normal" or "uniform". ff_activation (:obj:`string`, optional, defaults to "gelu"):
init_range: float, initialize the parameters with a uniform distribution The non-linear activation function (function or string) in the
in [-init_range, init_range]. Only effective when init="uniform". encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
init_std: float, initialize the parameters with a normal distribution untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
with mean 0 and stddev init_std. Only effective when init="normal". Untie relative position biases
mem_len: int, the number of tokens to cache. attn_type (:obj:`string`, optional, defaults to "bi"):
reuse_len: int, the number of tokens in the currect batch to be cached The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
and reused in the future. initializer_range (:obj:`float`, optional, defaults to 0.02):
bi_data: bool, whether to use bidirectional input pipeline. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Usually set to True during pretraining and False during finetuning. layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
clamp_len: int, clamp all relative distances larger than clamp_len. The epsilon used by the layer normalization layers.
-1 means no clamping. dropout (:obj:`float`, optional, defaults to 0.1):
same_length: bool, whether to use the same attention length for each token. The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
finetuning_task: name of the glue task on which the model was fine-tuned if any mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens to cache. The key/value pairs that have already been pre-computed
in a previous forward pass won't be re-computed. See the
`quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
for more information.
reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens in the current batch to be cached and reused in the future.
bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use bidirectional input pipeline. Usually set to `True` during
pretraining and `False` during finetuning.
clamp_len (:obj:`int`, optional, defaults to -1):
Clamp all relative distances larger than clamp_len.
Setting this attribute to -1 means no clamping.
same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use the same attention length for each token.
summary_type (:obj:`string`, optional, defaults to "last"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a dropout after the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
Example::
from transformers import XLNetConfig, XLNetModel
# Initializing a XLNet configuration
configuration = XLNetConfig()
# Initializing a model from the configuration
model = XLNetModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
""" """
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
...
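Since `mem_len` and `reuse_len` control XLNet's recurrence-style caching, a quick way to experiment with them is a scaled-down configuration. A hedged sketch with illustrative values only::

    from transformers import XLNetConfig, XLNetModel

    # A small XLNet-style configuration; the values are illustrative only.
    configuration = XLNetConfig(
        vocab_size=32000,
        d_model=512,    # must be divisible by n_head
        n_layer=6,
        n_head=8,
        d_inner=2048,
        mem_len=256,    # cache up to 256 hidden states from previous forward passes
        clamp_len=-1,   # -1 disables clamping of relative distances
    )

    # Build a randomly initialized model and inspect the stored settings.
    model = XLNetModel(configuration)
    print(configuration.to_json_string())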