"...csrc/git@developer.sourcefind.cn:change/sglang.git" did not exist on "737d73ed5bcb5cd161c142551c7e43257e8aa130"
Commit a75c64d8 authored by Lysandre

Black 20 release

parent e78c1103
@@ -90,11 +90,11 @@ class TokenClassificationTask:
        sequence_a_segment_id=0,
        mask_padding_with_zero=True,
    ) -> List[InputFeatures]:
-        """ Loads a data file into a list of `InputFeatures`
+        """Loads a data file into a list of `InputFeatures`
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
        """
        # TODO clean up all this to leverage built-in features of tokenizers

@@ -230,7 +230,8 @@ if is_torch_available():
        ):
            # Load data features from cache or dataset file
            cached_features_file = os.path.join(
-                data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
+                data_dir,
+                "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
            )
            # Make sure only the first process in distributed training processes the dataset,
...
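
For orientation, the cache filename in the hunk above is built from the dataset split, the tokenizer class name, and the maximum sequence length. A minimal, self-contained sketch of that pattern (the helper name and arguments are illustrative, not the exact variables from the file):

    import os

    def cached_features_path(data_dir, mode, tokenizer_class_name, max_seq_length):
        # Mirrors the "cached_{}_{}_{}".format(...) call shown in the diff above.
        return os.path.join(
            data_dir,
            "cached_{}_{}_{}".format(mode, tokenizer_class_name, str(max_seq_length)),
        )

    # cached_features_path("./data", "train", "BertTokenizer", 128)
    # -> "./data/cached_train_BertTokenizer_128"
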
@@ -14,18 +14,18 @@ def swish(x):
def _gelu_python(x):
-    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
+    """Original Implementation of the gelu activation function in Google Bert repo when initially created.
    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    This is now written in C in torch.nn.functional
    Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_new(x):
-    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
+    """Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
    Also see https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
...
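
For reference, the two GELU variants whose docstrings are reformatted above can be exercised standalone; this sketch only assumes torch is installed and restates the formulas quoted in the diff:

    import math

    import torch

    def gelu_exact(x):
        # Exact GELU via the Gaussian CDF (erf), as in _gelu_python above.
        return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

    def gelu_tanh_approx(x):
        # Tanh approximation used in the original BERT/GPT code, as in gelu_new above.
        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

    # The two agree to a few decimal places:
    # xs = torch.linspace(-3.0, 3.0, steps=7)
    # torch.allclose(gelu_exact(xs), gelu_tanh_approx(xs), atol=1e-3)  # True
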
@@ -199,11 +199,17 @@ class PyTorchBenchmark(Benchmark):
            # run additional 10 times to stabilize compilation for tpu and torchscript
            logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
            timeit.repeat(
-                func, repeat=1, number=5,
+                func,
+                repeat=1,
+                number=5,
            )
        # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
-        runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
+        runtimes = timeit.repeat(
+            func,
+            repeat=self.args.repeat,
+            number=10,
+        )
        if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
            import torch_xla.debug.metrics as met
...
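
As the comment in the hunk notes, timeit.Timer.repeat results are best summarized by their minimum rather than their average, since slower runs are mostly noise from other processes. A minimal sketch of the measurement pattern used above (the helper name is illustrative):

    import timeit

    def min_runtime_per_call(func, repeat=3, number=10):
        # Run `func` `number` times per measurement, take `repeat` measurements,
        # and report the fastest measurement normalized to a single call.
        runtimes = timeit.repeat(func, repeat=repeat, number=number)
        return min(runtimes) / number

    # min_runtime_per_call(lambda: sum(range(10_000)))
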
@@ -32,10 +32,12 @@ logger = logging.get_logger(__name__)
@dataclass
class TensorFlowBenchmarkArguments(BenchmarkArguments):
    tpu_name: str = field(
-        default=None, metadata={"help": "Name of TPU"},
+        default=None,
+        metadata={"help": "Name of TPU"},
    )
    device_idx: int = field(
-        default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."},
+        default=0,
+        metadata={"help": "CPU / GPU device index. Defaults to 0."},
    )
    eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager model."})
    use_xla: bool = field(
...
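
The fields reformatted above follow the standard dataclasses pattern of a default value plus a help string in metadata. A small illustrative sketch, not the real class (which inherits from BenchmarkArguments):

    from dataclasses import dataclass, field
    from typing import Optional

    @dataclass
    class MyBenchmarkArguments:
        # Stand-ins mirroring the fields shown in the diff above.
        tpu_name: Optional[str] = field(
            default=None,
            metadata={"help": "Name of TPU"},
        )
        device_idx: int = field(
            default=0,
            metadata={"help": "CPU / GPU device index. Defaults to 0."},
        )

    # args = MyBenchmarkArguments(device_idx=1)
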
@@ -219,7 +219,11 @@ class TensorFlowBenchmark(Benchmark):
                timeit.repeat(func, repeat=1, number=5)
            # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
-            runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
+            runtimes = timeit.repeat(
+                func,
+                repeat=self.args.repeat,
+                number=10,
+            )
            return min(runtimes) / 10.0
        except ResourceExhaustedError as e:
...
@@ -32,71 +32,71 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class AlbertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel`.
    It is used to instantiate an ALBERT model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        vocab_size (:obj:`int`, optional, defaults to 30000):
            Vocabulary size of the ALBERT model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
        embedding_size (:obj:`int`, optional, defaults to 128):
            Dimensionality of vocabulary embeddings.
        hidden_size (:obj:`int`, optional, defaults to 4096):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, optional, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_hidden_groups (:obj:`int`, optional, defaults to 1):
            Number of groups for the hidden layers, parameters in the same group are shared.
        num_attention_heads (:obj:`int`, optional, defaults to 64):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (:obj:`int`, optional, defaults to 16384):
            The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        inner_group_num (:obj:`int`, optional, defaults to 1):
            The number of inner repetition of attention and ffn.
        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
            The non-linear activation function (function or string) in the encoder and pooler.
            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, optional, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something
            large (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, optional, defaults to 2):
            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
            The dropout ratio for attached classifiers.

    Example::

        >>> from transformers import AlbertConfig, AlbertModel

        >>> # Initializing an ALBERT-xxlarge style configuration
        >>> albert_xxlarge_configuration = AlbertConfig()

        >>> # Initializing an ALBERT-base style configuration
        >>> albert_base_configuration = AlbertConfig(
        ...     hidden_size=768,
        ...     num_attention_heads=12,
        ...     intermediate_size=3072,
        ... )

        >>> # Initializing a model from the ALBERT-base style configuration
        >>> model = AlbertModel(albert_xxlarge_configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """

    model_type = "albert"
...
@@ -73,43 +73,112 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
CONFIG_MAPPING = OrderedDict(
    [
-        ("retribert", RetriBertConfig,),
+        (
+            "retribert",
+            RetriBertConfig,
+        ),
-        ("t5", T5Config,),
+        (
+            "t5",
+            T5Config,
+        ),
-        ("mobilebert", MobileBertConfig,),
+        (
+            "mobilebert",
+            MobileBertConfig,
+        ),
-        ("distilbert", DistilBertConfig,),
+        (
+            "distilbert",
+            DistilBertConfig,
+        ),
-        ("albert", AlbertConfig,),
+        (
+            "albert",
+            AlbertConfig,
+        ),
-        ("camembert", CamembertConfig,),
+        (
+            "camembert",
+            CamembertConfig,
+        ),
-        ("xlm-roberta", XLMRobertaConfig,),
+        (
+            "xlm-roberta",
+            XLMRobertaConfig,
+        ),
        ("pegasus", PegasusConfig),
-        ("marian", MarianConfig,),
+        (
+            "marian",
+            MarianConfig,
+        ),
-        ("mbart", MBartConfig,),
+        (
+            "mbart",
+            MBartConfig,
+        ),
-        ("bart", BartConfig,),
+        (
+            "bart",
+            BartConfig,
+        ),
-        ("reformer", ReformerConfig,),
+        (
+            "reformer",
+            ReformerConfig,
+        ),
-        ("longformer", LongformerConfig,),
+        (
+            "longformer",
+            LongformerConfig,
+        ),
-        ("roberta", RobertaConfig,),
+        (
+            "roberta",
+            RobertaConfig,
+        ),
-        ("flaubert", FlaubertConfig,),
+        (
+            "flaubert",
+            FlaubertConfig,
+        ),
-        ("bert", BertConfig,),
+        (
+            "bert",
+            BertConfig,
+        ),
-        ("openai-gpt", OpenAIGPTConfig,),
+        (
+            "openai-gpt",
+            OpenAIGPTConfig,
+        ),
-        ("gpt2", GPT2Config,),
+        (
+            "gpt2",
+            GPT2Config,
+        ),
-        ("transfo-xl", TransfoXLConfig,),
+        (
+            "transfo-xl",
+            TransfoXLConfig,
+        ),
-        ("xlnet", XLNetConfig,),
+        (
+            "xlnet",
+            XLNetConfig,
+        ),
-        ("xlm", XLMConfig,),
+        (
+            "xlm",
+            XLMConfig,
+        ),
-        ("ctrl", CTRLConfig,),
+        (
+            "ctrl",
+            CTRLConfig,
+        ),
-        ("electra", ElectraConfig,),
+        (
+            "electra",
+            ElectraConfig,
+        ),
-        ("encoder-decoder", EncoderDecoderConfig,),
+        (
+            "encoder-decoder",
+            EncoderDecoderConfig,
+        ),
    ]
)


class AutoConfig:
    r"""
    :class:`~transformers.AutoConfig` is a generic configuration class
    that will be instantiated as one of the configuration classes of the library
    when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.

    The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
    based on the `model_type` property of the config object, or when it's missing,
    falling back to using pattern matching on the `pretrained_model_name_or_path` string.
    """

    def __init__(self):
...
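
The AutoConfig docstring above describes a fallback that pattern-matches the model name against the keys of a mapping like CONFIG_MAPPING. A hedged, illustrative sketch of that lookup idea (the stub classes and helper name are hypothetical, not the library's actual implementation):

    from collections import OrderedDict

    class BertConfigStub: ...
    class RobertaConfigStub: ...

    # More specific keys must come before shorter ones ("roberta" before "bert"),
    # which is why the real mapping above is an OrderedDict.
    CONFIG_MAPPING_SKETCH = OrderedDict(
        [
            ("roberta", RobertaConfigStub),
            ("bert", BertConfigStub),
        ]
    )

    def resolve_config_class(pretrained_model_name_or_path):
        # Fallback pattern matching on the model name, used only when the config
        # itself does not carry a `model_type`.
        for key, config_class in CONFIG_MAPPING_SKETCH.items():
            if key in pretrained_model_name_or_path:
                return config_class
        raise ValueError("Unrecognized model identifier: {}".format(pretrained_model_name_or_path))

    # resolve_config_class("roberta-base")  # -> RobertaConfigStub
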
@@ -102,7 +102,7 @@ BART_CONFIG_ARGS_DOC = r"""
@add_start_docstrings_to_callable(BART_CONFIG_ARGS_DOC)
class BartConfig(PretrainedConfig):
    r"""
    Configuration class for Bart. Parameters are renamed from the fairseq implementation
    """

    model_type = "bart"

@@ -141,14 +141,14 @@ class BartConfig(PretrainedConfig):
        **common_kwargs
    ):
        r"""
        :class:`~transformers.BartConfig` is the configuration class for `BartModel`.

        Examples::

            >>> from transformers import BartConfig, BartModel
            >>> config = BartConfig.from_pretrained('facebook/bart-large')
            >>> model = BartModel(config)
        """
        if "hidden_size" in common_kwargs:
...
@@ -50,59 +50,59 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
    It is used to instantiate a BERT model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        vocab_size (:obj:`int`, optional, defaults to 30522):
            Vocabulary size of the BERT model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
        hidden_size (:obj:`int`, optional, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, optional, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, optional, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (:obj:`int`, optional, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
            The non-linear activation function (function or string) in the encoder and pooler.
            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, optional, defaults to 512):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, optional, defaults to 2):
            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        gradient_checkpointing (:obj:`bool`, optional, defaults to False):
            If True, use gradient checkpointing to save memory at the expense of slower backward pass.

    Example::

        >>> from transformers import BertModel, BertConfig

        >>> # Initializing a BERT bert-base-uncased style configuration
        >>> configuration = BertConfig()

        >>> # Initializing a model from the bert-base-uncased style configuration
        >>> model = BertModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """

    model_type = "bert"
...
@@ -25,55 +25,55 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.h
class CTRLConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
    It is used to instantiate a CTRL model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        vocab_size (:obj:`int`, optional, defaults to 246534):
            Vocabulary size of the CTRL model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
        n_positions (:obj:`int`, optional, defaults to 256):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        n_ctx (:obj:`int`, optional, defaults to 256):
            Dimensionality of the causal mask (usually same as n_positions).
        n_embd (:obj:`int`, optional, defaults to 1280):
            Dimensionality of the embeddings and hidden states.
        dff (:obj:`int`, optional, defaults to 8192):
            Dimensionality of the inner dimension of the FFN.
        n_layer (:obj:`int`, optional, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        n_head (:obj:`int`, optional, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        resid_pdrop (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        embd_pdrop (:obj:`int`, optional, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (:obj:`float`, optional, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
            The epsilon to use in the layer normalization layers.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    Example::

        >>> from transformers import CTRLModel, CTRLConfig

        >>> # Initializing a CTRL configuration
        >>> configuration = CTRLConfig()

        >>> # Initializing a model from the configuration
        >>> model = CTRLModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """

    model_type = "ctrl"
...
@@ -33,61 +33,61 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DistilBertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
    It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        vocab_size (:obj:`int`, optional, defaults to 30522):
            Vocabulary size of the DistilBERT model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
        max_position_embeddings (:obj:`int`, optional, defaults to 512):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
            Whether to use sinusoidal positional embeddings.
        n_layers (:obj:`int`, optional, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        n_heads (:obj:`int`, optional, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        dim (:obj:`int`, optional, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        hidden_dim (:obj:`int`, optional, defaults to 3072):
            The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        dropout (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, optional, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
            The non-linear activation function (function or string) in the encoder and pooler.
            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        qa_dropout (:obj:`float`, optional, defaults to 0.1):
            The dropout probabilities used in the question answering model
            :class:`~transformers.DistilBertForQuestionAnswering`.
        seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
            The dropout probabilities used in the sequence classification and the multiple choice model
            :class:`~transformers.DistilBertForSequenceClassification`.

    Example::

        >>> from transformers import DistilBertModel, DistilBertConfig

        >>> # Initializing a DistilBERT configuration
        >>> configuration = DistilBertConfig()

        >>> # Initializing a model from the configuration
        >>> model = DistilBertModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """

    model_type = "distilbert"
...
@@ -29,16 +29,16 @@ DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DPRConfig(BertConfig):
    r"""
    :class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
    `DPRModel`.

    This is the configuration class to store the configuration of a `DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`.
    It is used to instantiate the components of the DPR model.

    Args:
        projection_dim (:obj:`int`, optional, defaults to 0):
            Dimension of the projection for the context and question encoders.
            If it is set to zero (default), then no projection is done.
    """

    model_type = "dpr"
...
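
A minimal usage sketch for the projection_dim argument documented above, assuming the transformers version from this commit is installed; the values are illustrative:

    from transformers import DPRConfig

    # projection_dim=0 (the default) means no projection layer is added;
    # a positive value projects the encoder output to that dimension.
    default_config = DPRConfig()
    projected_config = DPRConfig(projection_dim=128)
    # projected_config.projection_dim -> 128
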
@@ -33,82 +33,82 @@ ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ElectraConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`.
    It is used to instantiate an ELECTRA model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__
    architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        vocab_size (:obj:`int`, optional, defaults to 30522):
            Vocabulary size of the ELECTRA model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`.
        embedding_size (:obj:`int`, optional, defaults to 128):
            Dimensionality of the encoder layers and the pooler layer.
        hidden_size (:obj:`int`, optional, defaults to 256):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, optional, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, optional, defaults to 4):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (:obj:`int`, optional, defaults to 1024):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
            The non-linear activation function (function or string) in the encoder and pooler.
            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, optional, defaults to 512):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, optional, defaults to 2):
            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        summary_type (:obj:`string`, optional, defaults to "first"):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.ElectraForMultipleChoice`.
            Is one of the following options:
                - 'last' => take the last token hidden state (like XLNet)
                - 'first' => take the first token hidden state (like Bert)
                - 'mean' => take the mean of all tokens hidden states
                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
                - 'attn' => Not implemented now, use multi-head attention
        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.ElectraForMultipleChoice`.
            Add a projection after the vector extraction
        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.ElectraForMultipleChoice`.
            'gelu' => add a gelu activation to the output, Other => no activation.
        summary_last_dropout (:obj:`float`, optional, defaults to 0.0):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.ElectraForMultipleChoice`.
            Add a dropout after the projection and activation

    Example::

        >>> from transformers import ElectraModel, ElectraConfig

        >>> # Initializing an ELECTRA electra-base-uncased style configuration
        >>> configuration = ElectraConfig()

        >>> # Initializing a model from the electra-base-uncased style configuration
        >>> model = ElectraModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """

    model_type = "electra"
...
@@ -25,47 +25,47 @@ logger = logging.get_logger(__name__)
class EncoderDecoderConfig(PretrainedConfig):
    r"""
    :class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of an `EncoderDecoderModel`.
    It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig`
    and can be used to control the model outputs.
    See the documentation for :class:`~transformers.PretrainedConfig` for more information.

    Args:
        kwargs (`optional`):
            Remaining dictionary of keyword arguments. Notably:
                encoder (:class:`PretrainedConfig`, optional, defaults to `None`):
                    An instance of a configuration object that defines the encoder config.
                decoder (:class:`PretrainedConfig`, optional, defaults to `None`):
                    An instance of a configuration object that defines the decoder config.

    Example::

        >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

        >>> # Initializing a BERT bert-base-uncased style configuration
        >>> config_encoder = BertConfig()
        >>> config_decoder = BertConfig()

        >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

        >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
        >>> model = EncoderDecoderModel(config=config)

        >>> # Accessing the model configuration
        >>> config_encoder = model.config.encoder
        >>> config_decoder = model.config.decoder

        >>> # set decoder config to causal lm
        >>> config_decoder.is_decoder = True
        >>> config_decoder.add_cross_attention = True

        >>> # Saving the model, including its configuration
        >>> model.save_pretrained('my-model')

        >>> # loading model and config from pretrained folder
        >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
        >>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
    """

    model_type = "encoder_decoder"
...
@@ -30,121 +30,120 @@ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class FlaubertConfig(XLMConfig):
    """
    Configuration class to store the configuration of a `FlaubertModel`.
    This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
    It is used to instantiate an XLM model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to apply the layer normalization before or after the feed forward layer following the
            attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
        layerdrop (:obj:`float`, `optional`, defaults to 0.0):
            Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
            with Structured Dropout. ICLR 2020)
        vocab_size (:obj:`int`, optional, defaults to 30145):
            Vocabulary size of the Flaubert model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
        emb_dim (:obj:`int`, optional, defaults to 2048):
            Dimensionality of the encoder layers and the pooler layer.
        n_layer (:obj:`int`, optional, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (:obj:`int`, optional, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        dropout (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for the attention mechanism
        gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
            The non-linear activation function (function or string) in the
            encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
        sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
            Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
        causal (:obj:`boolean`, optional, defaults to :obj:`False`):
            Set this to `True` for the model to behave in a causal manner.
            Causal models use a triangular attention mask in order to only attend to the left-side context instead
            of a bidirectional context.
        asm (:obj:`boolean`, optional, defaults to :obj:`False`):
            Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
            layer.
        n_langs (:obj:`int`, optional, defaults to 1):
            The number of languages the model handles. Set to 1 for monolingual models.
        use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
            Whether to use language embeddings. Some models use additional language embeddings, see
            `the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
            for information on how to use them.
        max_position_embeddings (:obj:`int`, optional, defaults to 512):
            The maximum sequence length that this model might
            ever be used with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
            The standard deviation of the truncated_normal_initializer for
            initializing the embedding matrices.
        init_std (:obj:`int`, optional, defaults to 50257):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices except the embedding matrices.
        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        bos_index (:obj:`int`, optional, defaults to 0):
            The index of the beginning of sentence token in the vocabulary.
        eos_index (:obj:`int`, optional, defaults to 1):
            The index of the end of sentence token in the vocabulary.
        pad_index (:obj:`int`, optional, defaults to 2):
            The index of the padding token in the vocabulary.
        unk_index (:obj:`int`, optional, defaults to 3):
            The index of the unknown token in the vocabulary.
        mask_index (:obj:`int`, optional, defaults to 5):
            The index of the masking token in the vocabulary.
        is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
            Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
        summary_type (:obj:`string`, optional, defaults to "first"):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.XLMForSequenceClassification`.
            Is one of the following options:
                - 'last' => take the last token hidden state (like XLNet)
                - 'first' => take the first token hidden state (like Bert)
                - 'mean' => take the mean of all tokens hidden states
                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
                - 'attn' => Not implemented now, use multi-head attention
        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.XLMForSequenceClassification`.
            Add a projection after the vector extraction
        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation. 'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used in for the multiple choice head in Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`. :class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_first_dropout (:obj:`float`, optional, defaults to 0.1): summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used in for the multiple choice head in Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`. :class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation Add a dropout before the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5): start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet. Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5): end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet. Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0): mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context. Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1): lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating The ID of the language used by the model. This parameter is used when generating
text in a given language. text in a given language.
""" """
model_type = "flaubert" model_type = "flaubert"
def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs): def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs):
"""Constructs FlaubertConfig. """Constructs FlaubertConfig."""
"""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs) super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
self.layerdrop = layerdrop self.layerdrop = layerdrop
self.pre_norm = pre_norm self.pre_norm = pre_norm
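As a quick illustration of how the options above fit together, here is a minimal sketch (not part of the diff) that assumes a transformers release from around this commit, where FlaubertConfig and FlaubertModel are importable and accept the keyword arguments documented above; the specific values are illustrative, not defaults.

# Sketch: a small causal Flaubert-style configuration.
from transformers import FlaubertConfig, FlaubertModel

config = FlaubertConfig(
    gelu_activation=True,         # use "gelu" instead of "relu" in the encoder and pooler
    sinusoidal_embeddings=False,  # keep absolute (learned) position embeddings
    causal=True,                  # triangular mask: attend to the left-side context only
    n_langs=1,                    # monolingual model
    use_lang_emb=False,           # so no additional language embeddings are needed
    layerdrop=0.1,                # constructor argument shown in the diff above
    pre_norm=True,
)
model = FlaubertModel(config)     # randomly initialized with this architecture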
...@@ -32,84 +32,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class GPT2Config(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 50257):
Vocabulary size of the GPT-2 model. Defines the number of different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
n_positions (:obj:`int`, optional, defaults to 1024):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 1024):
Dimensionality of the causal mask (usually the same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
n_inner (:obj:`int`, optional, defaults to None):
Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd.
activation_function (:obj:`str`, optional, defaults to 'gelu'):
Activation function, selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => not implemented for now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a projection after the vector extraction.
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
'tanh' => add a tanh activation to the output, other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
If :obj:`True`, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a dropout before the projection and activation.
Example::
>>> from transformers import GPT2Model, GPT2Config
>>> # Initializing a GPT2 configuration
>>> configuration = GPT2Config()
>>> # Initializing a model from the configuration
>>> model = GPT2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "gpt2"
...
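Beyond the default example in the docstring, a reduced configuration makes the n_inner and activation_function arguments concrete; this is a minimal sketch using only arguments listed above, with illustrative (non-default) sizes.

# Sketch: a reduced GPT-2 configuration. Per the docstring, leaving n_inner
# as None makes the feed-forward width default to 4 * n_embd.
from transformers import GPT2Config, GPT2Model

config = GPT2Config(
    n_embd=256,                      # hidden size (must be divisible by n_head)
    n_layer=6,
    n_head=8,
    n_inner=None,                    # feed-forward width -> 4 * 256 = 1024
    activation_function="gelu_new",  # one of "relu", "swish", "gelu", "tanh", "gelu_new"
)
model = GPT2Model(config)
assert model.config.n_embd == 256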
...@@ -33,32 +33,32 @@ LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class LongformerConfig(RobertaConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`.
It is used to instantiate a Longformer model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the RoBERTa `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length of 4,096.
The :class:`~transformers.LongformerConfig` class directly inherits from :class:`~transformers.RobertaConfig`.
It reuses the same defaults. Please check the parent class for more information.
Args:
attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
Size of an attention window around each token. If an :obj:`int`, use the same size for all layers.
To specify a different window size for each layer, use a :obj:`List[int]` where
``len(attention_window) == num_hidden_layers``.
Example::
>>> from transformers import LongformerConfig, LongformerModel
>>> # Initializing a Longformer configuration
>>> configuration = LongformerConfig()
>>> # Initializing a model from the configuration
>>> model = LongformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "longformer"
...
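To make the per-layer attention_window option concrete, here is a minimal sketch; it assumes a transformers release from this era and roberta-base-like defaults (12 hidden layers), with the window sizes chosen purely for illustration.

# Sketch: per-layer attention windows. The docstring above requires
# len(attention_window) == num_hidden_layers when a list is given.
from transformers import LongformerConfig, LongformerModel

config = LongformerConfig(
    attention_window=[64] * 6 + [256] * 6,  # narrower windows low, wider windows high
    num_hidden_layers=12,
)
assert len(config.attention_window) == config.num_hidden_layers
model = LongformerModel(config)  # randomly initialized with this architecture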
...@@ -25,79 +25,79 @@ MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class MobileBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel`.
It is used to instantiate a MobileBERT model according to the specified arguments, defining the model
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the MobileBERT model. Defines the number of different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.MobileBertModel`.
hidden_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.MobileBertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
pad_token_id (:obj:`int`, optional, defaults to 0):
The ID of the token in the word embedding to use as padding.
embedding_size (:obj:`int`, optional, defaults to 128):
The dimension of the word embedding vectors.
trigram_input (:obj:`bool`, optional, defaults to :obj:`True`):
Whether to use a convolution of trigrams as input.
use_bottleneck (:obj:`bool`, optional, defaults to :obj:`True`):
Whether to use the bottleneck in BERT.
intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
Size of the bottleneck layer output.
use_bottleneck_attention (:obj:`bool`, optional, defaults to :obj:`False`):
Whether to use attention inputs from the bottleneck transformation.
key_query_shared_bottleneck (:obj:`bool`, optional, defaults to :obj:`True`):
Whether to use the same linear transformation for query and key in the bottleneck.
num_feedforward_networks (:obj:`int`, optional, defaults to 4):
Number of FFNs in a block.
normalization_type (:obj:`str`, optional, defaults to "no_norm"):
The normalization type in MobileBERT.
Example::
>>> from transformers import MobileBertModel, MobileBertConfig
>>> # Initializing a MobileBERT configuration
>>> configuration = MobileBertConfig()
>>> # Initializing a model from the configuration above
>>> model = MobileBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "mobilebert"
...
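The bottleneck-related arguments are what distinguish MobileBERT from a plain BERT config, so a minimal sketch of them may help; it uses only arguments documented above (the values shown are the documented defaults, spelled out explicitly) and assumes a transformers release from this era.

# Sketch: the bottleneck-related knobs from the MobileBertConfig docstring above.
from transformers import MobileBertConfig, MobileBertModel

config = MobileBertConfig(
    use_bottleneck=True,               # project hidden states down inside each block
    intra_bottleneck_size=128,         # width of the bottleneck output
    key_query_shared_bottleneck=True,  # share the query/key bottleneck transformation
    num_feedforward_networks=4,        # stacked FFNs per block
    normalization_type="no_norm",      # MobileBERT's cheap replacement for LayerNorm
)
model = MobileBertModel(config)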
...@@ -28,84 +28,84 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class OpenAIGPTConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel`.
It is used to instantiate a GPT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 40478):
Vocabulary size of the GPT model. Defines the number of different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
n_positions (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 512):
Dimensionality of the causal mask (usually the same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether special tokens should be predicted when the model has a language modeling head.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => not implemented for now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a projection after the vector extraction.
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
'tanh' => add a tanh activation to the output, other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
If :obj:`True`, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a dropout before the projection and activation.
Example::
>>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
>>> # Initializing a GPT configuration
>>> configuration = OpenAIGPTConfig()
>>> # Initializing a model from the configuration
>>> model = OpenAIGPTModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "openai-gpt"
...
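Since the summary_* arguments recur across these configs, one sketch of how they shape the multiple-choice head may be useful; it is a minimal illustration under the assumption that a transformers release from this era exposes OpenAIGPTDoubleHeadsModel and that num_labels can be passed through the config kwargs, and the values are illustrative rather than defaults.

# Sketch: summary_* arguments driving the multiple-choice head of
# OpenAIGPTDoubleHeadsModel (argument names taken from the docstring above).
from transformers import OpenAIGPTConfig, OpenAIGPTDoubleHeadsModel

config = OpenAIGPTConfig(
    summary_type="cls_index",     # summarize at a supplied classification-token position
    summary_use_proj=True,        # add a linear projection after extraction
    summary_activation="tanh",    # tanh on the projected summary, None for no activation
    summary_proj_to_labels=True,  # project to num_labels instead of hidden_size
    summary_first_dropout=0.1,    # dropout applied before the projection
    num_labels=1,                 # one score per candidate answer (assumed kwarg)
)
model = OpenAIGPTDoubleHeadsModel(config)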