"docs/vscode:/vscode.git/clone" did not exist on "7783fa6bb3dca3aa10283bd7f382d224615e44c6"
Unverified commit 0c1c42c1 authored by Philip May, committed by GitHub

add `classifier_dropout` to classification heads (#12794)



* add classifier_dropout to Electra

* no type annotations yet
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* add classifier_dropout to Electra

* add classifier_dropout to Electra ForTokenClass.

* add classifier_dropout to bert

* add classifier_dropout to roberta

* add classifier_dropout to big_bird

* add classifier_dropout to mobilebert

* empty commit to trigger CI

* add classifier_dropout to reformer

* add classifier_dropout to ConvBERT

* add classifier_dropout to Albert

* add classifier_dropout to Albert
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 9ff672fc
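Every head touched by this commit follows the same pattern: use `config.classifier_dropout` when it is set, and fall back to `config.hidden_dropout_prob` when it is left at its default of `None`. A minimal usage sketch, assuming a transformers build that includes this change (the 0.2 rate and the label count are only illustrative):

from transformers import BertConfig, BertForSequenceClassification

# Explicit value: the classification head gets its own dropout rate.
config = BertConfig(num_labels=3, classifier_dropout=0.2)
model = BertForSequenceClassification(config)
print(model.dropout.p)  # 0.2

# Left unset (None): the head falls back to hidden_dropout_prob.
config = BertConfig(num_labels=3)
model = BertForSequenceClassification(config)
print(model.dropout.p)  # 0.1, the default hidden_dropout_prob

Because `None` reproduces the old `hidden_dropout_prob` path, existing configs without the new field keep their previous behaviour.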
@@ -1088,7 +1088,12 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
         self.num_labels = config.num_labels
         self.albert = AlbertModel(config, add_pooling_layer=False)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout_prob = (
+            config.classifier_dropout_prob
+            if config.classifier_dropout_prob is not None
+            else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
         self.init_weights()
......
@@ -1199,7 +1199,12 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificationLoss):
         self.num_labels = config.num_labels
         self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        classifier_dropout_prob = (
+            config.classifier_dropout_prob
+            if config.classifier_dropout_prob is not None
+            else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout_prob)
         self.classifier = tf.keras.layers.Dense(
             units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
......
@@ -104,6 +104,8 @@ class BertConfig(PretrainedConfig):
         use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if ``config.is_decoder=True``.
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.
     Examples::
@@ -138,6 +140,7 @@ class BertConfig(PretrainedConfig):
         gradient_checkpointing=False,
         position_embedding_type="absolute",
         use_cache=True,
+        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -157,6 +160,7 @@ class BertConfig(PretrainedConfig):
         self.gradient_checkpointing = gradient_checkpointing
         self.position_embedding_type = position_embedding_type
         self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
 class BertOnnxConfig(OnnxConfig):
......
@@ -1486,7 +1486,10 @@ class BertForSequenceClassification(BertPreTrainedModel):
         self.config = config
         self.bert = BertModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
         self.init_weights()
@@ -1677,7 +1680,10 @@ class BertForTokenClassification(BertPreTrainedModel):
         self.num_labels = config.num_labels
         self.bert = BertModel(config, add_pooling_layer=False)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
         self.init_weights()
......
@@ -915,7 +915,12 @@ class FlaxBertForSequenceClassificationModule(nn.Module):
     def setup(self):
         self.bert = FlaxBertModule(config=self.config, dtype=self.dtype)
-        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(rate=classifier_dropout)
         self.classifier = nn.Dense(
             self.config.num_labels,
             dtype=self.dtype,
@@ -1057,7 +1062,12 @@ class FlaxBertForTokenClassificationModule(nn.Module):
     def setup(self):
         self.bert = FlaxBertModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
-        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(rate=classifier_dropout)
         self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
     def __call__(
......
@@ -1386,7 +1386,10 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassificationLoss):
         self.num_labels = config.num_labels
         self.bert = TFBertMainLayer(config, name="bert")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout)
         self.classifier = tf.keras.layers.Dense(
             units=config.num_labels,
             kernel_initializer=get_initializer(config.initializer_range),
@@ -1652,7 +1655,10 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationLoss):
         self.num_labels = config.num_labels
         self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout)
         self.classifier = tf.keras.layers.Dense(
             units=config.num_labels,
             kernel_initializer=get_initializer(config.initializer_range),
......
@@ -84,6 +84,8 @@ class BigBirdConfig(PretrainedConfig):
             "block_sparse"`.
         gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
             If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.
     Example::
@@ -126,6 +128,7 @@ class BigBirdConfig(PretrainedConfig):
         block_size=64,
         num_random_blocks=3,
         gradient_checkpointing=False,
+        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(
@@ -157,3 +160,4 @@ class BigBirdConfig(PretrainedConfig):
         self.use_bias = use_bias
         self.block_size = block_size
         self.num_random_blocks = num_random_blocks
+        self.classifier_dropout = classifier_dropout
@@ -2605,7 +2605,10 @@ class BigBirdClassificationHead(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
         self.config = config
@@ -2821,7 +2824,10 @@ class BigBirdForTokenClassification(BigBirdPreTrainedModel):
         self.num_labels = config.num_labels
         self.bert = BigBirdModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
         self.init_weights()
......
@@ -1654,7 +1654,12 @@ class FlaxBigBirdClassificationHead(nn.Module):
     def setup(self):
         self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype)
-        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.out_proj = nn.Dense(self.config.num_labels, dtype=self.dtype)
     def __call__(self, features, deterministic=True):
@@ -1831,7 +1836,12 @@ class FlaxBigBirdForTokenClassificationModule(nn.Module):
     def setup(self):
         self.bert = FlaxBigBirdModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
-        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(rate=classifier_dropout)
         self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
     def __call__(
......
@@ -73,7 +73,8 @@ class ConvBertConfig(PretrainedConfig):
             The number of groups for grouped linear layers for ConvBert model
         conv_kernel_size (:obj:`int`, `optional`, defaults to 9):
             The size of the convolutional kernel.
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.
     Example::
         >>> from transformers import ConvBertModel, ConvBertConfig
@@ -108,6 +109,7 @@ class ConvBertConfig(PretrainedConfig):
         head_ratio=2,
         conv_kernel_size=9,
         num_groups=1,
+        classifier_dropout=None,
         **kwargs,
     ):
         super().__init__(
@@ -134,3 +136,4 @@ class ConvBertConfig(PretrainedConfig):
         self.head_ratio = head_ratio
         self.conv_kernel_size = conv_kernel_size
         self.num_groups = num_groups
+        self.classifier_dropout = classifier_dropout
@@ -936,7 +936,10 @@ class ConvBertClassificationHead(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
         self.config = config
@@ -1152,7 +1155,10 @@ class ConvBertForTokenClassification(ConvBertPreTrainedModel):
         self.num_labels = config.num_labels
         self.convbert = ConvBertModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
         self.init_weights()
......
@@ -970,7 +970,10 @@ class TFConvBertClassificationHead(tf.keras.layers.Layer):
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
         self.out_proj = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
         )
@@ -1240,7 +1243,10 @@ class TFConvBertForTokenClassification(TFConvBertPreTrainedModel, TFTokenClassificationLoss):
         self.num_labels = config.num_labels
         self.convbert = TFConvBertMainLayer(config, name="convbert")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
......
@@ -104,6 +104,8 @@ class ElectraConfig(PretrainedConfig):
             <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
             `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
             <https://arxiv.org/abs/2009.13658>`__.
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.
     Examples::
@@ -141,6 +143,7 @@ class ElectraConfig(PretrainedConfig):
         summary_last_dropout=0.1,
         pad_token_id=0,
         position_embedding_type="absolute",
+        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -164,3 +167,4 @@ class ElectraConfig(PretrainedConfig):
         self.summary_activation = summary_activation
         self.summary_last_dropout = summary_last_dropout
         self.position_embedding_type = position_embedding_type
+        self.classifier_dropout = classifier_dropout
@@ -900,7 +900,10 @@ class ElectraClassificationHead(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
     def forward(self, features, **kwargs):
@@ -1200,7 +1203,10 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
         super().__init__(config)
         self.electra = ElectraModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
         self.init_weights()
......
@@ -783,7 +783,12 @@ class FlaxElectraForTokenClassificationModule(nn.Module):
     def setup(self):
         self.electra = FlaxElectraModule(config=self.config, dtype=self.dtype)
-        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Dense(self.config.num_labels)
     def __call__(
@@ -1069,7 +1074,12 @@ class FlaxElectraClassificationHead(nn.Module):
     def setup(self):
         self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype)
-        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.out_proj = nn.Dense(self.config.num_labels, dtype=self.dtype)
     def __call__(self, hidden_states, deterministic: bool = True):
......
@@ -1039,7 +1039,12 @@ class TFElectraClassificationHead(tf.keras.layers.Layer):
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout
+            if config.classifier_dropout is not None
+            else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
         self.out_proj = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
         )
@@ -1309,7 +1314,10 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassificationLoss):
         super().__init__(config, **kwargs)
         self.electra = TFElectraMainLayer(config, name="electra")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
......
@@ -84,6 +84,8 @@ class MobileBertConfig(PretrainedConfig):
             Number of FFNs in a block.
         normalization_type (:obj:`str`, `optional`, defaults to :obj:`"no_norm"`):
             The normalization type in MobileBERT.
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.
     Examples::
@@ -128,6 +130,7 @@ class MobileBertConfig(PretrainedConfig):
         num_feedforward_networks=4,
         normalization_type="no_norm",
         classifier_activation=True,
+        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -158,3 +161,5 @@ class MobileBertConfig(PretrainedConfig):
             self.true_hidden_size = intra_bottleneck_size
         else:
             self.true_hidden_size = hidden_size
+        self.classifier_dropout = classifier_dropout
@@ -1212,7 +1212,10 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
         self.config = config
         self.mobilebert = MobileBertModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
         self.init_weights()
@@ -1510,7 +1513,10 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
         self.num_labels = config.num_labels
         self.mobilebert = MobileBertModel(config, add_pooling_layer=False)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
         self.init_weights()
......
@@ -1339,7 +1339,10 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSequenceClassificationLoss):
         self.num_labels = config.num_labels
         self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
@@ -1730,7 +1733,10 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenClassificationLoss):
         self.num_labels = config.num_labels
         self.mobilebert = TFMobileBertMainLayer(config, add_pooling_layer=False, name="mobilebert")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
......
@@ -140,6 +140,8 @@ class ReformerConfig(PretrainedConfig):
             Whether to tie input and output embeddings.
         use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.
     Examples::
@@ -191,6 +193,7 @@ class ReformerConfig(PretrainedConfig):
         vocab_size=320,
         tie_word_embeddings=False,
         use_cache=True,
+        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(
@@ -230,3 +233,4 @@ class ReformerConfig(PretrainedConfig):
         self.chunk_size_lm_head = chunk_size_lm_head
         self.attn_layers = attn_layers
         self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
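The option can also be set when loading a pretrained checkpoint, since `from_pretrained` forwards keyword arguments that match config attributes to the config. A sketch, again assuming this commit is included; the checkpoint name and the 0.3 rate are purely illustrative:

from transformers import ElectraForSequenceClassification

# classifier_dropout only affects the classification head; every other
# dropout in the model keeps using hidden_dropout_prob.
model = ElectraForSequenceClassification.from_pretrained(
    "google/electra-small-discriminator",
    num_labels=2,
    classifier_dropout=0.3,
)
print(model.config.classifier_dropout)  # 0.3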