Unverified Commit 0c1c42c1 authored by Philip May, committed by GitHub

add `classifier_dropout` to classification heads (#12794)



* add classifier_dropout to Electra

* no type annotations yet
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* add classifier_dropout to Electra

* add classifier_dropout to Electra ForTokenClass.

* add classifier_dropout to bert

* add classifier_dropout to roberta

* add classifier_dropout to big_bird

* add classifier_dropout to mobilebert

* empty commit to trigger CI

* add classifier_dropout to reformer

* add classifier_dropout to ConvBERT

* add classifier_dropout to Albert

* add classifier_dropout to Albert
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 9ff672fc
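
The gist of the change, as a hedged usage sketch (my example, not part of the commit): the BERT-family configs touched below gain a `classifier_dropout` argument that defaults to `None`, and each classification head falls back to the model-wide `hidden_dropout_prob` when it is unset (ALBERT keeps its pre-existing `classifier_dropout_prob`). Assuming a transformers build that includes this commit:

```python
from transformers import BertConfig, BertForSequenceClassification

# Unset (the default): the classification head keeps using hidden_dropout_prob,
# so configs created before this change behave exactly as they did.
config = BertConfig(num_labels=3)
model = BertForSequenceClassification(config)
print(config.classifier_dropout)  # None
print(model.dropout.p)            # 0.1 == config.hidden_dropout_prob

# Set explicitly: only the classification head's dropout changes,
# the encoder layers still use hidden_dropout_prob.
config = BertConfig(num_labels=3, classifier_dropout=0.2)
model = BertForSequenceClassification(config)
print(model.dropout.p)            # 0.2
```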
@@ -1088,7 +1088,12 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
         self.num_labels = config.num_labels

         self.albert = AlbertModel(config, add_pooling_layer=False)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout_prob = (
+            config.classifier_dropout_prob
+            if config.classifier_dropout_prob is not None
+            else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

         self.init_weights()
@@ -1199,7 +1199,12 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
         self.num_labels = config.num_labels

         self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        classifier_dropout_prob = (
+            config.classifier_dropout_prob
+            if config.classifier_dropout_prob is not None
+            else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout_prob)
         self.classifier = tf.keras.layers.Dense(
             units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
@@ -104,6 +104,8 @@ class BertConfig(PretrainedConfig):
         use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if ``config.is_decoder=True``.
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.

     Examples::

@@ -138,6 +140,7 @@ class BertConfig(PretrainedConfig):
         gradient_checkpointing=False,
         position_embedding_type="absolute",
         use_cache=True,
+        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -157,6 +160,7 @@ class BertConfig(PretrainedConfig):
         self.gradient_checkpointing = gradient_checkpointing
         self.position_embedding_type = position_embedding_type
         self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout


 class BertOnnxConfig(OnnxConfig):
@@ -1486,7 +1486,10 @@ class BertForSequenceClassification(BertPreTrainedModel):
         self.config = config

         self.bert = BertModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

         self.init_weights()
@@ -1677,7 +1680,10 @@ class BertForTokenClassification(BertPreTrainedModel):
         self.num_labels = config.num_labels

         self.bert = BertModel(config, add_pooling_layer=False)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

         self.init_weights()
@@ -915,7 +915,12 @@ class FlaxBertForSequenceClassificationModule(nn.Module):

     def setup(self):
         self.bert = FlaxBertModule(config=self.config, dtype=self.dtype)
-        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(rate=classifier_dropout)
         self.classifier = nn.Dense(
             self.config.num_labels,
             dtype=self.dtype,
@@ -1057,7 +1062,12 @@ class FlaxBertForTokenClassificationModule(nn.Module):

     def setup(self):
         self.bert = FlaxBertModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
-        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(rate=classifier_dropout)
         self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)

     def __call__(
@@ -1386,7 +1386,10 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
         self.num_labels = config.num_labels

         self.bert = TFBertMainLayer(config, name="bert")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout)
         self.classifier = tf.keras.layers.Dense(
             units=config.num_labels,
             kernel_initializer=get_initializer(config.initializer_range),
@@ -1652,7 +1655,10 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
         self.num_labels = config.num_labels

         self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout)
         self.classifier = tf.keras.layers.Dense(
             units=config.num_labels,
             kernel_initializer=get_initializer(config.initializer_range),
@@ -84,6 +84,8 @@ class BigBirdConfig(PretrainedConfig):
             "block_sparse"`.
         gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
             If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.

     Example::

@@ -126,6 +128,7 @@ class BigBirdConfig(PretrainedConfig):
         block_size=64,
         num_random_blocks=3,
         gradient_checkpointing=False,
+        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(
@@ -157,3 +160,4 @@ class BigBirdConfig(PretrainedConfig):
         self.use_bias = use_bias
         self.block_size = block_size
         self.num_random_blocks = num_random_blocks
+        self.classifier_dropout = classifier_dropout
@@ -2605,7 +2605,10 @@ class BigBirdClassificationHead(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

         self.config = config
@@ -2821,7 +2824,10 @@ class BigBirdForTokenClassification(BigBirdPreTrainedModel):
         self.num_labels = config.num_labels

         self.bert = BigBirdModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

         self.init_weights()
@@ -1654,7 +1654,12 @@ class FlaxBigBirdClassificationHead(nn.Module):

     def setup(self):
         self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype)
-        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.out_proj = nn.Dense(self.config.num_labels, dtype=self.dtype)

     def __call__(self, features, deterministic=True):
@@ -1831,7 +1836,12 @@ class FlaxBigBirdForTokenClassificationModule(nn.Module):

     def setup(self):
         self.bert = FlaxBigBirdModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
-        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(rate=classifier_dropout)
         self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)

     def __call__(
@@ -73,7 +73,8 @@ class ConvBertConfig(PretrainedConfig):
             The number of groups for grouped linear layers for ConvBert model
         conv_kernel_size (:obj:`int`, `optional`, defaults to 9):
             The size of the convolutional kernel.
-
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.
     Example::

         >>> from transformers import ConvBertModel, ConvBertConfig
@@ -108,6 +109,7 @@ class ConvBertConfig(PretrainedConfig):
         head_ratio=2,
         conv_kernel_size=9,
         num_groups=1,
+        classifier_dropout=None,
         **kwargs,
     ):
         super().__init__(
@@ -134,3 +136,4 @@ class ConvBertConfig(PretrainedConfig):
         self.head_ratio = head_ratio
         self.conv_kernel_size = conv_kernel_size
         self.num_groups = num_groups
+        self.classifier_dropout = classifier_dropout
@@ -936,7 +936,10 @@ class ConvBertClassificationHead(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

         self.config = config
@@ -1152,7 +1155,10 @@ class ConvBertForTokenClassification(ConvBertPreTrainedModel):
         self.num_labels = config.num_labels

         self.convbert = ConvBertModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

         self.init_weights()
@@ -970,7 +970,10 @@ class TFConvBertClassificationHead(tf.keras.layers.Layer):
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
         self.out_proj = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
         )
@@ -1240,7 +1243,10 @@ class TFConvBertForTokenClassification(TFConvBertPreTrainedModel, TFTokenClassif
         self.num_labels = config.num_labels

         self.convbert = TFConvBertMainLayer(config, name="convbert")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
@@ -104,6 +104,8 @@ class ElectraConfig(PretrainedConfig):
             <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
             `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
             <https://arxiv.org/abs/2009.13658>`__.
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.

     Examples::

@@ -141,6 +143,7 @@ class ElectraConfig(PretrainedConfig):
         summary_last_dropout=0.1,
         pad_token_id=0,
         position_embedding_type="absolute",
+        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -164,3 +167,4 @@ class ElectraConfig(PretrainedConfig):
         self.summary_activation = summary_activation
         self.summary_last_dropout = summary_last_dropout
         self.position_embedding_type = position_embedding_type
+        self.classifier_dropout = classifier_dropout
@@ -900,7 +900,10 @@ class ElectraClassificationHead(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

     def forward(self, features, **kwargs):
@@ -1200,7 +1203,10 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
         super().__init__(config)

         self.electra = ElectraModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

         self.init_weights()
@@ -783,7 +783,12 @@ class FlaxElectraForTokenClassificationModule(nn.Module):

     def setup(self):
         self.electra = FlaxElectraModule(config=self.config, dtype=self.dtype)
-        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Dense(self.config.num_labels)

     def __call__(
@@ -1069,7 +1074,12 @@ class FlaxElectraClassificationHead(nn.Module):

     def setup(self):
         self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype)
-        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.out_proj = nn.Dense(self.config.num_labels, dtype=self.dtype)

     def __call__(self, hidden_states, deterministic: bool = True):
@@ -1039,7 +1039,12 @@ class TFElectraClassificationHead(tf.keras.layers.Layer):
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout
+            if config.classifier_dropout is not None
+            else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
         self.out_proj = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
         )
@@ -1309,7 +1314,10 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
         super().__init__(config, **kwargs)

         self.electra = TFElectraMainLayer(config, name="electra")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
@@ -84,6 +84,8 @@ class MobileBertConfig(PretrainedConfig):
             Number of FFNs in a block.
         normalization_type (:obj:`str`, `optional`, defaults to :obj:`"no_norm"`):
             The normalization type in MobileBERT.
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.

     Examples::

@@ -128,6 +130,7 @@ class MobileBertConfig(PretrainedConfig):
         num_feedforward_networks=4,
         normalization_type="no_norm",
         classifier_activation=True,
+        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -158,3 +161,5 @@ class MobileBertConfig(PretrainedConfig):
             self.true_hidden_size = intra_bottleneck_size
         else:
             self.true_hidden_size = hidden_size
+
+        self.classifier_dropout = classifier_dropout
@@ -1212,7 +1212,10 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
         self.config = config

         self.mobilebert = MobileBertModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

         self.init_weights()
@@ -1510,7 +1513,10 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
         self.num_labels = config.num_labels

         self.mobilebert = MobileBertModel(config, add_pooling_layer=False)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

         self.init_weights()
@@ -1339,7 +1339,10 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
         self.num_labels = config.num_labels

         self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
@@ -1730,7 +1733,10 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
         self.num_labels = config.num_labels

         self.mobilebert = TFMobileBertMainLayer(config, add_pooling_layer=False, name="mobilebert")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = tf.keras.layers.Dropout(classifier_dropout)
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
@@ -140,6 +140,8 @@ class ReformerConfig(PretrainedConfig):
             Whether to tie input and output embeddings.
         use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
+        classifier_dropout (:obj:`float`, `optional`):
+            The dropout ratio for the classification head.

     Examples::

@@ -191,6 +193,7 @@ class ReformerConfig(PretrainedConfig):
         vocab_size=320,
         tie_word_embeddings=False,
         use_cache=True,
+        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(
@@ -230,3 +233,4 @@ class ReformerConfig(PretrainedConfig):
         self.chunk_size_lm_head = chunk_size_lm_head
         self.attn_layers = attn_layers
         self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
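
The same fallback applies to the heads that live inside a dedicated classification-head module; a quick check (again my own sketch under the same post-merge assumption, not library documentation) against one of the classes changed above:

```python
from transformers import ElectraConfig, ElectraForSequenceClassification

# ElectraClassificationHead picks classifier_dropout when set, otherwise hidden_dropout_prob.
cfg = ElectraConfig()
print(cfg.classifier_dropout)                 # None
model = ElectraForSequenceClassification(cfg)
print(model.classifier.dropout.p)             # 0.1 == cfg.hidden_dropout_prob

cfg = ElectraConfig(classifier_dropout=0.3)
model = ElectraForSequenceClassification(cfg)
print(model.classifier.dropout.p)             # 0.3
```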