Unverified commit 4430b912 authored by Yih-Dar, committed by GitHub

clean up unused `classifier_dropout` in config (#20596)


Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent eefae413
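The change is the same across all six configs: `classifier_dropout` was accepted and stored by the config but never read by the corresponding modeling code. A minimal sketch (not part of this commit) of why the removal stays backward compatible, assuming the usual `PretrainedConfig` handling of extra keyword arguments, where unknown keys in a saved config are still absorbed via `**kwargs`:

```python
# Minimal sketch, assuming standard `PretrainedConfig` kwarg handling:
# an old config that still carries `classifier_dropout` loads fine; the
# value is simply stored as an extra attribute that nothing reads.
from transformers import BlenderbotConfig

config = BlenderbotConfig(classifier_dropout=0.2)  # no longer a declared parameter
print(getattr(config, "classifier_dropout", None))  # absorbed via **kwargs; unused by the model
```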
@@ -71,8 +71,6 @@ class BlenderbotConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 128):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -131,7 +129,6 @@ class BlenderbotConfig(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=1,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=0,
         bos_token_id=1,
@@ -156,7 +153,6 @@ class BlenderbotConfig(PretrainedConfig):
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
...
@@ -71,8 +71,6 @@ class BlenderbotSmallConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -131,7 +129,6 @@ class BlenderbotSmallConfig(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=1,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=0,
         bos_token_id=1,
@@ -155,7 +152,6 @@ class BlenderbotSmallConfig(PretrainedConfig):
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
...
@@ -87,8 +87,6 @@ class ChineseCLIPTextConfig(PretrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
-        classifier_dropout (`float`, *optional*):
-            The dropout ratio for the classification head.

     Example:

@@ -124,7 +122,6 @@ class ChineseCLIPTextConfig(PretrainedConfig):
         pad_token_id=0,
         position_embedding_type="absolute",
         use_cache=True,
-        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -144,7 +141,6 @@ class ChineseCLIPTextConfig(PretrainedConfig):
         self.layer_norm_eps = layer_norm_eps
         self.position_embedding_type = position_embedding_type
         self.use_cache = use_cache
-        self.classifier_dropout = classifier_dropout

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
...
@@ -69,8 +69,6 @@ class MarianConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -130,7 +128,6 @@ class MarianConfig(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=58100,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=58100,
         eos_token_id=0,
@@ -155,7 +152,6 @@ class MarianConfig(PretrainedConfig):
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
...
@@ -64,8 +64,6 @@ class PegasusConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -124,7 +122,6 @@ class PegasusConfig(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=0,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=0,
         eos_token_id=1,
@@ -147,7 +144,6 @@ class PegasusConfig(PretrainedConfig):
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
...
@@ -60,8 +60,6 @@ class Speech2Text2Config(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
             https://arxiv.org/abs/1909.11556>`__ for more details.
@@ -109,7 +107,6 @@ class Speech2Text2Config(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=2,
-        classifier_dropout=0.0,
         scale_embedding=True,
         pad_token_id=1,
         bos_token_id=0,
@@ -129,7 +126,6 @@ class Speech2Text2Config(PretrainedConfig):
         self.activation_function = activation_function
         self.init_std = init_std
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = decoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
...
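One way to double-check that an attribute like this is dead code is to scan the model's modeling files for references to it. A hypothetical helper sketch (the function name is an assumption, not part of the commit; the directory layout assumes a local checkout of the transformers repo):

```python
from pathlib import Path

def find_references(repo_root: str, model: str, name: str) -> list:
    """Return every line in the model's modeling files that mentions `name`."""
    hits = []
    for path in Path(repo_root, "src/transformers/models", model).glob("modeling_*.py"):
        for lineno, line in enumerate(path.read_text().splitlines(), start=1):
            if name in line:
                hits.append(f"{path.name}:{lineno}: {line.strip()}")
    return hits

# If this prints an empty list, no modeling code reads the attribute and it
# is safe to drop from the config.
print(find_references(".", "blenderbot", "classifier_dropout"))
```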