chenpangpang / transformers

Unverified commit 4430b912
Authored Dec 05, 2022 by Yih-Dar; committed via GitHub on Dec 05, 2022
clean up unused `classifier_dropout` in config (#20596)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Parent: eefae413
Changes: showing 6 changed files with 0 additions and 24 deletions (+0 -24).
    src/transformers/models/blenderbot/configuration_blenderbot.py  (+0 -4)
    src/transformers/models/blenderbot_small/configuration_blenderbot_small.py  (+0 -4)
    src/transformers/models/chinese_clip/configuration_chinese_clip.py  (+0 -4)
    src/transformers/models/marian/configuration_marian.py  (+0 -4)
    src/transformers/models/pegasus/configuration_pegasus.py  (+0 -4)
    src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py  (+0 -4)
src/transformers/models/blenderbot/configuration_blenderbot.py

@@ -71,8 +71,6 @@ class BlenderbotConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 128):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -131,7 +129,6 @@ class BlenderbotConfig(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=1,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=0,
         bos_token_id=1,
@@ -156,7 +153,6 @@ class BlenderbotConfig(PretrainedConfig):
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
src/transformers/models/blenderbot_small/configuration_blenderbot_small.py

@@ -71,8 +71,6 @@ class BlenderbotSmallConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -131,7 +129,6 @@ class BlenderbotSmallConfig(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=1,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=0,
         bos_token_id=1,
@@ -155,7 +152,6 @@ class BlenderbotSmallConfig(PretrainedConfig):
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
src/transformers/models/chinese_clip/configuration_chinese_clip.py

@@ -87,8 +87,6 @@ class ChineseCLIPTextConfig(PretrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
-        classifier_dropout (`float`, *optional*):
-            The dropout ratio for the classification head.
 
     Example:
 
@@ -124,7 +122,6 @@ class ChineseCLIPTextConfig(PretrainedConfig):
         pad_token_id=0,
         position_embedding_type="absolute",
         use_cache=True,
-        classifier_dropout=None,
         **kwargs
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -144,7 +141,6 @@ class ChineseCLIPTextConfig(PretrainedConfig):
         self.layer_norm_eps = layer_norm_eps
         self.position_embedding_type = position_embedding_type
         self.use_cache = use_cache
-        self.classifier_dropout = classifier_dropout
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
src/transformers/models/marian/configuration_marian.py

@@ -69,8 +69,6 @@ class MarianConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -130,7 +128,6 @@ class MarianConfig(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=58100,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=58100,
         eos_token_id=0,
@@ -155,7 +152,6 @@ class MarianConfig(PretrainedConfig):
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
src/transformers/models/pegasus/configuration_pegasus.py

@@ -64,8 +64,6 @@ class PegasusConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -124,7 +122,6 @@ class PegasusConfig(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=0,
-        classifier_dropout=0.0,
         scale_embedding=False,
         pad_token_id=0,
         eos_token_id=1,
@@ -147,7 +144,6 @@ class PegasusConfig(PretrainedConfig):
         self.init_std = init_std
         self.encoder_layerdrop = encoder_layerdrop
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py

@@ -60,8 +60,6 @@ class Speech2Text2Config(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
             https://arxiv.org/abs/1909.11556>`__ for more details.
@@ -109,7 +107,6 @@ class Speech2Text2Config(PretrainedConfig):
         activation_dropout=0.0,
         init_std=0.02,
         decoder_start_token_id=2,
-        classifier_dropout=0.0,
         scale_embedding=True,
         pad_token_id=1,
         bos_token_id=0,
@@ -129,7 +126,6 @@ class Speech2Text2Config(PretrainedConfig):
         self.activation_function = activation_function
         self.init_std = init_std
         self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
         self.use_cache = use_cache
         self.num_hidden_layers = decoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
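One backward-compatibility note, offered as an assumption about `PretrainedConfig` rather than as something this diff guarantees: callers that still pass `classifier_dropout` explicitly should keep working, because the argument now falls through to `**kwargs` and `PretrainedConfig.__init__` appears to store unrecognized keyword arguments as plain attributes; only the silently stored default goes away.

    # Assumption: PretrainedConfig keeps unknown kwargs as plain attributes.
    from transformers import MarianConfig

    config = MarianConfig(classifier_dropout=0.1)       # still accepted via **kwargs
    print(getattr(config, "classifier_dropout", None))  # expected: 0.1, but unused by the model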