Unverified commit 63424273, authored by Yih-Dar, committed by GitHub

Remove more unused attributes in config classes (#21327)



* remove unused classifier_dropout

* remove unused dropout

* remove unused pooler_fn

* remove unnecessary is_encoder_decoder

* remove unnecessary drop_rate

* remove unused classifier_dropout

* remove unused classifier_dropout

* remove unused dropout

* remove unused dropout

* remove unused summary_* attributes

* remove unused tie_word_embeddings

* remove unused summary_* attributes

* fix

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent da2a4d95
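The pattern in every file below is the same: a config attribute that no modeling code ever reads is dropped from the class docstring, the `__init__` signature, and the corresponding `self.<attr> = <attr>` assignment. Older serialized configs that still contain one of these keys should keep loading, because `PretrainedConfig.__init__` absorbs unknown keyword arguments and attaches them as plain attributes. A minimal sketch of that expectation (the `**kwargs` fallback behaviour is an assumption about the base class, not something this diff shows):

# Sketch only: illustrates the effect of removing an unused attribute such as
# CLIPVisionConfig.dropout, assuming PretrainedConfig still sets leftover
# kwargs as plain attributes on the instance.
from transformers import CLIPVisionConfig

config = CLIPVisionConfig()
print(hasattr(config, "dropout"))            # expected: False after this change

# A legacy config that still carries the key is not rejected; the value is
# absorbed via **kwargs and attached to the instance, it is just never read.
legacy = CLIPVisionConfig(dropout=0.0)
print(getattr(legacy, "dropout", "absent"))  # expected: 0.0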
@@ -77,12 +77,8 @@ class AltCLIPTextConfig(PretrainedConfig):
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
- classifier_dropout (`float`, *optional*):
- The dropout ratio for the classification head.
project_dim (`int`, *optional*, defaults to 768):
The dimentions of the teacher model before the mapping layer.
- pooler_fn (`str`, *optional*, defaults to `"cls"`):
- Type of pooler we use. We take the first token as pooled output.
Examples:
@@ -120,9 +116,7 @@ class AltCLIPTextConfig(PretrainedConfig):
eos_token_id=2,
position_embedding_type="absolute",
use_cache=True,
- classifier_dropout=None,
project_dim=768,
- pooler_fn="cls",
**kwargs
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -142,9 +136,7 @@ class AltCLIPTextConfig(PretrainedConfig):
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
- self.classifier_dropout = classifier_dropout
self.project_dim = project_dim
- self.pooler_fn = pooler_fn
class AltCLIPVisionConfig(PretrainedConfig):
@@ -176,8 +168,6 @@ class AltCLIPVisionConfig(PretrainedConfig):
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
- dropout (`float`, *optional*, defaults to 0.0):
- The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
@@ -215,7 +205,6 @@ class AltCLIPVisionConfig(PretrainedConfig):
patch_size=32,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
- dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
@@ -226,7 +215,6 @@ class AltCLIPVisionConfig(PretrainedConfig):
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
- self.dropout = dropout
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
...
@@ -68,8 +68,6 @@ class BioGptConfig(PretrainedConfig):
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
- is_encoder_decoder (`bool`, *optional*, defaults to `False`):
- Whether this is an encoder/decoder model.
layerdrop (`float`, *optional*, defaults to 0.0):
Please refer to the paper about LayerDrop: https://arxiv.org/abs/1909.11556 for further details
activation_dropout (`float`, *optional*, defaults to 0.0):
@@ -111,7 +109,6 @@ class BioGptConfig(PretrainedConfig):
layer_norm_eps=1e-12,
scale_embedding=True,
use_cache=True,
- is_encoder_decoder=False,
layerdrop=0.0,
activation_dropout=0.0,
pad_token_id=1,
@@ -132,7 +129,6 @@ class BioGptConfig(PretrainedConfig):
self.layer_norm_eps = layer_norm_eps
self.scale_embedding = scale_embedding
self.use_cache = use_cache
- self.is_encoder_decoder = is_encoder_decoder
self.layerdrop = layerdrop
self.activation_dropout = activation_dropout
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
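The `is_encoder_decoder` removals here (and in `BridgeTowerConfig` below) drop a redundant override rather than a used attribute: `PretrainedConfig` already defines `is_encoder_decoder` with a default of `False`. A minimal sketch of that assumption:

# Sketch, assuming PretrainedConfig still pops is_encoder_decoder from kwargs
# and defaults it to False; the attribute therefore survives the removal here.
from transformers import BioGptConfig

config = BioGptConfig()
print(config.is_encoder_decoder)  # expected: False, now supplied by the base class

# Passing the flag explicitly keeps working, since it is routed through **kwargs.
print(BioGptConfig(is_encoder_decoder=False).is_encoder_decoder)  # expected: False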
@@ -77,10 +77,10 @@ class BlipTextConfig(PretrainedConfig):
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- dropout (`float`, *optional*, defaults to 0.0):
- The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float``, *optional*, defaults to 1):
@@ -211,8 +211,6 @@ class BlipVisionConfig(PretrainedConfig):
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
- dropout (`float`, *optional*, defaults to 0.0):
- The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
@@ -250,7 +248,6 @@ class BlipVisionConfig(PretrainedConfig):
patch_size=16,
hidden_act="gelu",
layer_norm_eps=1e-5,
- dropout=0.0,
attention_dropout=0.0,
initializer_range=1e-10,
initializer_factor=1.0,
@@ -261,7 +258,6 @@ class BlipVisionConfig(PretrainedConfig):
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
- self.dropout = dropout
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
...
@@ -260,8 +260,6 @@ class BridgeTowerConfig(PretrainedConfig):
Args:
share_cross_modal_transformer_layers (`bool`, *optional*, defaults to `True`):
Whether cross modal transformer layers are shared.
- drop_rate (`float`, *optional*, defaults to 0.1):
- Drop out probability.
head_hidden_scale (`int`, *optional*, defaults to 2):
Scale of hidden layers head.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
@@ -271,8 +269,6 @@ class BridgeTowerConfig(PretrainedConfig):
initializer_factor (`float``, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
- is_encoder_decoder (`bool`, *optional*, defaults to `False`):
- Whether this is an encoder/decoder model
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
share_link_tower_layers (`bool`, *optional*, defaults to `False`):
@@ -311,12 +307,10 @@ class BridgeTowerConfig(PretrainedConfig):
def __init__(
self,
share_cross_modal_transformer_layers=True,
- drop_rate=0.1,
head_hidden_scale=2,
hidden_act="gelu",
hidden_size=768,
initializer_factor=1,
- is_encoder_decoder=False,
layer_norm_eps=1e-05,
share_link_tower_layers=False,
link_tower_type="add",
@@ -330,12 +324,10 @@ class BridgeTowerConfig(PretrainedConfig):
):
super().__init__(**kwargs)
self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers
- self.drop_rate = drop_rate
self.head_hidden_scale = head_hidden_scale
self.hidden_act = hidden_act
self.hidden_size = hidden_size
self.initializer_factor = initializer_factor
- self.is_encoder_decoder = is_encoder_decoder
self.layer_norm_eps = layer_norm_eps
self.share_link_tower_layers = share_link_tower_layers
self.link_tower_type = link_tower_type
...
@@ -190,8 +190,6 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
- dropout (`float`, *optional*, defaults to 0.0):
- The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
@@ -227,7 +225,6 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
patch_size=32,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
- dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
@@ -238,7 +235,6 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
- self.dropout = dropout
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
...
@@ -69,8 +69,6 @@ class CLIPTextConfig(PretrainedConfig):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- dropout (`float`, *optional*, defaults to 0.0):
- The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1):
@@ -104,7 +102,6 @@ class CLIPTextConfig(PretrainedConfig):
max_position_embeddings=77,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
- dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
@@ -119,7 +116,6 @@ class CLIPTextConfig(PretrainedConfig):
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
- self.dropout = dropout
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
@@ -175,8 +171,6 @@ class CLIPVisionConfig(PretrainedConfig):
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
- dropout (`float`, *optional*, defaults to 0.0):
- The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
@@ -214,7 +208,6 @@ class CLIPVisionConfig(PretrainedConfig):
patch_size=32,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
- dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
@@ -225,7 +218,6 @@ class CLIPVisionConfig(PretrainedConfig):
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
- self.dropout = dropout
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
...
@@ -61,8 +61,6 @@ class CLIPSegTextConfig(PretrainedConfig):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- dropout (`float`, *optional*, defaults to 0.0):
- The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float``, *optional*, defaults to 1):
@@ -95,7 +93,6 @@ class CLIPSegTextConfig(PretrainedConfig):
max_position_embeddings=77,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
- dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
@@ -109,7 +106,6 @@ class CLIPSegTextConfig(PretrainedConfig):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
- self.dropout = dropout
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
@@ -165,8 +161,6 @@ class CLIPSegVisionConfig(PretrainedConfig):
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
- dropout (`float`, *optional*, defaults to 0.0):
- The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
@@ -203,7 +197,6 @@ class CLIPSegVisionConfig(PretrainedConfig):
patch_size=32,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
- dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
@@ -213,7 +206,6 @@ class CLIPSegVisionConfig(PretrainedConfig):
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
- self.dropout = dropout
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
...
@@ -164,7 +164,6 @@ class ConditionalDetrConfig(PretrainedConfig):
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
- classifier_dropout=0.0,
scale_embedding=False,
auxiliary_loss=False,
position_embedding_type="sine",
...
@@ -96,11 +96,6 @@ class CTRLConfig(PretrainedConfig):
embd_pdrop=0.1,
layer_norm_epsilon=1e-6,
initializer_range=0.02,
- summary_type="cls_index",
- summary_use_proj=True,
- summary_activation=None,
- summary_proj_to_labels=True,
- summary_first_dropout=0.1,
use_cache=True,
**kwargs
):
@@ -115,11 +110,6 @@ class CTRLConfig(PretrainedConfig):
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
- self.summary_type = summary_type
- self.summary_use_proj = summary_use_proj
- self.summary_activation = summary_activation
- self.summary_first_dropout = summary_first_dropout
- self.summary_proj_to_labels = summary_proj_to_labels
self.use_cache = use_cache
super().__init__(**kwargs)
@@ -129,11 +129,6 @@ class DecisionTransformerConfig(PretrainedConfig):
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
- summary_type="cls_index",
- summary_use_proj=True,
- summary_activation=None,
- summary_proj_to_labels=True,
- summary_first_dropout=0.1,
scale_attn_weights=True,
use_cache=True,
bos_token_id=50256,
@@ -160,11 +155,6 @@ class DecisionTransformerConfig(PretrainedConfig):
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
- self.summary_type = summary_type
- self.summary_use_proj = summary_use_proj
- self.summary_activation = summary_activation
- self.summary_first_dropout = summary_first_dropout
- self.summary_proj_to_labels = summary_proj_to_labels
self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
...
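The `summary_*` attributes removed from `CTRLConfig`, `DecisionTransformerConfig`, and (in the GPT-Neo hunk further down) `GPTNeoConfig` are the knobs consumed by the `SequenceSummary` head that GPT-2-style configs carry for models such as `GPT2DoubleHeadsModel`; these model families do not build such a head, so the attributes were dead. A hedged sketch of that assumption:

# Sketch only; assumes neither model family instantiates a SequenceSummary head
# (unlike e.g. GPT2DoubleHeadsModel), so the summary_* knobs were never read.
from transformers import CTRLConfig, GPTNeoConfig

for config_cls in (CTRLConfig, GPTNeoConfig):
    config = config_cls()
    # After this commit the knobs no longer exist on freshly built configs.
    print(config_cls.__name__, hasattr(config, "summary_type"))  # expected: False for both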
@@ -161,7 +161,6 @@ class DetrConfig(PretrainedConfig):
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
- classifier_dropout=0.0,
scale_embedding=False,
auxiliary_loss=False,
position_embedding_type="sine",
...
@@ -81,8 +81,6 @@ class EsmConfig(PretrainedConfig):
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
- classifier_dropout (`float`, *optional*):
- The dropout ratio for the classification head.
emb_layer_norm_before (`bool`, *optional*):
Whether to apply layer normalization after embeddings but before the main stem of the network.
token_dropout (`bool`, defaults to `False`):
@@ -117,7 +115,6 @@ class EsmConfig(PretrainedConfig):
layer_norm_eps=1e-12,
position_embedding_type="absolute",
use_cache=True,
- classifier_dropout=None,
emb_layer_norm_before=None,
token_dropout=False,
is_folding_model=False,
@@ -139,7 +136,6 @@ class EsmConfig(PretrainedConfig):
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
- self.classifier_dropout = classifier_dropout
self.emb_layer_norm_before = emb_layer_norm_before
self.token_dropout = token_dropout
self.is_folding_model = is_folding_model
...
@@ -57,8 +57,6 @@ class GitVisionConfig(PretrainedConfig):
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
- dropout (`float`, *optional*, defaults to 0.0):
- The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
@@ -96,7 +94,6 @@ class GitVisionConfig(PretrainedConfig):
patch_size=16,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
- dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
@@ -107,7 +104,6 @@ class GitVisionConfig(PretrainedConfig):
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
- self.dropout = dropout
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
@@ -183,8 +179,6 @@ class GitConfig(PretrainedConfig):
with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
- classifier_dropout (`float`, *optional*):
- The dropout ratio for the classification head.
num_image_with_embedding (`int`, *optional*):
The number of temporal embeddings to add, in case the model is used for video captioning/VQA.
@@ -221,7 +215,6 @@ class GitConfig(PretrainedConfig):
pad_token_id=0,
position_embedding_type="absolute",
use_cache=True,
- classifier_dropout=None,
tie_word_embeddings=False,
bos_token_id=101,
eos_token_id=102,
@@ -248,7 +241,6 @@ class GitConfig(PretrainedConfig):
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
- self.classifier_dropout = classifier_dropout
self.tie_word_embeddings = tie_word_embeddings
self.num_image_with_embedding = num_image_with_embedding
...
@@ -113,11 +113,6 @@ class GPTNeoConfig(PretrainedConfig):
attention_dropout=0.0,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
- summary_type="cls_index",
- summary_use_proj=True,
- summary_activation=None,
- summary_proj_to_labels=True,
- summary_first_dropout=0.1,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
@@ -136,11 +131,6 @@ class GPTNeoConfig(PretrainedConfig):
self.attention_dropout = attention_dropout
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
- self.summary_type = summary_type
- self.summary_use_proj = summary_use_proj
- self.summary_activation = summary_activation
- self.summary_first_dropout = summary_first_dropout
- self.summary_proj_to_labels = summary_proj_to_labels
self.use_cache = use_cache
self.bos_token_id = bos_token_id
...
@@ -111,7 +111,6 @@ class LayoutLMConfig(PretrainedConfig):
pad_token_id=0,
position_embedding_type="absolute",
use_cache=True,
- classifier_dropout=None,
max_2d_position_embeddings=1024,
**kwargs
):
@@ -130,7 +129,6 @@ class LayoutLMConfig(PretrainedConfig):
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
- self.classifier_dropout = classifier_dropout
self.max_2d_position_embeddings = max_2d_position_embeddings
...
@@ -92,8 +92,6 @@ class LongformerConfig(PretrainedConfig):
[Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
- classifier_dropout (`float`, *optional*):
- The dropout ratio for the classification head.
attention_window (`int` or `List[int]`, *optional*, defaults to 512):
Size of an attention window around each token. If an `int`, use the same size for all layers. To specify a
different window size for each layer, use a `List[int]` where `len(attention_window) == num_hidden_layers`.
@@ -134,7 +132,6 @@ class LongformerConfig(PretrainedConfig):
initializer_range: float = 0.02,
layer_norm_eps: float = 1e-12,
position_embedding_type: str = "absolute",
- classifier_dropout: float = None,
onnx_export: bool = False,
**kwargs
):
@@ -158,7 +155,6 @@ class LongformerConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
- self.classifier_dropout = classifier_dropout
self.onnx_export = onnx_export
...
@@ -71,8 +71,6 @@ class OwlViTTextConfig(PretrainedConfig):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
- dropout (`float`, *optional*, defaults to 0.0):
- The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1):
@@ -105,7 +103,6 @@ class OwlViTTextConfig(PretrainedConfig):
max_position_embeddings=16,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
- dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
@@ -124,7 +121,6 @@ class OwlViTTextConfig(PretrainedConfig):
self.max_position_embeddings = max_position_embeddings
self.hidden_act = hidden_act
self.layer_norm_eps = layer_norm_eps
- self.dropout = dropout
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
@@ -177,8 +173,6 @@ class OwlViTVisionConfig(PretrainedConfig):
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
- dropout (`float`, *optional*, defaults to 0.0):
- The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
@@ -215,7 +209,6 @@ class OwlViTVisionConfig(PretrainedConfig):
patch_size=32,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
- dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
@@ -232,7 +225,6 @@ class OwlViTVisionConfig(PretrainedConfig):
self.patch_size = patch_size
self.hidden_act = hidden_act
self.layer_norm_eps = layer_norm_eps
- self.dropout = dropout
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
...
@@ -65,8 +65,6 @@ class PegasusXConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (`float`, *optional*, defaults to 0.0):
- The dropout ratio for classifier.
max_position_embeddings (`int`, *optional*, defaults to 16384):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
@@ -130,7 +128,6 @@ class PegasusXConfig(PretrainedConfig):
activation_dropout=0.0,
init_std=0.02,
decoder_start_token_id=0,
- classifier_dropout=0.0,
scale_embedding=True,
pad_token_id=0,
eos_token_id=1,
@@ -156,7 +153,6 @@ class PegasusXConfig(PretrainedConfig):
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
- self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
...
@@ -66,8 +66,6 @@ class Speech2TextConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (`float`, *optional*, defaults to 0.0):
- The dropout ratio for classifier.
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
@@ -135,7 +133,6 @@ class Speech2TextConfig(PretrainedConfig):
activation_dropout=0.0,
init_std=0.02,
decoder_start_token_id=2,
- classifier_dropout=0.0,
scale_embedding=True,
pad_token_id=1,
bos_token_id=0,
@@ -164,7 +161,6 @@ class Speech2TextConfig(PretrainedConfig):
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
- self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
...
@@ -163,7 +163,6 @@ class TableTransformerConfig(PretrainedConfig):
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
- classifier_dropout=0.0,
scale_embedding=False,
auxiliary_loss=False,
position_embedding_type="sine",
...