Unverified Commit b47a1674 authored by Yih-Dar's avatar Yih-Dar Committed by GitHub
Browse files

Remove more unused attributes in config classes (#21543)



* Remove unused decoder_layerdrop

* Update SPECIAL_CASES_TO_ALLOW for MT5Config

* Remove unused position_embedding_init_scale

* Remove unused decoder_max_relative_position

* Use unused decoder_max_relative_position

* Remove unused init_std

* Remove unused forgotten attributes

* Remove unused patch_norm

* Remove unused max_seq_len

* Update SPECIAL_CASES_TO_ALLOW for OneFormerConfig

---------
Co-authored-by: default avatarydshieh <ydshieh@users.noreply.github.com>
parent 862e8e4f
...@@ -80,9 +80,6 @@ class DeformableDetrConfig(PretrainedConfig): ...@@ -80,9 +80,6 @@ class DeformableDetrConfig(PretrainedConfig):
encoder_layerdrop: (`float`, *optional*, defaults to 0.0): encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details. for more details.
decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
auxiliary_loss (`bool`, *optional*, defaults to `False`): auxiliary_loss (`bool`, *optional*, defaults to `False`):
Whether auxiliary decoding losses (loss at each decoder layer) are to be used. Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
position_embedding_type (`str`, *optional*, defaults to `"sine"`): position_embedding_type (`str`, *optional*, defaults to `"sine"`):
...@@ -163,7 +160,6 @@ class DeformableDetrConfig(PretrainedConfig): ...@@ -163,7 +160,6 @@ class DeformableDetrConfig(PretrainedConfig):
decoder_ffn_dim=1024, decoder_ffn_dim=1024,
decoder_attention_heads=8, decoder_attention_heads=8,
encoder_layerdrop=0.0, encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
is_encoder_decoder=True, is_encoder_decoder=True,
activation_function="relu", activation_function="relu",
d_model=256, d_model=256,
...@@ -225,7 +221,6 @@ class DeformableDetrConfig(PretrainedConfig): ...@@ -225,7 +221,6 @@ class DeformableDetrConfig(PretrainedConfig):
self.init_std = init_std self.init_std = init_std
self.init_xavier_std = init_xavier_std self.init_xavier_std = init_xavier_std
self.encoder_layerdrop = encoder_layerdrop self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.auxiliary_loss = auxiliary_loss self.auxiliary_loss = auxiliary_loss
self.position_embedding_type = position_embedding_type self.position_embedding_type = position_embedding_type
self.backbone = backbone self.backbone = backbone
......
...@@ -74,9 +74,6 @@ class DetaConfig(PretrainedConfig): ...@@ -74,9 +74,6 @@ class DetaConfig(PretrainedConfig):
encoder_layerdrop: (`float`, *optional*, defaults to 0.0): encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details. for more details.
decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
auxiliary_loss (`bool`, *optional*, defaults to `False`): auxiliary_loss (`bool`, *optional*, defaults to `False`):
Whether auxiliary decoding losses (loss at each decoder layer) are to be used. Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
position_embedding_type (`str`, *optional*, defaults to `"sine"`): position_embedding_type (`str`, *optional*, defaults to `"sine"`):
...@@ -146,7 +143,6 @@ class DetaConfig(PretrainedConfig): ...@@ -146,7 +143,6 @@ class DetaConfig(PretrainedConfig):
decoder_ffn_dim=1024, decoder_ffn_dim=1024,
decoder_attention_heads=8, decoder_attention_heads=8,
encoder_layerdrop=0.0, encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
is_encoder_decoder=True, is_encoder_decoder=True,
activation_function="relu", activation_function="relu",
d_model=256, d_model=256,
...@@ -202,7 +198,6 @@ class DetaConfig(PretrainedConfig): ...@@ -202,7 +198,6 @@ class DetaConfig(PretrainedConfig):
self.init_std = init_std self.init_std = init_std
self.init_xavier_std = init_xavier_std self.init_xavier_std = init_xavier_std
self.encoder_layerdrop = encoder_layerdrop self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.auxiliary_loss = auxiliary_loss self.auxiliary_loss = auxiliary_loss
self.position_embedding_type = position_embedding_type self.position_embedding_type = position_embedding_type
# deformable attributes # deformable attributes
......
...@@ -64,8 +64,6 @@ class DinatConfig(PretrainedConfig): ...@@ -64,8 +64,6 @@ class DinatConfig(PretrainedConfig):
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
`"selu"` and `"gelu_new"` are supported. `"selu"` and `"gelu_new"` are supported.
patch_norm (`bool`, *optional*, defaults to `True`):
Whether or not to add layer normalization after patch embedding.
initializer_range (`float`, *optional*, defaults to 0.02): initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12): layer_norm_eps (`float`, *optional*, defaults to 1e-12):
...@@ -112,7 +110,6 @@ class DinatConfig(PretrainedConfig): ...@@ -112,7 +110,6 @@ class DinatConfig(PretrainedConfig):
attention_probs_dropout_prob=0.0, attention_probs_dropout_prob=0.0,
drop_path_rate=0.1, drop_path_rate=0.1,
hidden_act="gelu", hidden_act="gelu",
patch_norm=True,
initializer_range=0.02, initializer_range=0.02,
layer_norm_eps=1e-5, layer_norm_eps=1e-5,
layer_scale_init_value=0.0, layer_scale_init_value=0.0,
...@@ -135,7 +132,6 @@ class DinatConfig(PretrainedConfig): ...@@ -135,7 +132,6 @@ class DinatConfig(PretrainedConfig):
self.attention_probs_dropout_prob = attention_probs_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act self.hidden_act = hidden_act
self.path_norm = patch_norm
self.layer_norm_eps = layer_norm_eps self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range self.initializer_range = initializer_range
# we set the hidden_size attribute in order to make Dinat work with VisionEncoderDecoderModel # we set the hidden_size attribute in order to make Dinat work with VisionEncoderDecoderModel
......
...@@ -66,8 +66,6 @@ class DonutSwinConfig(PretrainedConfig): ...@@ -66,8 +66,6 @@ class DonutSwinConfig(PretrainedConfig):
`"selu"` and `"gelu_new"` are supported. `"selu"` and `"gelu_new"` are supported.
use_absolute_embeddings (`bool`, *optional*, defaults to False): use_absolute_embeddings (`bool`, *optional*, defaults to False):
Whether or not to add absolute position embeddings to the patch embeddings. Whether or not to add absolute position embeddings to the patch embeddings.
patch_norm (`bool`, *optional*, defaults to True):
Whether or not to add layer normalization after patch embedding.
initializer_range (`float`, *optional*, defaults to 0.02): initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12): layer_norm_eps (`float`, *optional*, defaults to 1e-12):
...@@ -110,7 +108,6 @@ class DonutSwinConfig(PretrainedConfig): ...@@ -110,7 +108,6 @@ class DonutSwinConfig(PretrainedConfig):
drop_path_rate=0.1, drop_path_rate=0.1,
hidden_act="gelu", hidden_act="gelu",
use_absolute_embeddings=False, use_absolute_embeddings=False,
patch_norm=True,
initializer_range=0.02, initializer_range=0.02,
layer_norm_eps=1e-5, layer_norm_eps=1e-5,
**kwargs, **kwargs,
...@@ -132,7 +129,6 @@ class DonutSwinConfig(PretrainedConfig): ...@@ -132,7 +129,6 @@ class DonutSwinConfig(PretrainedConfig):
self.drop_path_rate = drop_path_rate self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act self.hidden_act = hidden_act
self.use_absolute_embeddings = use_absolute_embeddings self.use_absolute_embeddings = use_absolute_embeddings
self.path_norm = patch_norm
self.layer_norm_eps = layer_norm_eps self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range self.initializer_range = initializer_range
# we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
......
...@@ -539,8 +539,6 @@ class JukeboxConfig(PretrainedConfig): ...@@ -539,8 +539,6 @@ class JukeboxConfig(PretrainedConfig):
metadata_conditioning (`bool`, *optional*, defaults to `True`): metadata_conditioning (`bool`, *optional*, defaults to `True`):
Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
duration. duration.
init_std (`float`, *optional*, defaults to 0.2):
Standard deviation used to initial the model.
Example: Example:
...@@ -572,7 +570,6 @@ class JukeboxConfig(PretrainedConfig): ...@@ -572,7 +570,6 @@ class JukeboxConfig(PretrainedConfig):
max_duration=600.0, max_duration=600.0,
max_nb_genres=5, max_nb_genres=5,
metadata_conditioning=True, metadata_conditioning=True,
init_std=0.2,
**kwargs, **kwargs,
): ):
if vqvae_config is None: if vqvae_config is None:
...@@ -596,7 +593,6 @@ class JukeboxConfig(PretrainedConfig): ...@@ -596,7 +593,6 @@ class JukeboxConfig(PretrainedConfig):
self.hop_fraction = self.vqvae_config.hop_fraction self.hop_fraction = self.vqvae_config.hop_fraction
self.init_std = init_std
self.nb_priors = nb_priors self.nb_priors = nb_priors
# Metadata conditioning # Metadata conditioning
......
...@@ -62,8 +62,6 @@ class MaskFormerSwinConfig(PretrainedConfig): ...@@ -62,8 +62,6 @@ class MaskFormerSwinConfig(PretrainedConfig):
`"selu"` and `"gelu_new"` are supported. `"selu"` and `"gelu_new"` are supported.
use_absolute_embeddings (`bool`, *optional*, defaults to False): use_absolute_embeddings (`bool`, *optional*, defaults to False):
Whether or not to add absolute position embeddings to the patch embeddings. Whether or not to add absolute position embeddings to the patch embeddings.
patch_norm (`bool`, *optional*, defaults to True):
Whether or not to add layer normalization after patch embedding.
initializer_range (`float`, *optional*, defaults to 0.02): initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12): layer_norm_eps (`float`, *optional*, defaults to 1e-12):
...@@ -109,7 +107,6 @@ class MaskFormerSwinConfig(PretrainedConfig): ...@@ -109,7 +107,6 @@ class MaskFormerSwinConfig(PretrainedConfig):
drop_path_rate=0.1, drop_path_rate=0.1,
hidden_act="gelu", hidden_act="gelu",
use_absolute_embeddings=False, use_absolute_embeddings=False,
patch_norm=True,
initializer_range=0.02, initializer_range=0.02,
layer_norm_eps=1e-5, layer_norm_eps=1e-5,
out_features=None, out_features=None,
...@@ -132,7 +129,6 @@ class MaskFormerSwinConfig(PretrainedConfig): ...@@ -132,7 +129,6 @@ class MaskFormerSwinConfig(PretrainedConfig):
self.drop_path_rate = drop_path_rate self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act self.hidden_act = hidden_act
self.use_absolute_embeddings = use_absolute_embeddings self.use_absolute_embeddings = use_absolute_embeddings
self.path_norm = patch_norm
self.layer_norm_eps = layer_norm_eps self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range self.initializer_range = initializer_range
# we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
......
...@@ -62,8 +62,6 @@ class NatConfig(PretrainedConfig): ...@@ -62,8 +62,6 @@ class NatConfig(PretrainedConfig):
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
`"selu"` and `"gelu_new"` are supported. `"selu"` and `"gelu_new"` are supported.
patch_norm (`bool`, *optional*, defaults to `True`):
Whether or not to add layer normalization after patch embedding.
initializer_range (`float`, *optional*, defaults to 0.02): initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12): layer_norm_eps (`float`, *optional*, defaults to 1e-12):
...@@ -109,7 +107,6 @@ class NatConfig(PretrainedConfig): ...@@ -109,7 +107,6 @@ class NatConfig(PretrainedConfig):
attention_probs_dropout_prob=0.0, attention_probs_dropout_prob=0.0,
drop_path_rate=0.1, drop_path_rate=0.1,
hidden_act="gelu", hidden_act="gelu",
patch_norm=True,
initializer_range=0.02, initializer_range=0.02,
layer_norm_eps=1e-5, layer_norm_eps=1e-5,
layer_scale_init_value=0.0, layer_scale_init_value=0.0,
...@@ -131,7 +128,6 @@ class NatConfig(PretrainedConfig): ...@@ -131,7 +128,6 @@ class NatConfig(PretrainedConfig):
self.attention_probs_dropout_prob = attention_probs_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act self.hidden_act = hidden_act
self.path_norm = patch_norm
self.layer_norm_eps = layer_norm_eps self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range self.initializer_range = initializer_range
# we set the hidden_size attribute in order to make Nat work with VisionEncoderDecoderModel # we set the hidden_size attribute in order to make Nat work with VisionEncoderDecoderModel
......
...@@ -83,8 +83,6 @@ class OneFormerConfig(PretrainedConfig): ...@@ -83,8 +83,6 @@ class OneFormerConfig(PretrainedConfig):
List containing the strides for feature maps in the encoder. List containing the strides for feature maps in the encoder.
task_seq_len (`int`, *optional*, defaults to 77) task_seq_len (`int`, *optional*, defaults to 77)
Sequence length for tokenizing text list input. Sequence length for tokenizing text list input.
max_seq_len (`int`, *optional*, defaults to 77)
Sequence length for tokenizing task input.
text_encoder_width (`int`, *optional*, defaults to 256) text_encoder_width (`int`, *optional*, defaults to 256)
Hidden size for text encoder. Hidden size for text encoder.
text_encoder_context_length (`int`, *optional*, defaults to 77): text_encoder_context_length (`int`, *optional*, defaults to 77):
...@@ -165,7 +163,6 @@ class OneFormerConfig(PretrainedConfig): ...@@ -165,7 +163,6 @@ class OneFormerConfig(PretrainedConfig):
output_auxiliary_logits: bool = True, output_auxiliary_logits: bool = True,
strides: Optional[list] = [4, 8, 16, 32], strides: Optional[list] = [4, 8, 16, 32],
task_seq_len: int = 77, task_seq_len: int = 77,
max_seq_len: int = 77,
text_encoder_width: int = 256, text_encoder_width: int = 256,
text_encoder_context_length: int = 77, text_encoder_context_length: int = 77,
text_encoder_num_layers: int = 6, text_encoder_num_layers: int = 6,
...@@ -229,7 +226,6 @@ class OneFormerConfig(PretrainedConfig): ...@@ -229,7 +226,6 @@ class OneFormerConfig(PretrainedConfig):
self.output_auxiliary_logits = output_auxiliary_logits self.output_auxiliary_logits = output_auxiliary_logits
self.strides = strides self.strides = strides
self.task_seq_len = task_seq_len self.task_seq_len = task_seq_len
self.max_seq_len = max_seq_len
self.text_encoder_width = text_encoder_width self.text_encoder_width = text_encoder_width
self.text_encoder_context_length = text_encoder_context_length self.text_encoder_context_length = text_encoder_context_length
self.text_encoder_num_layers = text_encoder_num_layers self.text_encoder_num_layers = text_encoder_num_layers
......
...@@ -133,7 +133,6 @@ class PerceiverConfig(PretrainedConfig): ...@@ -133,7 +133,6 @@ class PerceiverConfig(PretrainedConfig):
cross_attention_widening_factor=1, cross_attention_widening_factor=1,
hidden_act="gelu", hidden_act="gelu",
attention_probs_dropout_prob=0.1, attention_probs_dropout_prob=0.1,
position_embedding_init_scale=0.02,
initializer_range=0.02, initializer_range=0.02,
layer_norm_eps=1e-12, layer_norm_eps=1e-12,
use_query_residual=True, use_query_residual=True,
......
...@@ -168,8 +168,6 @@ class SpeechT5Config(PretrainedConfig): ...@@ -168,8 +168,6 @@ class SpeechT5Config(PretrainedConfig):
The maximum sequence length of text features that this model might ever be used with. The maximum sequence length of text features that this model might ever be used with.
encoder_max_relative_position (`int`, *optional*, defaults to 160): encoder_max_relative_position (`int`, *optional*, defaults to 160):
Maximum distance for relative position embedding in the encoder. Maximum distance for relative position embedding in the encoder.
decoder_max_relative_position (`int`, *optional*, defaults to 160):
Maximum distance for relative position embedding in the decoder.
use_cache (`bool`, *optional*, defaults to `True`): use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Whether or not the model should return the last key/values attentions (not used by all models).
...@@ -243,7 +241,6 @@ class SpeechT5Config(PretrainedConfig): ...@@ -243,7 +241,6 @@ class SpeechT5Config(PretrainedConfig):
max_speech_positions=4000, max_speech_positions=4000,
max_text_positions=450, max_text_positions=450,
encoder_max_relative_position=160, encoder_max_relative_position=160,
decoder_max_relative_position=160,
use_cache=True, use_cache=True,
is_encoder_decoder=True, is_encoder_decoder=True,
**kwargs, **kwargs,
...@@ -314,7 +311,6 @@ class SpeechT5Config(PretrainedConfig): ...@@ -314,7 +311,6 @@ class SpeechT5Config(PretrainedConfig):
self.max_speech_positions = max_speech_positions self.max_speech_positions = max_speech_positions
self.max_text_positions = max_text_positions self.max_text_positions = max_text_positions
self.encoder_max_relative_position = encoder_max_relative_position self.encoder_max_relative_position = encoder_max_relative_position
self.decoder_max_relative_position = decoder_max_relative_position
self.use_cache = use_cache self.use_cache = use_cache
self.is_encoder_decoder = is_encoder_decoder self.is_encoder_decoder = is_encoder_decoder
......
...@@ -75,8 +75,6 @@ class SwinConfig(PretrainedConfig): ...@@ -75,8 +75,6 @@ class SwinConfig(PretrainedConfig):
`"selu"` and `"gelu_new"` are supported. `"selu"` and `"gelu_new"` are supported.
use_absolute_embeddings (`bool`, *optional*, defaults to False): use_absolute_embeddings (`bool`, *optional*, defaults to False):
Whether or not to add absolute position embeddings to the patch embeddings. Whether or not to add absolute position embeddings to the patch embeddings.
patch_norm (`bool`, *optional*, defaults to True):
Whether or not to add layer normalization after patch embedding.
initializer_range (`float`, *optional*, defaults to 0.02): initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12): layer_norm_eps (`float`, *optional*, defaults to 1e-12):
...@@ -124,7 +122,6 @@ class SwinConfig(PretrainedConfig): ...@@ -124,7 +122,6 @@ class SwinConfig(PretrainedConfig):
drop_path_rate=0.1, drop_path_rate=0.1,
hidden_act="gelu", hidden_act="gelu",
use_absolute_embeddings=False, use_absolute_embeddings=False,
patch_norm=True,
initializer_range=0.02, initializer_range=0.02,
layer_norm_eps=1e-5, layer_norm_eps=1e-5,
encoder_stride=32, encoder_stride=32,
...@@ -148,7 +145,6 @@ class SwinConfig(PretrainedConfig): ...@@ -148,7 +145,6 @@ class SwinConfig(PretrainedConfig):
self.drop_path_rate = drop_path_rate self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act self.hidden_act = hidden_act
self.use_absolute_embeddings = use_absolute_embeddings self.use_absolute_embeddings = use_absolute_embeddings
self.path_norm = patch_norm
self.layer_norm_eps = layer_norm_eps self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.encoder_stride = encoder_stride self.encoder_stride = encoder_stride
......
...@@ -67,8 +67,6 @@ class Swin2SRConfig(PretrainedConfig): ...@@ -67,8 +67,6 @@ class Swin2SRConfig(PretrainedConfig):
`"selu"` and `"gelu_new"` are supported. `"selu"` and `"gelu_new"` are supported.
use_absolute_embeddings (`bool`, *optional*, defaults to `False`): use_absolute_embeddings (`bool`, *optional*, defaults to `False`):
Whether or not to add absolute position embeddings to the patch embeddings. Whether or not to add absolute position embeddings to the patch embeddings.
patch_norm (`bool`, *optional*, defaults to `True`):
Whether or not to add layer normalization after patch embedding.
initializer_range (`float`, *optional*, defaults to 0.02): initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12): layer_norm_eps (`float`, *optional*, defaults to 1e-12):
...@@ -121,7 +119,6 @@ class Swin2SRConfig(PretrainedConfig): ...@@ -121,7 +119,6 @@ class Swin2SRConfig(PretrainedConfig):
drop_path_rate=0.1, drop_path_rate=0.1,
hidden_act="gelu", hidden_act="gelu",
use_absolute_embeddings=False, use_absolute_embeddings=False,
patch_norm=True,
initializer_range=0.02, initializer_range=0.02,
layer_norm_eps=1e-5, layer_norm_eps=1e-5,
upscale=2, upscale=2,
...@@ -147,7 +144,6 @@ class Swin2SRConfig(PretrainedConfig): ...@@ -147,7 +144,6 @@ class Swin2SRConfig(PretrainedConfig):
self.drop_path_rate = drop_path_rate self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act self.hidden_act = hidden_act
self.use_absolute_embeddings = use_absolute_embeddings self.use_absolute_embeddings = use_absolute_embeddings
self.path_norm = patch_norm
self.layer_norm_eps = layer_norm_eps self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.upscale = upscale self.upscale = upscale
......
...@@ -68,8 +68,6 @@ class Swinv2Config(PretrainedConfig): ...@@ -68,8 +68,6 @@ class Swinv2Config(PretrainedConfig):
`"selu"` and `"gelu_new"` are supported. `"selu"` and `"gelu_new"` are supported.
use_absolute_embeddings (`bool`, *optional*, defaults to `False`): use_absolute_embeddings (`bool`, *optional*, defaults to `False`):
Whether or not to add absolute position embeddings to the patch embeddings. Whether or not to add absolute position embeddings to the patch embeddings.
patch_norm (`bool`, *optional*, defaults to `True`):
Whether or not to add layer normalization after patch embedding.
initializer_range (`float`, *optional*, defaults to 0.02): initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12): layer_norm_eps (`float`, *optional*, defaults to 1e-12):
...@@ -114,7 +112,6 @@ class Swinv2Config(PretrainedConfig): ...@@ -114,7 +112,6 @@ class Swinv2Config(PretrainedConfig):
drop_path_rate=0.1, drop_path_rate=0.1,
hidden_act="gelu", hidden_act="gelu",
use_absolute_embeddings=False, use_absolute_embeddings=False,
patch_norm=True,
initializer_range=0.02, initializer_range=0.02,
layer_norm_eps=1e-5, layer_norm_eps=1e-5,
encoder_stride=32, encoder_stride=32,
...@@ -137,7 +134,6 @@ class Swinv2Config(PretrainedConfig): ...@@ -137,7 +134,6 @@ class Swinv2Config(PretrainedConfig):
self.drop_path_rate = drop_path_rate self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act self.hidden_act = hidden_act
self.use_absolute_embeddings = use_absolute_embeddings self.use_absolute_embeddings = use_absolute_embeddings
self.path_norm = patch_norm
self.layer_norm_eps = layer_norm_eps self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.encoder_stride = encoder_stride self.encoder_stride = encoder_stride
......
...@@ -374,7 +374,7 @@ class VanPreTrainedModel(PreTrainedModel): ...@@ -374,7 +374,7 @@ class VanPreTrainedModel(PreTrainedModel):
def _init_weights(self, module): def _init_weights(self, module):
"""Initialize the weights""" """Initialize the weights"""
if isinstance(module, nn.Linear): if isinstance(module, nn.Linear):
nn.init.trunc_normal_(module.weight, std=0.02) nn.init.trunc_normal_(module.weight, std=self.config.initializer_range)
if isinstance(module, nn.Linear) and module.bias is not None: if isinstance(module, nn.Linear) and module.bias is not None:
nn.init.constant_(module.bias, 0) nn.init.constant_(module.bias, 0)
elif isinstance(module, nn.LayerNorm): elif isinstance(module, nn.LayerNorm):
......
...@@ -45,13 +45,17 @@ SPECIAL_CASES_TO_ALLOW = { ...@@ -45,13 +45,17 @@ SPECIAL_CASES_TO_ALLOW = {
"EsmConfig": ["is_folding_model"], "EsmConfig": ["is_folding_model"],
# used during training (despite we don't have training script for these models yet) # used during training (despite we don't have training script for these models yet)
"Mask2FormerConfig": ["ignore_value"], "Mask2FormerConfig": ["ignore_value"],
# used during training (despite we don't have training script for these models yet) # `ignore_value` used during training (despite we don't have training script for these models yet)
"OneFormerConfig": ["ignore_value"], # `norm` used in conversion script (despite not using in the modeling file)
"OneFormerConfig": ["ignore_value", "norm"],
# used during preprocessing and collation, see `collating_graphormer.py` # used during preprocessing and collation, see `collating_graphormer.py`
"GraphormerConfig": ["spatial_pos_max"], "GraphormerConfig": ["spatial_pos_max"],
# used internally in the configuration class file # used internally in the configuration class file
"T5Config": ["feed_forward_proj"], "T5Config": ["feed_forward_proj"],
# used internally in the configuration class file # used internally in the configuration class file
# `tokenizer_class` get default value `T5Tokenizer` intentionally
"MT5Config": ["feed_forward_proj", "tokenizer_class"],
# used internally in the configuration class file
"LongT5Config": ["feed_forward_proj"], "LongT5Config": ["feed_forward_proj"],
# used internally in the configuration class file # used internally in the configuration class file
"SwitchTransformersConfig": ["feed_forward_proj"], "SwitchTransformersConfig": ["feed_forward_proj"],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment