Unverified commit 7504be35, authored by Yih-Dar and committed by GitHub

Fix `check_config_attributes`: check all configuration classes (#24231)



* fix

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 6793f0cf
@@ -206,8 +206,6 @@ class AlignVisionConfig(PretrainedConfig):
The epsilon used by the batch normalization layers.
batch_norm_momentum (`float`, *optional*, defaults to 0.99):
The momentum used by the batch normalization layers.
dropout_rate (`float`, *optional*, defaults to 0.5):
The dropout rate to be applied before the final classifier layer.
drop_connect_rate (`float`, *optional*, defaults to 0.2):
The drop rate for skip connections.
@@ -249,7 +247,6 @@ class AlignVisionConfig(PretrainedConfig):
initializer_range: float = 0.02,
batch_norm_eps: float = 0.001,
batch_norm_momentum: float = 0.99,
dropout_rate: float = 0.5,
drop_connect_rate: float = 0.2,
**kwargs,
):
@@ -274,7 +271,6 @@ class AlignVisionConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.batch_norm_eps = batch_norm_eps
self.batch_norm_momentum = batch_norm_momentum
self.dropout_rate = dropout_rate
self.drop_connect_rate = drop_connect_rate
self.num_hidden_layers = sum(num_block_repeats) * 4
......
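The pattern in this and the following configuration files is the same: an `__init__` argument that no modeling code ever reads is dropped from the docstring, the signature, and the attribute assignments. A quick look at what that means in practice — a hypothetical snippet, not part of this commit, which assumes that `PretrainedConfig` keeps unrecognized keyword arguments as plain attributes:

```python
# Hypothetical snippet (not part of the commit): the removed argument disappears from
# the defaults, but configs serialized with the old field should still load, since
# unrecognized kwargs are kept as plain attributes by `PretrainedConfig.__init__`.
from transformers import AlignVisionConfig

config = AlignVisionConfig()
print(hasattr(config, "dropout_rate"))  # False after this change

legacy = AlignVisionConfig(dropout_rate=0.5)  # old kwarg is still accepted...
print(legacy.dropout_rate)                    # ...and stored, though never read by the model
```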
@@ -234,7 +234,6 @@ class BlipVisionConfig(PretrainedConfig):
projection_dim=512,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
image_size=384,
patch_size=16,
hidden_act="gelu",
@@ -250,7 +249,6 @@ class BlipVisionConfig(PretrainedConfig):
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
......
@@ -58,15 +58,10 @@ class Blip2VisionConfig(PretrainedConfig):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether to add a bias to the queries and values in the self-attention layers.
@@ -91,18 +86,14 @@ class Blip2VisionConfig(PretrainedConfig):
self,
hidden_size=1408,
intermediate_size=6144,
projection_dim=512,
num_hidden_layers=39,
num_attention_heads=16,
num_channels=3,
image_size=224,
patch_size=14,
hidden_act="gelu",
layer_norm_eps=0.00001,
dropout=0.0,
attention_dropout=0.0,
initializer_range=1e-10,
initializer_factor=1.0,
qkv_bias=True,
**kwargs,
):
@@ -110,15 +101,11 @@ class Blip2VisionConfig(PretrainedConfig):
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
self.dropout = dropout
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
@@ -184,8 +171,6 @@ class Blip2QFormerConfig(PretrainedConfig):
[Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
cross_attention_frequency (`int`, *optional*, defaults to 2):
The frequency of adding cross-attention to the Transformer layers.
encoder_hidden_size (`int`, *optional*, defaults to 1408):
@@ -221,7 +206,6 @@ class Blip2QFormerConfig(PretrainedConfig):
layer_norm_eps=1e-12,
pad_token_id=0,
position_embedding_type="absolute",
classifier_dropout=None,
cross_attention_frequency=2,
encoder_hidden_size=1408,
**kwargs,
@@ -240,7 +224,6 @@ class Blip2QFormerConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.classifier_dropout = classifier_dropout
self.cross_attention_frequency = cross_attention_frequency
self.encoder_hidden_size = encoder_hidden_size
......
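Sub-configs such as `Blip2VisionConfig` and `Blip2QFormerConfig` escaped the old check because only the composite `Blip2Config` is registered in the auto mapping, which was the loop's only source of classes. A quick way to see this (hypothetical snippet, not part of the commit):

```python
# Hypothetical verification: only the composite configs are registered in
# CONFIG_MAPPING, so their vision/text/Q-Former companions were previously skipped.
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

registered = {cls.__name__ for cls in CONFIG_MAPPING.values()}
print("Blip2Config" in registered)         # True  -> already covered before
print("Blip2VisionConfig" in registered)   # False -> only covered by the new loop
print("Blip2QFormerConfig" in registered)  # False -> only covered by the new loop
```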
@@ -155,8 +155,6 @@ class BridgeTowerTextConfig(PretrainedConfig):
initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
@@ -170,8 +168,6 @@ class BridgeTowerTextConfig(PretrainedConfig):
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
Example:
@@ -199,14 +195,12 @@ class BridgeTowerTextConfig(PretrainedConfig):
attention_probs_dropout_prob=0.1,
max_position_embeddings=514,
type_vocab_size=1,
initializer_range=0.02,
layer_norm_eps=1e-05,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
**kwargs,
):
super().__init__(**kwargs)
@@ -222,11 +216,9 @@ class BridgeTowerTextConfig(PretrainedConfig):
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
......
@@ -65,8 +65,6 @@ class ClapTextConfig(PretrainedConfig):
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling [`ClapTextModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
@@ -80,8 +78,6 @@ class ClapTextConfig(PretrainedConfig):
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
projection_hidden_act (`str`, *optional*, defaults to `"relu"`):
The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
@@ -116,7 +112,6 @@ class ClapTextConfig(PretrainedConfig):
attention_probs_dropout_prob=0.1,
max_position_embeddings=514,
type_vocab_size=1,
initializer_range=0.02,
initializer_factor=1.0,
layer_norm_eps=1e-12,
projection_dim=512,
@@ -125,7 +120,6 @@ class ClapTextConfig(PretrainedConfig):
eos_token_id=2,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
projection_hidden_act="relu",
**kwargs,
):
@@ -141,12 +135,10 @@ class ClapTextConfig(PretrainedConfig):
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
self.projection_hidden_act = projection_hidden_act
self.projection_dim = projection_dim
......
@@ -187,16 +187,10 @@ class Pix2StructVisionConfig(PretrainedConfig):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
d_kv (`int`, *optional*, defaults to 64):
Dimensionality of the key, query, value projections per attention head.
projection_dim (`int`, *optional*, defaults to 768):
Dimensionality of the projection layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
Number of channels of the input images.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
dense_act_fn (`str` or `function`, *optional*, defaults to `"gelu_new"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
@@ -213,8 +207,6 @@ class Pix2StructVisionConfig(PretrainedConfig):
testing).
seq_len (`int`, *optional*, defaults to 4096):
Maximum sequence length (here number of patches) supported by the model.
layer_norm_bias (`bool`, *optional*, defaults to `False`):
Whether or not to add a bias to the layer normalization layers.
relative_attention_num_buckets (`int`, *optional*, defaults to 32):
The number of buckets to use for each attention layer.
relative_attention_max_distance (`int`, *optional*, defaults to 128):
@@ -243,11 +235,8 @@ class Pix2StructVisionConfig(PretrainedConfig):
patch_embed_hidden_size=768,
d_ff=2048,
d_kv=64,
projection_dim=768,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
patch_size=16,
dense_act_fn="gelu_new",
layer_norm_eps=1e-6,
dropout_rate=0.0,
@@ -255,7 +244,6 @@ class Pix2StructVisionConfig(PretrainedConfig):
initializer_range=1e-10,
initializer_factor=1.0,
seq_len=4096,
layer_norm_bias=False,
relative_attention_num_buckets=32,
relative_attention_max_distance=128,
**kwargs,
@@ -265,19 +253,15 @@ class Pix2StructVisionConfig(PretrainedConfig):
self.hidden_size = hidden_size
self.patch_embed_hidden_size = patch_embed_hidden_size
self.d_ff = d_ff
self.projection_dim = projection_dim
self.dropout_rate = dropout_rate
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.dense_act_fn = dense_act_fn
self.seq_len = seq_len
self.layer_norm_bias = layer_norm_bias
self.relative_attention_num_buckets = relative_attention_num_buckets
self.relative_attention_max_distance = relative_attention_max_distance
self.d_kv = d_kv
......
@@ -150,10 +150,6 @@ class SamVisionConfig(PretrainedConfig):
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 6144):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of the projection layer in the Transformer encoder.
output_channels (`int`, *optional*, defaults to 256):
Dimensionality of the output channels in the Patch Encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
@@ -170,14 +166,10 @@ class SamVisionConfig(PretrainedConfig):
The non-linear activation function (function or string)
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probability.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 1e-10):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for multiplying the initializer range.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether to add a bias to query, key, value projections.
mlp_ratio (`float`, *optional*, defaults to 4.0):
@@ -200,8 +192,6 @@ class SamVisionConfig(PretrainedConfig):
def __init__(
self,
hidden_size=768,
intermediate_size=6144,
projection_dim=512,
output_channels=256,
num_hidden_layers=12,
num_attention_heads=12,
@@ -210,10 +200,8 @@ class SamVisionConfig(PretrainedConfig):
patch_size=16,
hidden_act="gelu",
layer_norm_eps=1e-06,
dropout=0.0,
attention_dropout=0.0,
initializer_range=1e-10,
initializer_factor=1.0,
qkv_bias=True,
mlp_ratio=4.0,
use_abs_pos=True,
@@ -227,8 +215,6 @@ class SamVisionConfig(PretrainedConfig):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
self.output_channels = output_channels
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
@@ -237,10 +223,8 @@ class SamVisionConfig(PretrainedConfig):
self.patch_size = patch_size
self.hidden_act = hidden_act
self.layer_norm_eps = layer_norm_eps
self.dropout = dropout
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.qkv_bias = qkv_bias
self.mlp_ratio = mlp_ratio
self.use_abs_pos = use_abs_pos
......
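One attribute the broadened check might be expected to flag, `SamVisionConfig.mlp_ratio`, is instead added to `SPECIAL_CASES_TO_ALLOW` below, since the vision blocks derive their feed-forward width from it rather than storing an explicit `mlp_dim`. A rough sketch of that relationship, inferred from the allow-list comment rather than copied from the modeling code:

```python
# Sketch only (based on the "used internally to calculate `mlp_dim`" comment in the
# allow-list below): the feed-forward width of the vision blocks is computed from
# hidden_size and mlp_ratio, so mlp_ratio has to stay in the config.
hidden_size = 768
mlp_ratio = 4.0
mlp_dim = int(hidden_size * mlp_ratio)
print(mlp_dim)  # 3072
```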
@@ -17,6 +17,7 @@ import inspect
import os
import re
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import direct_transformers_import
@@ -77,6 +78,12 @@ SPECIAL_CASES_TO_ALLOW = {
"TimeSeriesTransformerConfig": ["num_static_real_features", "num_time_features"],
# used internally to calculate the feature size
"AutoformerConfig": ["num_static_real_features", "num_time_features"],
# used internally to calculate `mlp_dim`
"SamVisionConfig": ["mlp_ratio"],
# For (head) training, but so far not implemented
"ClapAudioConfig": ["num_classes"],
# Not used, but providing useful information to users
"SpeechT5HifiGanConfig": ["sampling_rate"],
}
@@ -113,6 +120,10 @@ SPECIAL_CASES_TO_ALLOW.update(
"VanConfig": True,
"WavLMConfig": True,
"WhisperConfig": True,
# TODO: @Arthur (for `alignment_head` and `alignment_layer`)
"JukeboxPriorConfig": True,
# TODO: @Younes (for `is_decoder`)
"Pix2StructTextConfig": True,
}
)
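For context: an entry in `SPECIAL_CASES_TO_ALLOW` maps a config class name either to a list of attribute names to tolerate or to `True` to skip the class entirely, as with the two TODO entries just added. A minimal sketch of how such an allow-list would be consulted — `is_allowed` is a hypothetical helper; the real logic lives in `check_config_attributes_being_used`, which this diff does not show:

```python
# Minimal sketch of consuming the allow-list (hypothetical helper, not the actual code).
def is_allowed(allow_list: dict, config_class_name: str, attribute: str) -> bool:
    allowed = allow_list.get(config_class_name)
    if allowed is True:  # e.g. "Pix2StructTextConfig": True skips the whole class
        return True
    return isinstance(allowed, list) and attribute in allowed

example = {"SamVisionConfig": ["mlp_ratio"], "Pix2StructTextConfig": True}
print(is_allowed(example, "SamVisionConfig", "mlp_ratio"))      # True
print(is_allowed(example, "SamVisionConfig", "hidden_size"))    # False
print(is_allowed(example, "Pix2StructTextConfig", "whatever"))  # True
```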
@@ -254,10 +265,21 @@ def check_config_attributes_being_used(config_class):
def check_config_attributes():
"""Check the arguments in `__init__` of all configuration classes are used in python files"""
configs_with_unused_attributes = {}
for config_class in list(CONFIG_MAPPING.values()):
unused_attributes = check_config_attributes_being_used(config_class)
if len(unused_attributes) > 0:
configs_with_unused_attributes[config_class.__name__] = unused_attributes
for _config_class in list(CONFIG_MAPPING.values()):
# Some config classes are not in `CONFIG_MAPPING` (e.g. `CLIPVisionConfig`, `Blip2VisionConfig`, etc.)
config_classes_in_module = [
cls
for name, cls in inspect.getmembers(
inspect.getmodule(_config_class),
lambda x: inspect.isclass(x)
and issubclass(x, PretrainedConfig)
and inspect.getmodule(x) == inspect.getmodule(_config_class),
)
]
for config_class in config_classes_in_module:
unused_attributes = check_config_attributes_being_used(config_class)
if len(unused_attributes) > 0:
configs_with_unused_attributes[config_class.__name__] = unused_attributes
if len(configs_with_unused_attributes) > 0:
error = "The following configuration classes contain unused attributes in the corresponding modeling files:\n"
......
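The new inner loop is the heart of the fix: for each auto-mapped config class it gathers every `PretrainedConfig` subclass defined in the same module and runs the existing attribute check on each of them. The same filter can be reproduced standalone (hypothetical snippet, assuming an installed `transformers`):

```python
# Standalone demo of the inspect.getmembers filter added above
# (hypothetical snippet, not part of the commit).
import inspect

from transformers import Blip2Config
from transformers.configuration_utils import PretrainedConfig

module = inspect.getmodule(Blip2Config)
config_classes = [
    cls
    for _, cls in inspect.getmembers(
        module,
        lambda x: inspect.isclass(x)
        and issubclass(x, PretrainedConfig)
        and inspect.getmodule(x) == module,
    )
]
print(sorted(cls.__name__ for cls in config_classes))
# Expected to include Blip2Config, Blip2QFormerConfig and Blip2VisionConfig.
```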