"...git@developer.sourcefind.cn:OpenDAS/nni.git" did not exist on "f6343344387fc063e74b1c086a2d1c245861dd3f"
Unverified Commit f726d53e authored by Yih-Dar, committed by GitHub

Remove more unused attributes in config classes (#21392)



* Remove unused type_vocab_size

* Remove unused initializer_factor

* Remove unused n_embd

* Remove unused scale_embedding

* Remove unused scale_attn_weights

* fix

* fix

* Remove unused head_hidden_scale

* Remove unused activation_dropout

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 3560ae6d
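All of the attributes removed below have one thing in common: the corresponding modeling code never reads them. As a rough, hypothetical illustration (not part of this commit), a script along these lines can flag such candidates by collecting the `self.<name>` assignments in a model's configuration file and checking which names never appear as `config.<name>` in its modeling files; the directory layout and regexes are assumptions of the sketch.

```python
# Hypothetical helper for spotting config attributes that modeling code never reads.
# Assumes the usual repository layout: src/transformers/models/<model>/ holds
# configuration_*.py and modeling_*.py files.
import re
from pathlib import Path


def unused_config_attributes(model_dir):
    model_path = Path(model_dir)
    config_src = "\n".join(p.read_text() for p in model_path.glob("configuration_*.py"))
    modeling_src = "\n".join(p.read_text() for p in model_path.glob("modeling_*.py"))

    # Attributes assigned in the config classes (self.<name> = ...).
    attrs = set(re.findall(r"self\.(\w+)\s*=", config_src))

    # Keep only the names that never appear as config.<name> in the modeling files.
    return {name for name in attrs if not re.search(rf"config\.{name}\b", modeling_src)}


# Example call (path is illustrative):
# print(unused_config_attributes("src/transformers/models/blip"))
```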
@@ -83,9 +83,6 @@ class BlipTextConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
bos_token_id (`int`, *optional*, defaults to 30522):
The id of the `beginning-of-sequence` token.
eos_token_id (`int`, *optional*, defaults to 2):
@@ -130,7 +127,6 @@ class BlipTextConfig(PretrainedConfig):
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
initializer_factor=1.0,
bos_token_id=30522,
eos_token_id=2,
pad_token_id=0,
@@ -159,7 +155,6 @@ class BlipTextConfig(PretrainedConfig):
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.is_decoder = is_decoder
self.use_cache = use_cache
@@ -215,9 +210,6 @@ class BlipVisionConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
Example:
@@ -250,7 +242,6 @@ class BlipVisionConfig(PretrainedConfig):
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=1e-10,
initializer_factor=1.0,
**kwargs
):
super().__init__(**kwargs)
@@ -264,7 +255,6 @@ class BlipVisionConfig(PretrainedConfig):
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
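A practical note on the removals in this file: dropping `initializer_factor` from the signature does not break older `config.json` files or call sites that still pass it by keyword, because `PretrainedConfig.__init__` stores unrecognised keyword arguments as plain attributes (behaviour of transformers around the time of this commit; treat the exact version as an assumption). A minimal sketch:

```python
# Minimal backward-compatibility check after the removal (illustrative only).
from transformers import BlipTextConfig

# The removed key is still accepted: it is swallowed by **kwargs and set as an
# extra attribute instead of raising a TypeError.
config = BlipTextConfig(initializer_factor=1.0)

print(getattr(config, "initializer_factor", None))  # 1.0, but nothing reads it anymore
print(config.initializer_range)                     # 0.02, the attribute that is actually used
```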
@@ -260,8 +260,6 @@ class BridgeTowerConfig(PretrainedConfig):
Args:
share_cross_modal_transformer_layers (`bool`, *optional*, defaults to `True`):
Whether cross modal transformer layers are shared.
head_hidden_scale (`int`, *optional*, defaults to 2):
Scale of hidden layers head.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler.
hidden_size (`int`, *optional*, defaults to 768):
@@ -307,7 +305,6 @@ class BridgeTowerConfig(PretrainedConfig):
def __init__(
self,
share_cross_modal_transformer_layers=True,
head_hidden_scale=2,
hidden_act="gelu",
hidden_size=768,
initializer_factor=1,
@@ -324,7 +321,6 @@ class BridgeTowerConfig(PretrainedConfig):
):
super().__init__(**kwargs)
self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers
self.head_hidden_scale = head_hidden_scale
self.hidden_act = hidden_act
self.hidden_size = hidden_size
self.initializer_factor = initializer_factor
@@ -79,8 +79,6 @@ class CodeGenConfig(PretrainedConfig):
The epsilon to use in the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
scale_attn_weights (`bool`, *optional*, defaults to `True`):
Scale attention weights by dividing by sqrt(hidden_size).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
@@ -122,7 +120,6 @@ class CodeGenConfig(PretrainedConfig):
attn_pdrop=0.0,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
scale_attn_weights=True,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
@@ -143,7 +140,6 @@ class CodeGenConfig(PretrainedConfig):
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.bos_token_id = bos_token_id
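For context, the `scale_attn_weights` flag removed from CodeGen (and from GPT-J further down) described the usual scaling of attention logits before the softmax. A generic sketch of that scaling, using the per-head dimension and standing in for no particular model:

```python
# Generic scaled dot-product attention logits; shapes and sizes are arbitrary.
import torch

batch, heads, seq, head_dim = 1, 4, 8, 16
query = torch.randn(batch, heads, seq, head_dim)
key = torch.randn(batch, heads, seq, head_dim)

# Dividing by sqrt(dimension) keeps the logits in a range where softmax stays well-behaved.
scores = query @ key.transpose(-1, -2) / head_dim**0.5
attn_weights = scores.softmax(dim=-1)
print(attn_weights.shape)  # torch.Size([1, 4, 8, 8])
```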
@@ -164,7 +164,6 @@ class ConditionalDetrConfig(PretrainedConfig):
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
scale_embedding=False,
auxiliary_loss=False,
position_embedding_type="sine",
backbone="resnet50",
@@ -213,7 +212,6 @@ class ConditionalDetrConfig(PretrainedConfig):
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
self.auxiliary_loss = auxiliary_loss
self.position_embedding_type = position_embedding_type
self.backbone = backbone
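The `scale_embedding` flag dropped here (and from the DETR and Table Transformer configs below) corresponds to the classic Transformer trick of multiplying token embeddings by sqrt(d_model), as the inline comment notes. A generic sketch of that behaviour, not taken from the DETR code:

```python
# Generic embedding scaling as described by the removed flag; values are illustrative.
import math

import torch
import torch.nn as nn

d_model = 256
embedding = nn.Embedding(1000, d_model)
token_ids = torch.tensor([[1, 2, 3]])

scale_embedding = True  # the flag these configs no longer expose
embed_scale = math.sqrt(d_model) if scale_embedding else 1.0
hidden_states = embedding(token_ids) * embed_scale
print(hidden_states.shape)  # torch.Size([1, 3, 256])
```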
@@ -57,8 +57,6 @@ class DecisionTransformerConfig(PretrainedConfig):
n_positions (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
n_embd (`int`, *optional*, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 12):
@@ -119,7 +117,6 @@ class DecisionTransformerConfig(PretrainedConfig):
action_tanh=True,
vocab_size=1,
n_positions=1024,
n_embd=768,
n_layer=3,
n_head=1,
n_inner=None,
@@ -145,7 +142,6 @@ class DecisionTransformerConfig(PretrainedConfig):
self.action_tanh = action_tanh
self.vocab_size = vocab_size
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.n_inner = n_inner
@@ -161,7 +161,6 @@ class DetrConfig(PretrainedConfig):
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
scale_embedding=False,
auxiliary_loss=False,
position_embedding_type="sine",
backbone="resnet50",
@@ -209,7 +208,6 @@ class DetrConfig(PretrainedConfig):
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
self.auxiliary_loss = auxiliary_loss
self.position_embedding_type = position_embedding_type
self.backbone = backbone
@@ -75,8 +75,6 @@ class FunnelConfig(PretrainedConfig):
The dropout probability for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability used between the two layers of the feed-forward blocks.
type_vocab_size (`int`, *optional*, defaults to 3):
The vocabulary size of the `token_type_ids` passed when calling [`FunnelModel`] or [`TFFunnelModel`].
initializer_range (`float`, *optional*, defaults to 0.1):
The upper bound of the *uniform initializer* for initializing all weight matrices in attention layers.
initializer_std (`float`, *optional*):
@@ -118,7 +116,6 @@ class FunnelConfig(PretrainedConfig):
hidden_dropout=0.1,
attention_dropout=0.1,
activation_dropout=0.0,
type_vocab_size=3,
initializer_range=0.1,
initializer_std=None,
layer_norm_eps=1e-9,
@@ -144,7 +141,6 @@ class FunnelConfig(PretrainedConfig):
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.initializer_std = initializer_std
self.layer_norm_eps = layer_norm_eps
@@ -28,7 +28,6 @@ GIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
}
# Copied from transformers.models.clip.configuration_clip.CLIPVisionConfig with CLIPVision->GitVision, CLIP->GIT, clip->git, openai/git-vit-base-patch32->microsoft/git-base, 32->16
class GitVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`GitVisionModel`]. It is used to instantiate a GIT
@@ -61,9 +60,6 @@ class GitVisionConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
Example:
@@ -86,7 +82,6 @@ class GitVisionConfig(PretrainedConfig):
self,
hidden_size=768,
intermediate_size=3072,
projection_dim=512,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
@@ -96,21 +91,18 @@ class GitVisionConfig(PretrainedConfig):
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
**kwargs
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
@@ -63,8 +63,6 @@ class GPTNeoXJapaneseConfig(PretrainedConfig):
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
weight_tying (`bool`, *optional*, defaults to `True`):
Whether or not to use weight tying between the input and output embedding weights
attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
hidden_dropout (`float`, *optional*, defaults to 0.0):
@@ -101,7 +99,6 @@ class GPTNeoXJapaneseConfig(PretrainedConfig):
use_cache=True,
bos_token_id=31996,
eos_token_id=31999,
weight_tying=True,
attention_dropout=0.1,
hidden_dropout=0.0,
**kwargs
@@ -119,6 +116,5 @@ class GPTNeoXJapaneseConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.weight_tying = weight_tying
self.attention_dropout = attention_dropout
self.hidden_dropout = hidden_dropout
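For reference, the `weight_tying` option removed above referred to sharing the input embedding matrix with the output projection. A toy PyTorch sketch of the general idea, unrelated to the actual GPTNeoXJapanese implementation:

```python
# Toy model demonstrating weight tying between input embedding and output head.
import torch
import torch.nn as nn


class TinyTiedLM(nn.Module):
    def __init__(self, vocab_size=1000, hidden_size=64):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
        # Weight tying: both modules share the same parameter tensor.
        self.lm_head.weight = self.embed.weight

    def forward(self, input_ids):
        return self.lm_head(self.embed(input_ids))


model = TinyTiedLM()
assert model.lm_head.weight.data_ptr() == model.embed.weight.data_ptr()
print(model(torch.tensor([[1, 2, 3]])).shape)  # torch.Size([1, 3, 1000])
```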
@@ -68,8 +68,6 @@ class GPTJConfig(PretrainedConfig):
The epsilon to use in the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
scale_attn_weights (`bool`, *optional*, defaults to `True`):
Scale attention weights by dividing by sqrt(hidden_size).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
@@ -110,7 +108,6 @@ class GPTJConfig(PretrainedConfig):
attn_pdrop=0.0,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
scale_attn_weights=True,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
@@ -130,7 +127,6 @@ class GPTJConfig(PretrainedConfig):
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.bos_token_id = bos_token_id
@@ -79,8 +79,6 @@ class GraphormerConfig(PretrainedConfig):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the attention weights.
activation_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability after activation in the FFN.
layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
@@ -152,7 +150,6 @@ class GraphormerConfig(PretrainedConfig):
num_attention_heads: int = 32,
dropout: float = 0.1,
attention_dropout: float = 0.1,
activation_dropout: float = 0.1,
layerdrop: float = 0.0,
encoder_normalize_before: bool = False,
pre_layernorm: bool = False,
@@ -191,7 +188,6 @@ class GraphormerConfig(PretrainedConfig):
self.num_attention_heads = num_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.layerdrop = layerdrop
self.encoder_normalize_before = encoder_normalize_before
self.pre_layernorm = pre_layernorm
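Note that `layerdrop` stays in GraphormerConfig while `activation_dropout` goes; per the LayerDrop paper linked in the docstring above, it is the probability of skipping an entire encoder layer during training. A minimal sketch of the mechanism, not Graphormer's actual forward pass:

```python
# Minimal LayerDrop sketch: each layer is skipped with probability `layerdrop` at train time.
import torch
import torch.nn as nn

layers = nn.ModuleList(nn.Linear(16, 16) for _ in range(6))
layerdrop = 0.1


def encode(hidden, training=True):
    for layer in layers:
        if training and torch.rand(1).item() < layerdrop:
            continue  # drop the whole layer for this forward pass
        hidden = layer(hidden)
    return hidden


print(encode(torch.randn(2, 16)).shape)  # torch.Size([2, 16])
```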
@@ -465,8 +465,8 @@ class LEDEncoderSelfAttention(nn.Module):
query = query.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
key = key.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
query = self._chunk(query, window_overlap, self.config.__dict__.get("onnx_export", False))
key = self._chunk(key, window_overlap, self.config.__dict__.get("onnx_export", False))
query = self._chunk(query, window_overlap, getattr(self.config, "onnx_export", False))
key = self._chunk(key, window_overlap, getattr(self.config, "onnx_export", False))
# matrix multiplication
# bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
@@ -836,8 +836,8 @@ class LongformerSelfAttention(nn.Module):
query = query.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
key = key.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
query = self._chunk(query, window_overlap, self.config.__dict__.get("onnx_export", False))
key = self._chunk(key, window_overlap, self.config.__dict__.get("onnx_export", False))
query = self._chunk(query, window_overlap, getattr(self.config, "onnx_export", False))
key = self._chunk(key, window_overlap, getattr(self.config, "onnx_export", False))
# matrix multiplication
# bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
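The two hunks above (LED and Longformer) replace `self.config.__dict__.get("onnx_export", False)` with `getattr(self.config, "onnx_export", False)`. The difference is that `__dict__.get` only sees attributes set directly on the instance, while `getattr` follows normal attribute lookup and also finds class-level attributes and properties. A standalone illustration with a stand-in class, not the real config:

```python
# Stand-in class illustrating why getattr is the more robust lookup here.
class DummyConfig:
    onnx_export = True  # class attribute, never written to the instance __dict__


cfg = DummyConfig()

print(cfg.__dict__.get("onnx_export", False))  # False -- the instance dict is empty
print(getattr(cfg, "onnx_export", False))      # True  -- normal attribute lookup finds it
```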
@@ -21,8 +21,6 @@ class NezhaConfig(PretrainedConfig):
vocab_size (`int`, optional, defaults to 21128):
Vocabulary size of the NEZHA model. Defines the different tokens that can be represented by the
*inputs_ids* passed to the forward method of [`NezhaModel`].
embedding_size (`int`, optional, defaults to 128):
Dimensionality of vocabulary embeddings.
hidden_size (`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, optional, defaults to 12):
@@ -71,7 +69,6 @@ class NezhaConfig(PretrainedConfig):
def __init__(
self,
vocab_size=21128,
embedding_size=128,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
@@ -94,7 +91,6 @@ class NezhaConfig(PretrainedConfig):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
@@ -223,7 +223,7 @@ class NezhaEmbeddings(nn.Module):
class NezhaSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
@@ -189,7 +189,6 @@ class SEWDConfig(PretrainedConfig):
attention_dropout=0.1,
feat_proj_dropout=0.0,
final_dropout=0.1,
layerdrop=0.1,
initializer_range=0.02,
layer_norm_eps=1e-7,
feature_layer_norm_eps=1e-5,
@@ -244,7 +243,6 @@ class SEWDConfig(PretrainedConfig):
self.activation_dropout = activation_dropout
self.feat_proj_dropout = feat_proj_dropout
self.final_dropout = final_dropout
self.layerdrop = layerdrop
self.layer_norm_eps = layer_norm_eps
self.feature_layer_norm_eps = feature_layer_norm_eps
self.initializer_range = initializer_range
@@ -163,7 +163,6 @@ class TableTransformerConfig(PretrainedConfig):
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
scale_embedding=False,
auxiliary_loss=False,
position_embedding_type="sine",
backbone="resnet50",
@@ -211,7 +210,6 @@ class TableTransformerConfig(PretrainedConfig):
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
self.auxiliary_loss = auxiliary_loss
self.position_embedding_type = position_embedding_type
self.backbone = backbone
@@ -77,8 +77,6 @@ class TrajectoryTransformerConfig(PretrainedConfig):
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling [`TrajectoryTransformerModel`]
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
@@ -128,7 +126,6 @@ class TrajectoryTransformerConfig(PretrainedConfig):
resid_pdrop=0.1,
learning_rate=0.0006,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
kaiming_initializer_range=1,
@@ -155,7 +152,6 @@ class TrajectoryTransformerConfig(PretrainedConfig):
self.attn_pdrop = attn_pdrop
self.resid_pdrop = resid_pdrop
self.initializer_range = initializer_range
self.type_vocab_size = type_vocab_size
self.layer_norm_eps = layer_norm_eps
self.kaiming_initializer_range = kaiming_initializer_range
self.use_cache = use_cache