Unverified commit f726d53e authored by Yih-Dar, committed by GitHub

Remove more unused attributes in config classes (#21392)



* Remove unused type_vocab_size

* Remove unused initializer_factor

* Remove unused n_embd

* Remove unused scale_embedding

* Remove unused scale_attn_weights

* fix

* fix

* Remove unused head_hidden_scale

* Remove unused activation_dropout

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 3560ae6d
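The attributes dropped in this commit were accepted by the config constructors but never read by the corresponding modeling code. Because `PretrainedConfig` routes unrecognized keyword arguments through `**kwargs`, configs that were saved while these attributes still existed should continue to load. The sketch below illustrates that expectation for `BlipTextConfig`; it is not part of the commit, and the exact handling of extra keys is an assumption to verify against the installed transformers version.

# Minimal sketch (not part of this commit): a config dict that still carries the
# removed `initializer_factor` key should still construct, since PretrainedConfig
# forwards unknown keys through **kwargs.
from transformers import BlipTextConfig

legacy_dict = {"hidden_size": 768, "initializer_factor": 1.0}  # key removed by this commit
config = BlipTextConfig(**legacy_dict)

print(config.hidden_size)                                 # 768
print(getattr(config, "initializer_factor", "not kept"))  # stored on the object but no longer used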
@@ -83,9 +83,6 @@ class BlipTextConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- initializer_factor (`float``, *optional*, defaults to 1):
- A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
- testing).
bos_token_id (`int`, *optional*, defaults to 30522):
The id of the `beginning-of-sequence` token.
eos_token_id (`int`, *optional*, defaults to 2):
@@ -130,7 +127,6 @@ class BlipTextConfig(PretrainedConfig):
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
- initializer_factor=1.0,
bos_token_id=30522,
eos_token_id=2,
pad_token_id=0,
@@ -159,7 +155,6 @@ class BlipTextConfig(PretrainedConfig):
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.initializer_range = initializer_range
- self.initializer_factor = initializer_factor
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.is_decoder = is_decoder
self.use_cache = use_cache
@@ -215,9 +210,6 @@ class BlipVisionConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- initializer_factor (`float``, *optional*, defaults to 1):
- A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
- testing).
Example:
@@ -250,7 +242,6 @@ class BlipVisionConfig(PretrainedConfig):
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=1e-10,
- initializer_factor=1.0,
**kwargs
):
super().__init__(**kwargs)
@@ -264,7 +255,6 @@ class BlipVisionConfig(PretrainedConfig):
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
- self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
...
@@ -260,8 +260,6 @@ class BridgeTowerConfig(PretrainedConfig):
Args:
share_cross_modal_transformer_layers (`bool`, *optional*, defaults to `True`):
Whether cross modal transformer layers are shared.
- head_hidden_scale (`int`, *optional*, defaults to 2):
- Scale of hidden layers head.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler.
hidden_size (`int`, *optional*, defaults to 768):
@@ -307,7 +305,6 @@ class BridgeTowerConfig(PretrainedConfig):
def __init__(
self,
share_cross_modal_transformer_layers=True,
- head_hidden_scale=2,
hidden_act="gelu",
hidden_size=768,
initializer_factor=1,
@@ -324,7 +321,6 @@ class BridgeTowerConfig(PretrainedConfig):
):
super().__init__(**kwargs)
self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers
- self.head_hidden_scale = head_hidden_scale
self.hidden_act = hidden_act
self.hidden_size = hidden_size
self.initializer_factor = initializer_factor
...
@@ -79,8 +79,6 @@ class CodeGenConfig(PretrainedConfig):
The epsilon to use in the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- scale_attn_weights (`bool`, *optional*, defaults to `True`):
- Scale attention weights by dividing by sqrt(hidden_size).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
@@ -122,7 +120,6 @@ class CodeGenConfig(PretrainedConfig):
attn_pdrop=0.0,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
- scale_attn_weights=True,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
@@ -143,7 +140,6 @@ class CodeGenConfig(PretrainedConfig):
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
- self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.bos_token_id = bos_token_id
...
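`scale_attn_weights` was documented as "Scale attention weights by dividing by sqrt(hidden_size)", but per this commit the flag is unused: the attention implementation applies its usual scaling on its own, so the config attribute was never consulted (the same flag is removed from GPTJConfig further down). The snippet below is a generic sketch of the scaling the flag referred to, not the CodeGen or GPT-J kernel.

# Generic scaled dot-product attention scores: raw scores divided by sqrt(head_dim).
import math

import torch


def scaled_attention_scores(query: torch.Tensor, key: torch.Tensor) -> torch.Tensor:
    # query, key: (batch, num_heads, seq_len, head_dim)
    head_dim = query.size(-1)
    scores = query @ key.transpose(-2, -1)  # (batch, num_heads, seq_len, seq_len)
    return scores / math.sqrt(head_dim)     # the division the removed flag nominally toggled


scores = scaled_attention_scores(torch.randn(1, 2, 4, 8), torch.randn(1, 2, 4, 8))
print(scores.shape)  # torch.Size([1, 2, 4, 4])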
@@ -164,7 +164,6 @@ class ConditionalDetrConfig(PretrainedConfig):
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
- scale_embedding=False,
auxiliary_loss=False,
position_embedding_type="sine",
backbone="resnet50",
@@ -213,7 +212,6 @@ class ConditionalDetrConfig(PretrainedConfig):
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.num_hidden_layers = encoder_layers
- self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
self.auxiliary_loss = auxiliary_loss
self.position_embedding_type = position_embedding_type
self.backbone = backbone
...
@@ -57,8 +57,6 @@ class DecisionTransformerConfig(PretrainedConfig):
n_positions (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- n_embd (`int`, *optional*, defaults to 768):
- Dimensionality of the embeddings and hidden states.
n_layer (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 12):
@@ -119,7 +117,6 @@ class DecisionTransformerConfig(PretrainedConfig):
action_tanh=True,
vocab_size=1,
n_positions=1024,
- n_embd=768,
n_layer=3,
n_head=1,
n_inner=None,
@@ -145,7 +142,6 @@ class DecisionTransformerConfig(PretrainedConfig):
self.action_tanh = action_tanh
self.vocab_size = vocab_size
self.n_positions = n_positions
- self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.n_inner = n_inner
...
@@ -161,7 +161,6 @@ class DetrConfig(PretrainedConfig):
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
- scale_embedding=False,
auxiliary_loss=False,
position_embedding_type="sine",
backbone="resnet50",
@@ -209,7 +208,6 @@ class DetrConfig(PretrainedConfig):
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.num_hidden_layers = encoder_layers
- self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
self.auxiliary_loss = auxiliary_loss
self.position_embedding_type = position_embedding_type
self.backbone = backbone
...
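The inline comment on the removed line describes what `scale_embedding` was meant to do: multiply the embeddings by sqrt(d_model), as in the original Transformer. Per the commit the DETR-family models never read the flag, so it is dropped from ConditionalDetrConfig (above), DetrConfig, and TableTransformerConfig (below). The snippet is an illustration of that scaling only; the module and sizes are made up for the example.

# Illustration only: the sqrt(d_model) scaling that scale_embedding=True would have enabled.
import math

import torch
import torch.nn as nn

d_model = 256
embeddings = nn.Embedding(100, d_model)
token_ids = torch.tensor([[1, 2, 3]])

scale = math.sqrt(d_model)                # the factor named in the removed comment
scaled = embeddings(token_ids) * scale    # embeddings multiplied by sqrt(d_model)
print(scaled.shape)                       # torch.Size([1, 3, 256])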
@@ -75,8 +75,6 @@ class FunnelConfig(PretrainedConfig):
The dropout probability for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability used between the two layers of the feed-forward blocks.
- type_vocab_size (`int`, *optional*, defaults to 3):
- The vocabulary size of the `token_type_ids` passed when calling [`FunnelModel`] or [`TFFunnelModel`].
initializer_range (`float`, *optional*, defaults to 0.1):
The upper bound of the *uniform initializer* for initializing all weight matrices in attention layers.
initializer_std (`float`, *optional*):
@@ -118,7 +116,6 @@ class FunnelConfig(PretrainedConfig):
hidden_dropout=0.1,
attention_dropout=0.1,
activation_dropout=0.0,
- type_vocab_size=3,
initializer_range=0.1,
initializer_std=None,
layer_norm_eps=1e-9,
@@ -144,7 +141,6 @@ class FunnelConfig(PretrainedConfig):
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
- self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.initializer_std = initializer_std
self.layer_norm_eps = layer_norm_eps
...
@@ -28,7 +28,6 @@ GIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
}
- # Copied from transformers.models.clip.configuration_clip.CLIPVisionConfig with CLIPVision->GitVision, CLIP->GIT, clip->git, openai/git-vit-base-patch32->microsoft/git-base, 32->16
class GitVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`GitVisionModel`]. It is used to instantiate a GIT
@@ -61,9 +60,6 @@ class GitVisionConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- initializer_factor (`float`, *optional*, defaults to 1):
- A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
- testing).
Example:
@@ -86,7 +82,6 @@ class GitVisionConfig(PretrainedConfig):
self,
hidden_size=768,
intermediate_size=3072,
- projection_dim=512,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
@@ -96,21 +91,18 @@ class GitVisionConfig(PretrainedConfig):
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=0.02,
- initializer_factor=1.0,
**kwargs
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
- self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
- self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
...
@@ -63,8 +63,6 @@ class GPTNeoXJapaneseConfig(PretrainedConfig):
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
- weight_tying (`bool`, *optional*, defaults to `True`):
- Whhether or not use weight tying between input and output embedding weight
attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
hidden_dropout (`float`, *optional*, defaults to 0.0):
@@ -101,7 +99,6 @@ class GPTNeoXJapaneseConfig(PretrainedConfig):
use_cache=True,
bos_token_id=31996,
eos_token_id=31999,
- weight_tying=True,
attention_dropout=0.1,
hidden_dropout=0.0,
**kwargs
@@ -119,6 +116,5 @@ class GPTNeoXJapaneseConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
- self.weight_tying = weight_tying
self.attention_dropout = attention_dropout
self.hidden_dropout = hidden_dropout
@@ -68,8 +68,6 @@ class GPTJConfig(PretrainedConfig):
The epsilon to use in the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- scale_attn_weights (`bool`, *optional*, defaults to `True`):
- Scale attention weights by dividing by sqrt(hidden_size).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
@@ -110,7 +108,6 @@ class GPTJConfig(PretrainedConfig):
attn_pdrop=0.0,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
- scale_attn_weights=True,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
@@ -130,7 +127,6 @@ class GPTJConfig(PretrainedConfig):
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
- self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.bos_token_id = bos_token_id
...
@@ -79,8 +79,6 @@ class GraphormerConfig(PretrainedConfig):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the attention weights.
- activation_dropout (`float`, *optional*, defaults to 0.1):
- The dropout probability after activation in the FFN.
layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
@@ -152,7 +150,6 @@ class GraphormerConfig(PretrainedConfig):
num_attention_heads: int = 32,
dropout: float = 0.1,
attention_dropout: float = 0.1,
- activation_dropout: float = 0.1,
layerdrop: float = 0.0,
encoder_normalize_before: bool = False,
pre_layernorm: bool = False,
@@ -191,7 +188,6 @@ class GraphormerConfig(PretrainedConfig):
self.num_attention_heads = num_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
- self.activation_dropout = activation_dropout
self.layerdrop = layerdrop
self.encoder_normalize_before = encoder_normalize_before
self.pre_layernorm = pre_layernorm
...
@@ -465,8 +465,8 @@ class LEDEncoderSelfAttention(nn.Module):
query = query.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
key = key.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
- query = self._chunk(query, window_overlap, self.config.__dict__.get("onnx_export", False))
+ query = self._chunk(query, window_overlap, getattr(self.config, "onnx_export", False))
- key = self._chunk(key, window_overlap, self.config.__dict__.get("onnx_export", False))
+ key = self._chunk(key, window_overlap, getattr(self.config, "onnx_export", False))
# matrix multiplication
# bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
...
@@ -836,8 +836,8 @@ class LongformerSelfAttention(nn.Module):
query = query.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
key = key.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
- query = self._chunk(query, window_overlap, self.config.__dict__.get("onnx_export", False))
+ query = self._chunk(query, window_overlap, getattr(self.config, "onnx_export", False))
- key = self._chunk(key, window_overlap, self.config.__dict__.get("onnx_export", False))
+ key = self._chunk(key, window_overlap, getattr(self.config, "onnx_export", False))
# matrix multiplication
# bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
...
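Besides deleting dead config attributes, the LED and Longformer hunks above swap `self.config.__dict__.get("onnx_export", False)` for `getattr(self.config, "onnx_export", False)`. The two are not equivalent: `__dict__.get` only sees attributes assigned directly on the instance, while `getattr` also resolves class attributes and properties and still falls back to the default. The classes below are illustrative only, not the actual config objects.

# Illustrative classes (not transformers code) showing the lookup difference
# between obj.__dict__.get(name, default) and getattr(obj, name, default).
class ClassLevelConfig:
    onnx_export = True  # class attribute, never written into the instance __dict__


class InstanceLevelConfig:
    def __init__(self):
        self.onnx_export = True  # instance attribute, visible in __dict__


a, b = ClassLevelConfig(), InstanceLevelConfig()

print(a.__dict__.get("onnx_export", False))  # False: class attribute is invisible to __dict__
print(getattr(a, "onnx_export", False))      # True: getattr follows normal attribute lookup
print(b.__dict__.get("onnx_export", False))  # True
print(getattr(b, "onnx_export", False))      # True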
@@ -21,8 +21,6 @@ class NezhaConfig(PretrainedConfig):
vocab_size (`int`, optional, defaults to 21128):
Vocabulary size of the NEZHA model. Defines the different tokens that can be represented by the
*inputs_ids* passed to the forward method of [`NezhaModel`].
- embedding_size (`int`, optional, defaults to 128):
- Dimensionality of vocabulary embeddings.
hidden_size (`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, optional, defaults to 12):
@@ -71,7 +69,6 @@ class NezhaConfig(PretrainedConfig):
def __init__(
self,
vocab_size=21128,
- embedding_size=128,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
@@ -94,7 +91,6 @@ class NezhaConfig(PretrainedConfig):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
- self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
...
@@ -223,7 +223,7 @@ class NezhaEmbeddings(nn.Module):
class NezhaSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
- if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
...
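With `embedding_size` removed from `NezhaConfig`, the `hasattr(config, "embedding_size")` escape hatch in `NezhaSelfAttention` no longer distinguishes anything, so the divisibility check now applies unconditionally. A small sketch of the constraint it enforces, using the defaults visible in the config hunk above:

# Plain-Python sketch of the check: hidden_size must divide evenly across the
# attention heads so each head gets an integer head_dim. 768 and 12 are the
# NezhaConfig defaults shown in the diff above.
hidden_size, num_attention_heads = 768, 12

if hidden_size % num_attention_heads != 0:
    raise ValueError(
        f"The hidden size ({hidden_size}) is not a multiple of the number of attention "
        f"heads ({num_attention_heads})"
    )

head_dim = hidden_size // num_attention_heads
print(head_dim)  # 64, the per-head width used by the query/key/value projections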
@@ -189,7 +189,6 @@ class SEWDConfig(PretrainedConfig):
attention_dropout=0.1,
feat_proj_dropout=0.0,
final_dropout=0.1,
- layerdrop=0.1,
initializer_range=0.02,
layer_norm_eps=1e-7,
feature_layer_norm_eps=1e-5,
@@ -244,7 +243,6 @@ class SEWDConfig(PretrainedConfig):
self.activation_dropout = activation_dropout
self.feat_proj_dropout = feat_proj_dropout
self.final_dropout = final_dropout
- self.layerdrop = layerdrop
self.layer_norm_eps = layer_norm_eps
self.feature_layer_norm_eps = feature_layer_norm_eps
self.initializer_range = initializer_range
...
@@ -163,7 +163,6 @@ class TableTransformerConfig(PretrainedConfig):
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
- scale_embedding=False,
auxiliary_loss=False,
position_embedding_type="sine",
backbone="resnet50",
@@ -211,7 +210,6 @@ class TableTransformerConfig(PretrainedConfig):
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.num_hidden_layers = encoder_layers
- self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
self.auxiliary_loss = auxiliary_loss
self.position_embedding_type = position_embedding_type
self.backbone = backbone
...
@@ -77,8 +77,6 @@ class TrajectoryTransformerConfig(PretrainedConfig):
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
- type_vocab_size (`int`, *optional*, defaults to 2):
- The vocabulary size of the `token_type_ids` passed when calling [`TrajectoryTransformerModel`]
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
@@ -128,7 +126,6 @@ class TrajectoryTransformerConfig(PretrainedConfig):
resid_pdrop=0.1,
learning_rate=0.0006,
max_position_embeddings=512,
- type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
kaiming_initializer_range=1,
@@ -155,7 +152,6 @@ class TrajectoryTransformerConfig(PretrainedConfig):
self.attn_pdrop = attn_pdrop
self.resid_pdrop = resid_pdrop
self.initializer_range = initializer_range
- self.type_vocab_size = type_vocab_size
self.layer_norm_eps = layer_norm_eps
self.kaiming_initializer_range = kaiming_initializer_range
self.use_cache = use_cache
...