"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "aef3823e1ab5282ba98de6a3fdc78d89dabacd9e"
Unverified commit 212829ad, authored by Yih-Dar and committed by GitHub

Remove more unused attributes in config classes (#21000)



* Remove gradient_checkpointing from MarkupLMConfig

* Remove predict_special_tokens from OpenAIGPTConfig

* Remove enable_cls from RoCBertConfig

* Remove batch_size from TrajectoryTransformerConfig

* Remove searcher_seq_len from RealmConfig

* Remove feat_quantizer_dropout from WavLMConfig

* Remove position_biased_input from SEWDConfig

* Remove max_source_positions from Speech2Text2Config
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent b5be744d
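Each of the attributes listed above was stored on its config but never read by the corresponding modeling code. A quick way to double-check a removal of this kind is to look for `config.<attribute>` reads outside the configuration module itself. The helper below is only an illustrative sketch: the directory layout and the `attribute_is_unused` function are assumptions of this write-up, not part of the commit.

```python
from pathlib import Path


def attribute_is_unused(model_dir: str, attribute: str) -> bool:
    """Return True if no file under `model_dir` (other than the configuration module)
    reads `attribute` off a config object."""
    needle = f"config.{attribute}"
    for source_file in Path(model_dir).rglob("*.py"):
        if source_file.name.startswith("configuration_"):
            continue  # the config module itself may of course mention the attribute
        if needle in source_file.read_text(encoding="utf-8"):
            return False
    return True


# Hypothetical invocation against a transformers checkout (the path is an assumption):
print(attribute_is_unused("src/transformers/models/openai", "predict_special_tokens"))
```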
```diff
@@ -65,8 +65,6 @@ class MarkupLMConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        gradient_checkpointing (`bool`, *optional*, defaults to `False`):
-            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
         max_tree_id_unit_embeddings (`int`, *optional*, defaults to 1024):
             The maximum value that the tree id unit embedding might ever use. Typically set this to something large
             just in case (e.g., 1024).
@@ -119,7 +117,6 @@ class MarkupLMConfig(PretrainedConfig):
         pad_token_id=0,
         bos_token_id=0,
         eos_token_id=2,
-        gradient_checkpointing=False,
         max_xpath_tag_unit_embeddings=256,
         max_xpath_subs_unit_embeddings=1024,
         tag_pad_id=216,
@@ -135,7 +132,6 @@ class MarkupLMConfig(PretrainedConfig):
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
-            gradient_checkpointing=gradient_checkpointing,
             **kwargs,
         )
         self.vocab_size = vocab_size
```
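For MarkupLM the removed flag was dead weight because gradient checkpointing in recent transformers versions is requested on the model, not stored in the config. A minimal sketch of the model-level API (assuming `MarkupLMModel` supports checkpointing; `gradient_checkpointing_enable` raises if a model does not):

```python
from transformers import MarkupLMConfig, MarkupLMModel

config = MarkupLMConfig()
assert not hasattr(config, "gradient_checkpointing")  # no longer part of the config

model = MarkupLMModel(config)

# Trade compute for memory: activations are recomputed during the backward pass.
model.gradient_checkpointing_enable()

# Turn it back off for inference-only runs.
model.gradient_checkpointing_disable()
```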
```diff
@@ -60,8 +60,6 @@ class OpenAIGPTConfig(PretrainedConfig):
             The epsilon to use in the layer normalization layers
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        predict_special_tokens (`bool`, *optional*, defaults to `True`):
-            Whether or not special tokens should be predicted when the model has a language modeling head.
         summary_type (`str`, *optional*, defaults to `"cls_index"`):
             Argument used when doing sequence summary, used in the models [`OpenAIGPTDoubleHeadsModel`] and
             [`OpenAIGPTDoubleHeadsModel`].
@@ -133,7 +131,6 @@ class OpenAIGPTConfig(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
-        predict_special_tokens=True,
         summary_type="cls_index",
         summary_use_proj=True,
         summary_activation=None,
@@ -152,7 +149,6 @@ class OpenAIGPTConfig(PretrainedConfig):
         self.attn_pdrop = attn_pdrop
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
-        self.predict_special_tokens = predict_special_tokens
         self.summary_type = summary_type
         self.summary_use_proj = summary_use_proj
         self.summary_activation = summary_activation
```
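A freshly built config now simply stops serializing the attribute. Older `config.json` files that still contain such a key should keep loading, because `PretrainedConfig` stores unrecognized keys as plain attributes rather than rejecting them; the snippet below is a sketch of that behaviour, not something this commit changes.

```python
from transformers import OpenAIGPTConfig

config = OpenAIGPTConfig()
assert "predict_special_tokens" not in config.to_dict()  # gone from new configs

# A legacy dict that still carries the key loads without error; the leftover key is
# kept as an inert attribute and is never consulted by the modeling code.
legacy = OpenAIGPTConfig.from_dict({"n_layer": 12, "predict_special_tokens": True})
print(legacy.n_layer, getattr(legacy, "predict_special_tokens", None))
```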
```diff
@@ -110,8 +110,6 @@ class RealmConfig(PretrainedConfig):
         searcher_beam_size (`int`, *optional*, defaults to 5000):
             Beam size of the searcher. Note that when eval mode is enabled, *searcher_beam_size* will be the same as
             *reader_beam_size*.
-        searcher_seq_len (`int`, *optional*, defaults to 64):
-            Maximum sequence length of the searcher.

    Example:
@@ -152,7 +150,6 @@ class RealmConfig(PretrainedConfig):
         reader_seq_len=320,  # 288 + 32
         num_block_records=13353718,
         searcher_beam_size=5000,
-        searcher_seq_len=64,
         pad_token_id=1,
         bos_token_id=0,
         eos_token_id=2,
@@ -186,4 +183,3 @@ class RealmConfig(PretrainedConfig):
         # Retrieval config
         self.num_block_records = num_block_records
         self.searcher_beam_size = searcher_beam_size
-        self.searcher_seq_len = searcher_seq_len
```
```diff
@@ -77,8 +77,6 @@ class RoCBertConfig(PretrainedConfig):
             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
         classifier_dropout (`float`, *optional*):
             The dropout ratio for the classification head.
-        enable_cls (`bool`, *optional*, defaults to `True`):
-            Whether or not the model use cls loss when pretrained.
         enable_pronunciation (`bool`, *optional*, defaults to `True`):
             Whether or not the model use pronunciation embed when training.
         enable_shape (`bool`, *optional*, defaults to `True`):
@@ -131,7 +129,6 @@ class RoCBertConfig(PretrainedConfig):
         pad_token_id=0,
         position_embedding_type="absolute",
         classifier_dropout=None,
-        enable_cls=True,
         enable_pronunciation=True,
         enable_shape=True,
         pronunciation_embed_dim=768,
@@ -154,7 +151,6 @@ class RoCBertConfig(PretrainedConfig):
         self.type_vocab_size = type_vocab_size
         self.layer_norm_eps = layer_norm_eps
         self.use_cache = use_cache
-        self.enable_cls = enable_cls
         self.enable_pronunciation = enable_pronunciation
         self.enable_shape = enable_shape
         self.pronunciation_embed_dim = pronunciation_embed_dim
```
```diff
@@ -63,8 +63,6 @@ class SEWDConfig(PretrainedConfig):
             Whether to share attention key with c2p and p2c.
         relative_attention (`bool`, *optional*, defaults to `True`):
             Whether to use relative position encoding.
-        position_biased_input (`bool`, *optional*, defaults to `False`):
-            Whether to add absolute position embedding to content embedding.
         pos_att_type (`Tuple[str]`, *optional*, defaults to `("p2c", "c2p")`):
             The type of relative position attention, it can be a combination of `("p2c", "c2p")`, e.g. `("p2c")`,
             `("p2c", "c2p")`, `("p2c", "c2p")`.
@@ -183,7 +181,6 @@ class SEWDConfig(PretrainedConfig):
         position_buckets=256,
         share_att_key=True,
         relative_attention=True,
-        position_biased_input=False,
         pos_att_type=("p2c", "c2p"),
         norm_rel_ebd="layer_norm",
         hidden_act="gelu_python",
@@ -239,7 +236,6 @@ class SEWDConfig(PretrainedConfig):
         self.share_att_key = share_att_key
         self.relative_attention = relative_attention
         self.norm_rel_ebd = norm_rel_ebd
-        self.position_biased_input = position_biased_input
         self.pos_att_type = list(pos_att_type)
         self.hidden_act = hidden_act
         self.num_attention_heads = num_attention_heads
```
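SEW-D borrows its disentangled attention from DeBERTa-v2, and `position_biased_input` came along for the ride without ever being consulted; the attention knobs the model actually reads are untouched. A quick sanity check (a sketch, assuming a current transformers install):

```python
from transformers import SEWDConfig

# The disentangled-attention settings that reach the model are still configurable.
config = SEWDConfig(relative_attention=True, position_buckets=256, pos_att_type=("p2c", "c2p"))
print(config.pos_att_type)                       # ['p2c', 'c2p'] — stored as a list
print(hasattr(config, "position_biased_input"))  # False after this change
```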
```diff
@@ -68,8 +68,6 @@ class Speech2Text2Config(PretrainedConfig):
             for more details.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
-        max_source_positions (`int`, *optional*, defaults to 6000):
-            The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
         max_target_positions (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -111,7 +109,6 @@ class Speech2Text2Config(PretrainedConfig):
         pad_token_id=1,
         bos_token_id=0,
         eos_token_id=2,
-        max_source_positions=6000,
         max_target_positions=1024,
         **kwargs
     ):
@@ -129,7 +126,6 @@ class Speech2Text2Config(PretrainedConfig):
         self.use_cache = use_cache
         self.num_hidden_layers = decoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
-        self.max_source_positions = max_source_positions
         self.max_target_positions = max_target_positions
         super().__init__(
```
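`max_source_positions` describes how many audio feature frames an encoder may see, but Speech2Text2 is a standalone text decoder that is normally paired with a speech encoder inside `SpeechEncoderDecoderModel`, so the audio length never reaches this config and only `max_target_positions` matters. A sketch of that pairing at the config level (the choice of Wav2Vec2 as the encoder is an assumption for illustration):

```python
from transformers import SpeechEncoderDecoderConfig, Speech2Text2Config, Wav2Vec2Config

# The audio side (and hence the feature-sequence length) is the encoder's business;
# the decoder only needs to bound the length of the generated token sequence.
encoder_config = Wav2Vec2Config()
decoder_config = Speech2Text2Config(max_target_positions=1024)

config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
print(config.decoder.max_target_positions)
```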
```diff
@@ -45,8 +45,6 @@ class TrajectoryTransformerConfig(PretrainedConfig):
         vocab_size (`int`, *optional*, defaults to 100):
             Vocabulary size of the TrajectoryTransformer model. Defines the number of different tokens that can be
             represented by the `trajectories` passed when calling [`TrajectoryTransformerModel`]
-        batch_size (`int`, *optional*, defaults to 256):
-            Size of the batch of trajectories passed to the model.
         action_weight (`int`, *optional*, defaults to 5):
             Weight of the action in the loss function
         reward_weight (`int`, *optional*, defaults to 1):
@@ -115,7 +113,6 @@ class TrajectoryTransformerConfig(PretrainedConfig):
     def __init__(
         self,
         vocab_size=100,
-        batch_size=256,
         action_weight=5,
         reward_weight=1,
         value_weight=1,
@@ -142,7 +139,6 @@ class TrajectoryTransformerConfig(PretrainedConfig):
         **kwargs
     ):
         self.vocab_size = vocab_size
-        self.batch_size = batch_size
         self.action_weight = action_weight
         self.reward_weight = reward_weight
         self.value_weight = value_weight
```
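`batch_size` never described the architecture; how many trajectories are processed at once is a property of the training loop. A hedged sketch of where that number lives instead, using plain PyTorch with made-up tensor shapes:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Fake discretized trajectories: 1024 sequences of 249 tokens from a vocabulary of 100.
trajectories = torch.randint(0, 100, (1024, 249))
dataset = TensorDataset(trajectories)

# The old config default (256) is now just an argument to whoever builds the batches.
loader = DataLoader(dataset, batch_size=256, shuffle=True)

for (batch,) in loader:
    print(batch.shape)  # torch.Size([256, 249]) for full batches
    break
```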
```diff
@@ -75,8 +75,6 @@ class WavLMConfig(PretrainedConfig):
         feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the 1D convolutional layers of the feature
             extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
-        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probabilitiy for quantized feature encoder states.
         conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
@@ -124,8 +122,6 @@ class WavLMConfig(PretrainedConfig):
             Number of codevector groups for product codevector quantization.
         contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
             The temperature *kappa* in the contrastive loss.
-        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probabilitiy for the output of the feature encoder that's used by the quantizer.
         num_negatives (`int`, *optional*, defaults to 100):
             Number of negative samples for the contrastive loss.
         codevector_dim (`int`, *optional*, defaults to 256):
@@ -205,7 +201,6 @@ class WavLMConfig(PretrainedConfig):
         activation_dropout=0.1,
         attention_dropout=0.1,
         feat_proj_dropout=0.0,
-        feat_quantizer_dropout=0.0,
         final_dropout=0.1,
         layerdrop=0.1,
         initializer_range=0.02,
@@ -308,7 +303,6 @@ class WavLMConfig(PretrainedConfig):
         self.num_codevectors_per_group = num_codevectors_per_group
         self.num_codevector_groups = num_codevector_groups
         self.contrastive_logits_temperature = contrastive_logits_temperature
-        self.feat_quantizer_dropout = feat_quantizer_dropout
         self.num_negatives = num_negatives
         self.codevector_dim = codevector_dim
         self.proj_codevector_dim = proj_codevector_dim
```
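The WavLM docstring even documented `feat_quantizer_dropout` twice, yet nothing in the model read it. The dropouts that actually reach the feature projection and the final layers are untouched; a quick check, assuming a recent transformers install:

```python
from transformers import WavLMConfig

config = WavLMConfig(feat_proj_dropout=0.1, final_dropout=0.1)

# The quantizer dropout no longer appears in fresh configs...
assert "feat_quantizer_dropout" not in config.to_dict()

# ...while the dropouts the model actually applies are still configurable.
print(config.feat_proj_dropout, config.final_dropout)
```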