Unverified Commit 212829ad authored by Yih-Dar, committed by GitHub

Remove more unused attributes in config classes (#21000)



* Remove gradient_checkpointing from MarkupLMConfig

* Remove predict_special_tokens from OpenAIGPTConfig

* Remove enable_cls from RoCBertConfig

* Remove batch_size from TrajectoryTransformerConfig

* Remove searcher_seq_len from RealmConfig

* Remove feat_quantizer_dropout from WavLMConfig

* Remove position_biased_input from SEWDConfig

* Remove max_source_positions from Speech2Text2Config
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent b5be744d
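None of the removed arguments is read by the corresponding modeling code, so the change only affects which attributes the config objects expose. A minimal sketch of what that means in practice, using two of the configs touched here (illustrative only, not part of the diff):

```python
from transformers import OpenAIGPTConfig, WavLMConfig

# Illustrative only: after this commit the removed arguments are gone from the
# config signatures, so default-constructed configs never set them.
for config, removed_attr in [
    (OpenAIGPTConfig(), "predict_special_tokens"),
    (WavLMConfig(), "feat_quantizer_dropout"),
]:
    assert not hasattr(config, removed_attr)
```

Code that still passes one of these names explicitly is expected to be absorbed through `**kwargs` by the usual `PretrainedConfig` handling rather than raise an error, but the value no longer has any effect.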
@@ -65,8 +65,6 @@ class MarkupLMConfig(PretrainedConfig):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
gradient_checkpointing (`bool`, *optional*, defaults to `False`):
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
max_tree_id_unit_embeddings (`int`, *optional*, defaults to 1024):
The maximum value that the tree id unit embedding might ever use. Typically set this to something large
just in case (e.g., 1024).
@@ -119,7 +117,6 @@ class MarkupLMConfig(PretrainedConfig):
pad_token_id=0,
bos_token_id=0,
eos_token_id=2,
gradient_checkpointing=False,
max_xpath_tag_unit_embeddings=256,
max_xpath_subs_unit_embeddings=1024,
tag_pad_id=216,
@@ -135,7 +132,6 @@ class MarkupLMConfig(PretrainedConfig):
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
gradient_checkpointing=gradient_checkpointing,
**kwargs,
)
self.vocab_size = vocab_size
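Note that dropping `gradient_checkpointing` from `MarkupLMConfig` does not remove the feature itself: gradient checkpointing is toggled on the model instance via the generic `PreTrainedModel` API rather than through a config flag. A minimal sketch, assuming that generic API and using a randomly initialized model purely for illustration:

```python
from transformers import MarkupLMConfig, MarkupLMModel

config = MarkupLMConfig()      # gradient_checkpointing is no longer part of the signature
model = MarkupLMModel(config)  # randomly initialized, for illustration only

# Gradient checkpointing is controlled on the model, not via the config.
if getattr(model, "supports_gradient_checkpointing", False):
    model.gradient_checkpointing_enable()
```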
@@ -60,8 +60,6 @@ class OpenAIGPTConfig(PretrainedConfig):
The epsilon to use in the layer normalization layers
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (`bool`, *optional*, defaults to `True`):
Whether or not special tokens should be predicted when the model has a language modeling head.
summary_type (`str`, *optional*, defaults to `"cls_index"`):
Argument used when doing sequence summary, used in the models [`OpenAIGPTDoubleHeadsModel`] and
[`TFOpenAIGPTDoubleHeadsModel`].
@@ -133,7 +131,6 @@ class OpenAIGPTConfig(PretrainedConfig):
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
predict_special_tokens=True,
summary_type="cls_index",
summary_use_proj=True,
summary_activation=None,
@@ -152,7 +149,6 @@ class OpenAIGPTConfig(PretrainedConfig):
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.predict_special_tokens = predict_special_tokens
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
@@ -110,8 +110,6 @@ class RealmConfig(PretrainedConfig):
searcher_beam_size (`int`, *optional*, defaults to 5000):
Beam size of the searcher. Note that when eval mode is enabled, *searcher_beam_size* will be the same as
*reader_beam_size*.
searcher_seq_len (`int`, *optional*, defaults to 64):
Maximum sequence length of the searcher.
Example:
@@ -152,7 +150,6 @@ class RealmConfig(PretrainedConfig):
reader_seq_len=320, # 288 + 32
num_block_records=13353718,
searcher_beam_size=5000,
searcher_seq_len=64,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
@@ -186,4 +183,3 @@ class RealmConfig(PretrainedConfig):
# Retrieval config
self.num_block_records = num_block_records
self.searcher_beam_size = searcher_beam_size
self.searcher_seq_len = searcher_seq_len
@@ -77,8 +77,6 @@ class RoCBertConfig(PretrainedConfig):
with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
enable_cls (`bool`, *optional*, defaults to `True`):
Whether or not the model uses the cls loss when pretrained.
enable_pronunciation (`bool`, *optional*, defaults to `True`):
Whether or not the model uses the pronunciation embedding when training.
enable_shape (`bool`, *optional*, defaults to `True`):
@@ -131,7 +129,6 @@ class RoCBertConfig(PretrainedConfig):
pad_token_id=0,
position_embedding_type="absolute",
classifier_dropout=None,
enable_cls=True,
enable_pronunciation=True,
enable_shape=True,
pronunciation_embed_dim=768,
@@ -154,7 +151,6 @@ class RoCBertConfig(PretrainedConfig):
self.type_vocab_size = type_vocab_size
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.enable_cls = enable_cls
self.enable_pronunciation = enable_pronunciation
self.enable_shape = enable_shape
self.pronunciation_embed_dim = pronunciation_embed_dim
@@ -63,8 +63,6 @@ class SEWDConfig(PretrainedConfig):
Whether to share attention key with c2p and p2c.
relative_attention (`bool`, *optional*, defaults to `True`):
Whether to use relative position encoding.
position_biased_input (`bool`, *optional*, defaults to `False`):
Whether to add absolute position embedding to content embedding.
pos_att_type (`Tuple[str]`, *optional*, defaults to `("p2c", "c2p")`):
The type of relative position attention. It can be a combination of `("p2c", "c2p")`, e.g. `("p2c")` or
`("p2c", "c2p")`.
@@ -183,7 +181,6 @@ class SEWDConfig(PretrainedConfig):
position_buckets=256,
share_att_key=True,
relative_attention=True,
position_biased_input=False,
pos_att_type=("p2c", "c2p"),
norm_rel_ebd="layer_norm",
hidden_act="gelu_python",
@@ -239,7 +236,6 @@ class SEWDConfig(PretrainedConfig):
self.share_att_key = share_att_key
self.relative_attention = relative_attention
self.norm_rel_ebd = norm_rel_ebd
self.position_biased_input = position_biased_input
self.pos_att_type = list(pos_att_type)
self.hidden_act = hidden_act
self.num_attention_heads = num_attention_heads
@@ -68,8 +68,6 @@ class Speech2Text2Config(PretrainedConfig):
for more details.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
max_source_positions (`int`, *optional*, defaults to 6000):
The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
max_target_positions (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
@@ -111,7 +109,6 @@ class Speech2Text2Config(PretrainedConfig):
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
max_source_positions=6000,
max_target_positions=1024,
**kwargs
):
@@ -129,7 +126,6 @@ class Speech2Text2Config(PretrainedConfig):
self.use_cache = use_cache
self.num_hidden_layers = decoder_layers
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
self.max_source_positions = max_source_positions
self.max_target_positions = max_target_positions
super().__init__(
@@ -45,8 +45,6 @@ class TrajectoryTransformerConfig(PretrainedConfig):
vocab_size (`int`, *optional*, defaults to 100):
Vocabulary size of the TrajectoryTransformer model. Defines the number of different tokens that can be
represented by the `trajectories` passed when calling [`TrajectoryTransformerModel`]
batch_size (`int`, *optional*, defaults to 256):
Size of the batch of trajectories passed to the model.
action_weight (`int`, *optional*, defaults to 5):
Weight of the action in the loss function
reward_weight (`int`, *optional*, defaults to 1):
@@ -115,7 +113,6 @@ class TrajectoryTransformerConfig(PretrainedConfig):
def __init__(
self,
vocab_size=100,
batch_size=256,
action_weight=5,
reward_weight=1,
value_weight=1,
@@ -142,7 +139,6 @@ class TrajectoryTransformerConfig(PretrainedConfig):
**kwargs
):
self.vocab_size = vocab_size
self.batch_size = batch_size
self.action_weight = action_weight
self.reward_weight = reward_weight
self.value_weight = value_weight
@@ -75,8 +75,6 @@ class WavLMConfig(PretrainedConfig):
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for quantized feature encoder states.
conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
@@ -124,8 +122,6 @@ class WavLMConfig(PretrainedConfig):
Number of codevector groups for product codevector quantization.
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
The temperature *kappa* in the contrastive loss.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the output of the feature encoder that's used by the quantizer.
num_negatives (`int`, *optional*, defaults to 100):
Number of negative samples for the contrastive loss.
codevector_dim (`int`, *optional*, defaults to 256):
@@ -205,7 +201,6 @@ class WavLMConfig(PretrainedConfig):
activation_dropout=0.1,
attention_dropout=0.1,
feat_proj_dropout=0.0,
feat_quantizer_dropout=0.0,
final_dropout=0.1,
layerdrop=0.1,
initializer_range=0.02,
@@ -308,7 +303,6 @@ class WavLMConfig(PretrainedConfig):
self.num_codevectors_per_group = num_codevectors_per_group
self.num_codevector_groups = num_codevector_groups
self.contrastive_logits_temperature = contrastive_logits_temperature
self.feat_quantizer_dropout = feat_quantizer_dropout
self.num_negatives = num_negatives
self.codevector_dim = codevector_dim
self.proj_codevector_dim = proj_codevector_dim