Unverified Commit 8e5d1619 authored by Sylvain Gugger's avatar Sylvain Gugger Committed by GitHub
Browse files

Clean load keys (#24505)

* Preliminary work on some models

* Fix test load missing and make sure nonpersistent buffers are tested

* Always ignore nonpersistent buffers if in state_dict

* Treat models

* More models

* Treat remaining models

* Fix quality

* Fix tests

* Remove draft

* This test is not needed anymore

* Fix copies

* Fix last test

* Newly added models

* Fix last tests

* Address review comments
parent 53194991
......@@ -80,7 +80,9 @@ class RobertaEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
......@@ -614,15 +616,6 @@ class RobertaPreTrainedModel(PreTrainedModel):
if isinstance(module, RobertaEncoder):
module.gradient_checkpointing = value
def update_keys_to_ignore(self, config, del_keys_to_ignore):
"""Remove some keys from ignore list"""
if not config.tie_word_embeddings:
# must make a new list, or the class variable gets modified!
self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore]
self._keys_to_ignore_on_load_missing = [
k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore
]
ROBERTA_START_DOCSTRING = r"""
......@@ -711,8 +704,6 @@ class RobertaModel(RobertaPreTrainedModel):
"""
_keys_to_ignore_on_load_missing = [r"position_ids"]
# Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
......@@ -881,9 +872,6 @@ class RobertaModel(RobertaPreTrainedModel):
"""RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.""", ROBERTA_START_DOCSTRING
)
class RobertaForCausalLM(RobertaPreTrainedModel):
_keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
......@@ -895,9 +883,6 @@ class RobertaForCausalLM(RobertaPreTrainedModel):
self.roberta = RobertaModel(config, add_pooling_layer=False)
self.lm_head = RobertaLMHead(config)
# The LM head weights require special treatment only when they are tied with the word embeddings
self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
# Initialize weights and apply final processing
self.post_init()
......@@ -1036,9 +1021,6 @@ class RobertaForCausalLM(RobertaPreTrainedModel):
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING)
class RobertaForMaskedLM(RobertaPreTrainedModel):
_keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
......@@ -1053,9 +1035,6 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
self.roberta = RobertaModel(config, add_pooling_layer=False)
self.lm_head = RobertaLMHead(config)
# The LM head weights require special treatment only when they are tied with the word embeddings
self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
# Initialize weights and apply final processing
self.post_init()
......@@ -1173,8 +1152,6 @@ class RobertaLMHead(nn.Module):
ROBERTA_START_DOCSTRING,
)
class RobertaForSequenceClassification(RobertaPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
......@@ -1274,8 +1251,6 @@ class RobertaForSequenceClassification(RobertaPreTrainedModel):
ROBERTA_START_DOCSTRING,
)
class RobertaForMultipleChoice(RobertaPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
......@@ -1368,9 +1343,6 @@ class RobertaForMultipleChoice(RobertaPreTrainedModel):
ROBERTA_START_DOCSTRING,
)
class RobertaForTokenClassification(RobertaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
......@@ -1478,9 +1450,6 @@ class RobertaClassificationHead(nn.Module):
ROBERTA_START_DOCSTRING,
)
class RobertaForQuestionAnswering(RobertaPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
......
......@@ -83,7 +83,9 @@ class RobertaPreLayerNormEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
......@@ -617,15 +619,6 @@ class RobertaPreLayerNormPreTrainedModel(PreTrainedModel):
if isinstance(module, RobertaPreLayerNormEncoder):
module.gradient_checkpointing = value
def update_keys_to_ignore(self, config, del_keys_to_ignore):
"""Remove some keys from ignore list"""
if not config.tie_word_embeddings:
# must make a new list, or the class variable gets modified!
self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore]
self._keys_to_ignore_on_load_missing = [
k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore
]
ROBERTA_PRELAYERNORM_START_DOCSTRING = r"""
......@@ -714,8 +707,6 @@ class RobertaPreLayerNormModel(RobertaPreLayerNormPreTrainedModel):
"""
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
......@@ -886,9 +877,6 @@ class RobertaPreLayerNormModel(RobertaPreLayerNormPreTrainedModel):
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with roberta-base->andreasmadsen/efficient_mlm_m0.40,ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm, RobertaPreLayerNormTokenizer->RobertaTokenizer
class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel):
_keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
......@@ -902,9 +890,6 @@ class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel):
self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)
self.lm_head = RobertaPreLayerNormLMHead(config)
# The LM head weights require special treatment only when they are tied with the word embeddings
self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
# Initialize weights and apply final processing
self.post_init()
......@@ -1045,9 +1030,6 @@ class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel):
"""RoBERTa-PreLayerNorm Model with a `language modeling` head on top.""", ROBERTA_PRELAYERNORM_START_DOCSTRING
)
class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel):
_keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.__init__ with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
......@@ -1063,9 +1045,6 @@ class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel):
self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)
self.lm_head = RobertaPreLayerNormLMHead(config)
# The LM head weights require special treatment only when they are tied with the word embeddings
self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
# Initialize weights and apply final processing
self.post_init()
......@@ -1185,8 +1164,6 @@ class RobertaPreLayerNormLMHead(nn.Module):
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class RobertaPreLayerNormForSequenceClassification(RobertaPreLayerNormPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
......@@ -1286,8 +1263,6 @@ class RobertaPreLayerNormForSequenceClassification(RobertaPreLayerNormPreTrained
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
class RobertaPreLayerNormForMultipleChoice(RobertaPreLayerNormPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
......@@ -1382,9 +1357,6 @@ class RobertaPreLayerNormForMultipleChoice(RobertaPreLayerNormPreTrainedModel):
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class RobertaPreLayerNormForTokenClassification(RobertaPreLayerNormPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
......@@ -1492,9 +1464,6 @@ class RobertaPreLayerNormClassificationHead(nn.Module):
ROBERTA_PRELAYERNORM_START_DOCSTRING,
)
class RobertaPreLayerNormForQuestionAnswering(RobertaPreLayerNormPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
......
......@@ -190,7 +190,9 @@ class RoCBertEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"token_type_ids",
......@@ -777,7 +779,6 @@ class RoCBertPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_roc_bert
base_model_prefix = "roc_bert"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
......@@ -1081,7 +1082,6 @@ class RoCBertModel(RoCBertPreTrainedModel):
ROC_BERT_START_DOCSTRING,
)
class RoCBertForPreTraining(RoCBertPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
......@@ -1267,8 +1267,6 @@ class RoCBertForPreTraining(RoCBertPreTrainedModel):
@add_start_docstrings("""RoCBert Model with a `language modeling` head on top.""", ROC_BERT_START_DOCSTRING)
class RoCBertForMaskedLM(RoCBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
# Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->RoCBert,bert->roc_bert
......@@ -1409,8 +1407,6 @@ class RoCBertForMaskedLM(RoCBertPreTrainedModel):
"""RoCBert Model with a `language modeling` head on top for CLM fine-tuning.""", ROC_BERT_START_DOCSTRING
)
class RoCBertForCausalLM(RoCBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->RoCBertForCausalLM,Bert->RoCBert,bert->roc_bert
......@@ -1804,8 +1800,6 @@ class RoCBertForMultipleChoice(RoCBertPreTrainedModel):
ROC_BERT_START_DOCSTRING,
)
class RoCBertForTokenClassification(RoCBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->RoCBert,bert->roc_bert
def __init__(self, config):
super().__init__(config)
......@@ -1892,8 +1886,6 @@ class RoCBertForTokenClassification(RoCBertPreTrainedModel):
ROC_BERT_START_DOCSTRING,
)
class RoCBertForQuestionAnswering(RoCBertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->RoCBert,bert->roc_bert
def __init__(self, config):
super().__init__(config)
......
......@@ -696,11 +696,6 @@ class RoFormerPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_roformer
base_model_prefix = "roformer"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = []
_keys_to_ignore_on_load_unexpected = [
r"roformer.embeddings_project.weight",
r"roformer.embeddings_project.bias",
]
def _init_weights(self, module):
"""Initialize the weights"""
......@@ -952,7 +947,6 @@ class RoFormerModel(RoFormerPreTrainedModel):
@add_start_docstrings("""RoFormer Model with a `language modeling` head on top.""", ROFORMER_START_DOCSTRING)
class RoFormerForMaskedLM(RoFormerPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
def __init__(self, config):
......@@ -1055,7 +1049,6 @@ class RoFormerForMaskedLM(RoFormerPreTrainedModel):
"""RoFormer Model with a `language modeling` head on top for CLM fine-tuning.""", ROFORMER_START_DOCSTRING
)
class RoFormerForCausalLM(RoFormerPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
def __init__(self, config):
......
......@@ -1190,7 +1190,6 @@ SAM_INPUTS_DOCSTRING = r"""
SAM_START_DOCSTRING,
)
class SamModel(SamPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"prompt_encoder.shared_embedding.positional_embedding"]
_tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"]
def __init__(self, config):
......
......@@ -723,7 +723,6 @@ class SEWPreTrainedModel(PreTrainedModel):
base_model_prefix = "sew"
main_input_name = "input_values"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
......
......@@ -1257,7 +1257,6 @@ class SEWDPreTrainedModel(PreTrainedModel):
config_class = SEWDConfig
base_model_prefix = "sew-d"
main_input_name = "input_values"
_keys_to_ignore_on_load_missing = [r"position_ids"]
supports_gradient_checkpointing = True
def _init_weights(self, module):
......
......@@ -1266,17 +1266,6 @@ class Speech2TextModel(Speech2TextPreTrainedModel):
)
class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [
r"encoder.version",
r"decoder.version",
r"model.encoder.embed_positions.weights",
r"model.decoder.embed_positions.weights",
r"lm_head.weight",
]
_keys_to_ignore_on_save = [
r"model.encoder.embed_positions.weights",
r"model.decoder.embed_positions.weights",
]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: Speech2TextConfig):
......
......@@ -756,7 +756,6 @@ class Speech2Text2DecoderWrapper(Speech2Text2PreTrainedModel):
SPEECH_TO_TEXT_2_START_DOCSTRING,
)
class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel):
_keys_to_ignore_on_load_missing = ["lm_head.weight"]
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
......
......@@ -441,7 +441,7 @@ class SpeechT5ScaledPositionalEncoding(nn.Module):
pe[:, 1::2] = torch.cos(position.float() * div_term)
pe = pe.unsqueeze(0)
super().__init__()
self.register_buffer("pe", pe)
self.register_buffer("pe", pe, persistent=False)
self.dropout = nn.Dropout(p=dropout)
self.dim = dim
self.alpha = torch.nn.Parameter(torch.tensor(1.0))
......@@ -1251,8 +1251,6 @@ class SpeechT5PreTrainedModel(PreTrainedModel):
main_input_name = "input_values"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, SpeechT5PositionalConvEmbedding):
......@@ -2326,13 +2324,6 @@ class SpeechT5Model(SpeechT5PreTrainedModel):
SPEECHT5_START_DOCSTRING,
)
class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights",
r"text_decoder_postnet.lm_head.weight",
]
_keys_to_ignore_on_save = [
r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights",
]
_tied_weights_keys = ["text_decoder_postnet.lm_head.weight"]
def __init__(self, config: SpeechT5Config):
......@@ -2638,9 +2629,6 @@ def _generate_speech(
SPEECHT5_START_DOCSTRING,
)
class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
_keys_to_ignore_on_load_missing = []
_keys_to_ignore_on_save = []
main_input_name = "input_ids"
def __init__(self, config: SpeechT5Config):
......@@ -2859,13 +2847,6 @@ class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
SPEECHT5_START_DOCSTRING,
)
class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights",
]
_keys_to_ignore_on_save = [
r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights",
]
def __init__(self, config: SpeechT5Config):
super().__init__(config)
......
......@@ -61,7 +61,9 @@ class SplinterEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
def forward(
......@@ -524,7 +526,6 @@ class SplinterPreTrainedModel(PreTrainedModel):
config_class = SplinterConfig
base_model_prefix = "splinter"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
def _init_weights(self, module):
......
......@@ -64,7 +64,9 @@ class SqueezeBertEmbeddings(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if input_ids is not None:
......@@ -425,7 +427,6 @@ class SqueezeBertPreTrainedModel(PreTrainedModel):
config_class = SqueezeBertConfig
base_model_prefix = "transformer"
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
......@@ -643,11 +644,6 @@ class SqueezeBertModel(SqueezeBertPreTrainedModel):
@add_start_docstrings("""SqueezeBERT Model with a `language modeling` head on top.""", SQUEEZEBERT_START_DOCSTRING)
class SqueezeBertForMaskedLM(SqueezeBertPreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"predictions.decoder.bias",
"cls.predictions.decoder.weight",
"embeddings.position_ids",
]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
......
......@@ -1337,7 +1337,6 @@ num_heads)`.
SWITCH_TRANSFORMERS_START_DOCSTRING,
)
class SwitchTransformersModel(SwitchTransformersPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight", r"decoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: SwitchTransformersConfig):
......@@ -1506,11 +1505,6 @@ class SwitchTransformersModel(SwitchTransformersPreTrainedModel):
"""SWITCH_TRANSFORMERS Model with a `language modeling` head on top.""", SWITCH_TRANSFORMERS_START_DOCSTRING
)
class SwitchTransformersForConditionalGeneration(SwitchTransformersPreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
r"lm_head.weight",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: SwitchTransformersConfig):
......@@ -1819,7 +1813,6 @@ class SwitchTransformersForConditionalGeneration(SwitchTransformersPreTrainedMod
SWITCH_TRANSFORMERS_START_DOCSTRING,
)
class SwitchTransformersEncoderModel(SwitchTransformersPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight"]
def __init__(self, config: SwitchTransformersConfig):
......
......@@ -1326,12 +1326,8 @@ num_heads)`.
T5_START_DOCSTRING,
)
class T5Model(T5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
]
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
......@@ -1530,13 +1526,8 @@ class T5Model(T5PreTrainedModel):
@add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING)
class T5ForConditionalGeneration(T5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
r"lm_head.weight",
]
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
......@@ -1845,7 +1836,6 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
T5_START_DOCSTRING,
)
class T5EncoderModel(T5PreTrainedModel):
_keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight"]
def __init__(self, config: T5Config):
......@@ -1963,14 +1953,7 @@ class T5EncoderModel(T5PreTrainedModel):
T5_START_DOCSTRING,
)
class T5ForQuestionAnswering(T5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"encoder.embed_tokens.weight",
r"decoder.embed_tokens.weight",
r"lm_head.weight",
]
_keys_to_ignore_on_load_unexpected = [
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
]
_keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: T5Config):
......
......@@ -998,7 +998,6 @@ class TapasModel(TapasPreTrainedModel):
@add_start_docstrings("""Tapas Model with a `language modeling` head on top.""", TAPAS_START_DOCSTRING)
class TapasForMaskedLM(TapasPreTrainedModel):
_keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
config_class = TapasConfig
base_model_prefix = "tapas"
......
......@@ -284,6 +284,7 @@ class CausalSelfAttention(nn.Module):
torch.tril(torch.ones(config.block_size, config.block_size)).view(
1, 1, config.block_size, config.block_size
),
persistent=False,
)
# mask previous value estimates
......
......@@ -1002,7 +1002,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
TRANSFO_XL_START_DOCSTRING,
)
class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"crit\.out_projs\.\d+", r"crit\.out_layers\.\d+\.weight"]
_tied_weights_keys = [r"crit\.out_projs\.\d+", r"crit\.out_layers\.\d+\.weight"]
def __init__(self, config):
......@@ -1191,8 +1190,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
TRANSFO_XL_START_DOCSTRING,
)
class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
......
......@@ -788,7 +788,6 @@ class TrOCRDecoderWrapper(TrOCRPreTrainedModel):
TROCR_START_DOCSTRING,
)
class TrOCRForCausalLM(TrOCRPreTrainedModel):
_keys_to_ignore_on_load_missing = ["output_projection.weight"]
_tied_weights_keys = ["output_projection.weight"]
def __init__(self, config):
......
......@@ -974,7 +974,6 @@ class UniSpeechPreTrainedModel(PreTrainedModel):
config_class = UniSpeechConfig
base_model_prefix = "unispeech"
main_input_name = "input_values"
_keys_to_ignore_on_load_missing = [r"position_ids"]
supports_gradient_checkpointing = True
def _init_weights(self, module):
......
......@@ -988,7 +988,6 @@ class UniSpeechSatPreTrainedModel(PreTrainedModel):
config_class = UniSpeechSatConfig
base_model_prefix = "unispeech_sat"
main_input_name = "input_values"
_keys_to_ignore_on_load_missing = [r"position_ids"]
supports_gradient_checkpointing = True
def _init_weights(self, module):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment