"vscode:/vscode.git/clone" did not exist on "998d9d15095e7a69629f9e131c8b59bfdd1c6314"
Unverified commit 8e5d1619, authored by Sylvain Gugger, committed by GitHub

Clean load keys (#24505)

* Preliminary work on some models

* Fix test load missing and make sure nonpersistent buffers are tested

* Always ignore nonpersistent buffers if in state_dict

* Treat models

* More models

* Treat remaining models

* Fix quality

* Fix tests

* Remove draft

* This test is not needed anymore

* Fix copies

* Fix last test

* Newly added models

* Fix last tests

* Address review comments
parent 53194991
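The mechanism behind the cleanup is PyTorch's `persistent` flag on registered buffers: a buffer registered with `persistent=False` is still created on the module and moved with it across devices, but it is excluded from `state_dict()`, so it can never appear as a missing or unexpected key at load time. A minimal sketch of that behavior (toy module for illustration, not code from this commit):

    import torch
    from torch import nn

    class ToyEmbeddings(nn.Module):
        def __init__(self, max_position_embeddings=512, hidden_size=16):
            super().__init__()
            self.word_embeddings = nn.Embedding(100, hidden_size)
            # Non-persistent: rebuilt in __init__, excluded from state_dict().
            self.register_buffer(
                "position_ids", torch.arange(max_position_embeddings).expand((1, -1)), persistent=False
            )

    module = ToyEmbeddings()
    print("position_ids" in module.state_dict())  # False -> nothing to ignore at save/load time

With the buffer kept out of checkpoints, the per-class `_keys_to_ignore_on_load_missing = [r"position_ids"]` entries in the diffs below become unnecessary and are removed.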
@@ -80,7 +80,9 @@ class RobertaEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.register_buffer(
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
@@ -614,15 +616,6 @@ class RobertaPreTrainedModel(PreTrainedModel):
         if isinstance(module, RobertaEncoder):
             module.gradient_checkpointing = value

-    def update_keys_to_ignore(self, config, del_keys_to_ignore):
-        """Remove some keys from ignore list"""
-        if not config.tie_word_embeddings:
-            # must make a new list, or the class variable gets modified!
-            self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore]
-            self._keys_to_ignore_on_load_missing = [
-                k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore
-            ]
-

 ROBERTA_START_DOCSTRING = r"""
@@ -711,8 +704,6 @@ class RobertaModel(RobertaPreTrainedModel):
     """

-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta
     def __init__(self, config, add_pooling_layer=True):
         super().__init__(config)
@@ -881,9 +872,6 @@ class RobertaModel(RobertaPreTrainedModel):
     """RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.""", ROBERTA_START_DOCSTRING
 )
 class RobertaForCausalLM(RobertaPreTrainedModel):
-    _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):
@@ -895,9 +883,6 @@ class RobertaForCausalLM(RobertaPreTrainedModel):
         self.roberta = RobertaModel(config, add_pooling_layer=False)
         self.lm_head = RobertaLMHead(config)

-        # The LM head weights require special treatment only when they are tied with the word embeddings
-        self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
-
         # Initialize weights and apply final processing
         self.post_init()
@@ -1036,9 +1021,6 @@ class RobertaForCausalLM(RobertaPreTrainedModel):
 @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING)
 class RobertaForMaskedLM(RobertaPreTrainedModel):
-    _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):
@@ -1053,9 +1035,6 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
         self.roberta = RobertaModel(config, add_pooling_layer=False)
         self.lm_head = RobertaLMHead(config)

-        # The LM head weights require special treatment only when they are tied with the word embeddings
-        self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
-
         # Initialize weights and apply final processing
         self.post_init()
@@ -1173,8 +1152,6 @@ class RobertaLMHead(nn.Module):
     ROBERTA_START_DOCSTRING,
 )
 class RobertaForSequenceClassification(RobertaPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1274,8 +1251,6 @@ class RobertaForSequenceClassification(RobertaPreTrainedModel):
     ROBERTA_START_DOCSTRING,
 )
 class RobertaForMultipleChoice(RobertaPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
@@ -1368,9 +1343,6 @@ class RobertaForMultipleChoice(RobertaPreTrainedModel):
     ROBERTA_START_DOCSTRING,
 )
 class RobertaForTokenClassification(RobertaPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1478,9 +1450,6 @@ class RobertaClassificationHead(nn.Module):
     ROBERTA_START_DOCSTRING,
 )
 class RobertaForQuestionAnswering(RobertaPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
...
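For the LM heads, the per-class `_keys_to_ignore_on_save` / `_keys_to_ignore_on_load_missing` lists and the `update_keys_to_ignore` helper are dropped in favour of the existing `_tied_weights_keys` attribute, which already records which parameters are aliases of the input embeddings. A rough sketch of why a tied decoder weight needs no separate save/load handling (toy classes, not the Transformers implementation):

    import torch
    from torch import nn

    class ToyLMHead(nn.Module):
        def __init__(self, hidden_size=16, vocab_size=100):
            super().__init__()
            self.embeddings = nn.Embedding(vocab_size, hidden_size)
            self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
            # Tie the output projection to the input embedding matrix.
            self.decoder.weight = self.embeddings.weight

    model = ToyLMHead()
    state = model.state_dict()
    # Both keys are present and share storage, so restoring one restores the other;
    # there is nothing to skip on save or to ignore as missing on load.
    print(state["decoder.weight"].data_ptr() == state["embeddings.weight"].data_ptr())  # True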
@@ -83,7 +83,9 @@ class RobertaPreLayerNormEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.register_buffer(
             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
         )
@@ -617,15 +619,6 @@ class RobertaPreLayerNormPreTrainedModel(PreTrainedModel):
         if isinstance(module, RobertaPreLayerNormEncoder):
             module.gradient_checkpointing = value

-    def update_keys_to_ignore(self, config, del_keys_to_ignore):
-        """Remove some keys from ignore list"""
-        if not config.tie_word_embeddings:
-            # must make a new list, or the class variable gets modified!
-            self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore]
-            self._keys_to_ignore_on_load_missing = [
-                k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore
-            ]
-

 ROBERTA_PRELAYERNORM_START_DOCSTRING = r"""
@@ -714,8 +707,6 @@ class RobertaPreLayerNormModel(RobertaPreLayerNormPreTrainedModel):
     """

-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config, add_pooling_layer=True):
         super().__init__(config)
         self.config = config
@@ -886,9 +877,6 @@ class RobertaPreLayerNormModel(RobertaPreLayerNormPreTrainedModel):
 )
 # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with roberta-base->andreasmadsen/efficient_mlm_m0.40,ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm, RobertaPreLayerNormTokenizer->RobertaTokenizer
 class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel):
-    _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):
@@ -902,9 +890,6 @@ class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel):
         self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)
         self.lm_head = RobertaPreLayerNormLMHead(config)

-        # The LM head weights require special treatment only when they are tied with the word embeddings
-        self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
-
         # Initialize weights and apply final processing
         self.post_init()
@@ -1045,9 +1030,6 @@ class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel):
     """RoBERTa-PreLayerNorm Model with a `language modeling` head on top.""", ROBERTA_PRELAYERNORM_START_DOCSTRING
 )
 class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel):
-    _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.__init__ with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
@@ -1063,9 +1045,6 @@ class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel):
         self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)
         self.lm_head = RobertaPreLayerNormLMHead(config)

-        # The LM head weights require special treatment only when they are tied with the word embeddings
-        self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
-
         # Initialize weights and apply final processing
         self.post_init()
@@ -1185,8 +1164,6 @@ class RobertaPreLayerNormLMHead(nn.Module):
     ROBERTA_PRELAYERNORM_START_DOCSTRING,
 )
 class RobertaPreLayerNormForSequenceClassification(RobertaPreLayerNormPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1286,8 +1263,6 @@ class RobertaPreLayerNormForSequenceClassification(RobertaPreLayerNormPreTrainedModel):
 )
 # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
 class RobertaPreLayerNormForMultipleChoice(RobertaPreLayerNormPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
@@ -1382,9 +1357,6 @@ class RobertaPreLayerNormForMultipleChoice(RobertaPreLayerNormPreTrainedModel):
     ROBERTA_PRELAYERNORM_START_DOCSTRING,
 )
 class RobertaPreLayerNormForTokenClassification(RobertaPreLayerNormPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
@@ -1492,9 +1464,6 @@ class RobertaPreLayerNormClassificationHead(nn.Module):
     ROBERTA_PRELAYERNORM_START_DOCSTRING,
 )
 class RobertaPreLayerNormForQuestionAnswering(RobertaPreLayerNormPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
...
@@ -190,7 +190,9 @@ class RoCBertEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
         self.register_buffer(
             "token_type_ids",
@@ -777,7 +779,6 @@ class RoCBertPreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_roc_bert
     base_model_prefix = "roc_bert"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -1081,7 +1082,6 @@ class RoCBertModel(RoCBertPreTrainedModel):
     ROC_BERT_START_DOCSTRING,
 )
 class RoCBertForPreTraining(RoCBertPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
     _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

     def __init__(self, config):
@@ -1267,8 +1267,6 @@ class RoCBertForPreTraining(RoCBertPreTrainedModel):
 @add_start_docstrings("""RoCBert Model with a `language modeling` head on top.""", ROC_BERT_START_DOCSTRING)
 class RoCBertForMaskedLM(RoCBertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
     _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

     # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->RoCBert,bert->roc_bert
@@ -1409,8 +1407,6 @@ class RoCBertForMaskedLM(RoCBertPreTrainedModel):
     """RoCBert Model with a `language modeling` head on top for CLM fine-tuning.""", ROC_BERT_START_DOCSTRING
 )
 class RoCBertForCausalLM(RoCBertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"]
     _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

     # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->RoCBertForCausalLM,Bert->RoCBert,bert->roc_bert
@@ -1804,8 +1800,6 @@ class RoCBertForMultipleChoice(RoCBertPreTrainedModel):
     ROC_BERT_START_DOCSTRING,
 )
 class RoCBertForTokenClassification(RoCBertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-
     # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->RoCBert,bert->roc_bert
     def __init__(self, config):
         super().__init__(config)
@@ -1892,8 +1886,6 @@ class RoCBertForTokenClassification(RoCBertPreTrainedModel):
     ROC_BERT_START_DOCSTRING,
 )
 class RoCBertForQuestionAnswering(RoCBertPreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-
     # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->RoCBert,bert->roc_bert
     def __init__(self, config):
         super().__init__(config)
...
@@ -696,11 +696,6 @@ class RoFormerPreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_roformer
     base_model_prefix = "roformer"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = []
-    _keys_to_ignore_on_load_unexpected = [
-        r"roformer.embeddings_project.weight",
-        r"roformer.embeddings_project.bias",
-    ]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -952,7 +947,6 @@ class RoFormerModel(RoFormerPreTrainedModel):
 @add_start_docstrings("""RoFormer Model with a `language modeling` head on top.""", ROFORMER_START_DOCSTRING)
 class RoFormerForMaskedLM(RoFormerPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
     _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

     def __init__(self, config):
@@ -1055,7 +1049,6 @@ class RoFormerForMaskedLM(RoFormerPreTrainedModel):
     """RoFormer Model with a `language modeling` head on top for CLM fine-tuning.""", ROFORMER_START_DOCSTRING
 )
 class RoFormerForCausalLM(RoFormerPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
     _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

     def __init__(self, config):
...
@@ -1190,7 +1190,6 @@ SAM_INPUTS_DOCSTRING = r"""
     SAM_START_DOCSTRING,
 )
 class SamModel(SamPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"prompt_encoder.shared_embedding.positional_embedding"]
     _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"]

     def __init__(self, config):
...
@@ -723,7 +723,6 @@ class SEWPreTrainedModel(PreTrainedModel):
     base_model_prefix = "sew"
     main_input_name = "input_values"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
...
@@ -1257,7 +1257,6 @@ class SEWDPreTrainedModel(PreTrainedModel):
     config_class = SEWDConfig
     base_model_prefix = "sew-d"
     main_input_name = "input_values"
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
     supports_gradient_checkpointing = True

     def _init_weights(self, module):
...
@@ -1266,17 +1266,6 @@ class Speech2TextModel(Speech2TextPreTrainedModel):
 )
 class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel):
     base_model_prefix = "model"
-    _keys_to_ignore_on_load_missing = [
-        r"encoder.version",
-        r"decoder.version",
-        r"model.encoder.embed_positions.weights",
-        r"model.decoder.embed_positions.weights",
-        r"lm_head.weight",
-    ]
-    _keys_to_ignore_on_save = [
-        r"model.encoder.embed_positions.weights",
-        r"model.decoder.embed_positions.weights",
-    ]
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config: Speech2TextConfig):
...
@@ -756,7 +756,6 @@ class Speech2Text2DecoderWrapper(Speech2Text2PreTrainedModel):
     SPEECH_TO_TEXT_2_START_DOCSTRING,
 )
 class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["lm_head.weight"]
     _tied_weights_keys = ["lm_head.weight"]

     def __init__(self, config):
...
@@ -441,7 +441,7 @@ class SpeechT5ScaledPositionalEncoding(nn.Module):
         pe[:, 1::2] = torch.cos(position.float() * div_term)
         pe = pe.unsqueeze(0)
         super().__init__()
-        self.register_buffer("pe", pe)
+        self.register_buffer("pe", pe, persistent=False)
         self.dropout = nn.Dropout(p=dropout)
         self.dim = dim
         self.alpha = torch.nn.Parameter(torch.tensor(1.0))
@@ -1251,8 +1251,6 @@ class SpeechT5PreTrainedModel(PreTrainedModel):
     main_input_name = "input_values"
     supports_gradient_checkpointing = True

-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def _init_weights(self, module):
         """Initialize the weights"""
         if isinstance(module, SpeechT5PositionalConvEmbedding):
@@ -2326,13 +2324,6 @@ class SpeechT5Model(SpeechT5PreTrainedModel):
     SPEECHT5_START_DOCSTRING,
 )
 class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [
-        r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights",
-        r"text_decoder_postnet.lm_head.weight",
-    ]
-    _keys_to_ignore_on_save = [
-        r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights",
-    ]
     _tied_weights_keys = ["text_decoder_postnet.lm_head.weight"]

     def __init__(self, config: SpeechT5Config):
@@ -2638,9 +2629,6 @@ def _generate_speech(
     SPEECHT5_START_DOCSTRING,
 )
 class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
-    _keys_to_ignore_on_load_missing = []
-    _keys_to_ignore_on_save = []
-
     main_input_name = "input_ids"

     def __init__(self, config: SpeechT5Config):
@@ -2859,13 +2847,6 @@ class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
     SPEECHT5_START_DOCSTRING,
 )
 class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [
-        r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights",
-    ]
-    _keys_to_ignore_on_save = [
-        r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights",
-    ]
-
     def __init__(self, config: SpeechT5Config):
         super().__init__(config)
...
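The SpeechT5 change to the scaled positional encoding follows the same idea for a precomputed table: because the sinusoidal buffer is rebuilt deterministically from the configuration in `__init__`, registering it with `persistent=False` keeps it out of checkpoints without changing the forward pass. A small sketch of such a deterministic buffer (simplified toy module, not the SpeechT5 code):

    import math
    import torch
    from torch import nn

    class ToySinusoidalEncoding(nn.Module):
        def __init__(self, dim=8, max_len=32):
            super().__init__()
            position = torch.arange(max_len).unsqueeze(1).float()
            div_term = torch.exp(torch.arange(0, dim, 2).float() * -(math.log(10000.0) / dim))
            pe = torch.zeros(max_len, dim)
            pe[:, 0::2] = torch.sin(position * div_term)
            pe[:, 1::2] = torch.cos(position * div_term)
            # Deterministic given (dim, max_len): safe to rebuild at load time instead of serializing.
            self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

        def forward(self, x):
            return x + self.pe[:, : x.size(1)]

    enc = ToySinusoidalEncoding()
    print("pe" in enc.state_dict())  # False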
@@ -61,7 +61,9 @@ class SplinterEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

     def forward(
@@ -524,7 +526,6 @@ class SplinterPreTrainedModel(PreTrainedModel):
     config_class = SplinterConfig
     base_model_prefix = "splinter"
     supports_gradient_checkpointing = True
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
     def _init_weights(self, module):
...
@@ -64,7 +64,9 @@ class SqueezeBertEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )

     def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
         if input_ids is not None:
@@ -425,7 +427,6 @@ class SqueezeBertPreTrainedModel(PreTrainedModel):
     config_class = SqueezeBertConfig
     base_model_prefix = "transformer"
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights"""
@@ -643,11 +644,6 @@ class SqueezeBertModel(SqueezeBertPreTrainedModel):
 @add_start_docstrings("""SqueezeBERT Model with a `language modeling` head on top.""", SQUEEZEBERT_START_DOCSTRING)
 class SqueezeBertForMaskedLM(SqueezeBertPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [
-        r"predictions.decoder.bias",
-        "cls.predictions.decoder.weight",
-        "embeddings.position_ids",
-    ]
     _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

     def __init__(self, config):
...
@@ -1337,7 +1337,6 @@ num_heads)`.
     SWITCH_TRANSFORMERS_START_DOCSTRING,
 )
 class SwitchTransformersModel(SwitchTransformersPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight", r"decoder.embed_tokens.weight"]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

     def __init__(self, config: SwitchTransformersConfig):
@@ -1506,11 +1505,6 @@ class SwitchTransformersModel(SwitchTransformersPreTrainedModel):
     """SWITCH_TRANSFORMERS Model with a `language modeling` head on top.""", SWITCH_TRANSFORMERS_START_DOCSTRING
 )
 class SwitchTransformersForConditionalGeneration(SwitchTransformersPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [
-        r"encoder.embed_tokens.weight",
-        r"decoder.embed_tokens.weight",
-        r"lm_head.weight",
-    ]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

     def __init__(self, config: SwitchTransformersConfig):
@@ -1819,7 +1813,6 @@ class SwitchTransformersForConditionalGeneration(SwitchTransformersPreTrainedModel):
     SWITCH_TRANSFORMERS_START_DOCSTRING,
 )
 class SwitchTransformersEncoderModel(SwitchTransformersPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"]
     _tied_weights_keys = ["encoder.embed_tokens.weight"]

     def __init__(self, config: SwitchTransformersConfig):
...
@@ -1326,12 +1326,8 @@ num_heads)`.
     T5_START_DOCSTRING,
 )
 class T5Model(T5PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [
-        r"encoder.embed_tokens.weight",
-        r"decoder.embed_tokens.weight",
-    ]
     _keys_to_ignore_on_load_unexpected = [
-        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
+        "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
     ]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

@@ -1530,13 +1526,8 @@ class T5Model(T5PreTrainedModel):
 @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING)
 class T5ForConditionalGeneration(T5PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [
-        r"encoder.embed_tokens.weight",
-        r"decoder.embed_tokens.weight",
-        r"lm_head.weight",
-    ]
     _keys_to_ignore_on_load_unexpected = [
-        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
+        "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
     ]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

@@ -1845,7 +1836,6 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
     T5_START_DOCSTRING,
 )
 class T5EncoderModel(T5PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"]
     _tied_weights_keys = ["encoder.embed_tokens.weight"]

     def __init__(self, config: T5Config):
@@ -1963,14 +1953,7 @@ class T5EncoderModel(T5PreTrainedModel):
     T5_START_DOCSTRING,
 )
 class T5ForQuestionAnswering(T5PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [
-        r"encoder.embed_tokens.weight",
-        r"decoder.embed_tokens.weight",
-        r"lm_head.weight",
-    ]
-    _keys_to_ignore_on_load_unexpected = [
-        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
-    ]
+    _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

     def __init__(self, config: T5Config):
...
@@ -998,7 +998,6 @@ class TapasModel(TapasPreTrainedModel):
 @add_start_docstrings("""Tapas Model with a `language modeling` head on top.""", TAPAS_START_DOCSTRING)
 class TapasForMaskedLM(TapasPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
     _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
     config_class = TapasConfig
     base_model_prefix = "tapas"
...
@@ -284,6 +284,7 @@ class CausalSelfAttention(nn.Module):
             torch.tril(torch.ones(config.block_size, config.block_size)).view(
                 1, 1, config.block_size, config.block_size
             ),
+            persistent=False,
         )

         # mask previous value estimates
...
@@ -1002,7 +1002,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
     TRANSFO_XL_START_DOCSTRING,
 )
 class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"crit\.out_projs\.\d+", r"crit\.out_layers\.\d+\.weight"]
     _tied_weights_keys = [r"crit\.out_projs\.\d+", r"crit\.out_layers\.\d+\.weight"]

     def __init__(self, config):
@@ -1191,8 +1190,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
     TRANSFO_XL_START_DOCSTRING,
 )
 class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]
-
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
...
@@ -788,7 +788,6 @@ class TrOCRDecoderWrapper(TrOCRPreTrainedModel):
     TROCR_START_DOCSTRING,
 )
 class TrOCRForCausalLM(TrOCRPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["output_projection.weight"]
     _tied_weights_keys = ["output_projection.weight"]

     def __init__(self, config):
...
@@ -974,7 +974,6 @@ class UniSpeechPreTrainedModel(PreTrainedModel):
     config_class = UniSpeechConfig
     base_model_prefix = "unispeech"
     main_input_name = "input_values"
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
     supports_gradient_checkpointing = True

     def _init_weights(self, module):
...
@@ -988,7 +988,6 @@ class UniSpeechSatPreTrainedModel(PreTrainedModel):
     config_class = UniSpeechSatConfig
     base_model_prefix = "unispeech_sat"
    main_input_name = "input_values"
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
     supports_gradient_checkpointing = True

     def _init_weights(self, module):
...