".github/git@developer.sourcefind.cn:ox696c/ktransformers.git" did not exist on "022b89381943ab4152053d18fff0d8bed6b54339"
Unverified Commit 0996a100 authored by amyeroberts, committed by GitHub

Revert low cpu mem tie weights (#29135)

* Revert "Add tie_weights() to LM heads and set bias in set_output_embeddings() (#28948)"

This reverts commit 725f4ad1.

* Revert "Patch to skip failing `test_save_load_low_cpu_mem_usage` tests (#29043)"

This reverts commit 4156f517.
parent 15cfe389
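
Note: the change being reverted had added a `_tie_weights()` hook to each LM prediction head so that `self.decoder.bias` would be re-pointed at the head's own `bias` parameter after weight tying. A minimal sketch of that pattern follows; it is illustrative only (class and argument names are not the library's), assuming plain PyTorch:

    import torch
    from torch import nn

    class TinyLMPredictionHead(nn.Module):
        """Illustrative stand-in for the LM prediction heads touched in this diff."""

        def __init__(self, hidden_size: int = 8, vocab_size: int = 16):
            super().__init__()
            self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
            self.bias = nn.Parameter(torch.zeros(vocab_size))
            # Need a link between the two variables so that the bias is
            # correctly resized with `resize_token_embeddings`.
            self.decoder.bias = self.bias

        def _tie_weights(self):
            # This is the hook the revert removes: it re-linked the bias after
            # the decoder weight had been (re)tied to the input embeddings.
            self.decoder.bias = self.bias

        def forward(self, hidden_states):
            return self.decoder(hidden_states)
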
@@ -692,9 +692,6 @@ class BertLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1065,7 +1062,6 @@ class BertForPreTraining(BertPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -1175,7 +1171,6 @@ class BertLMHeadModel(BertPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -1329,7 +1324,6 @@ class BertForMaskedLM(BertPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
......
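
The same two removals repeat across all of the model files below: the `_tie_weights()` hook on the prediction head goes away, and `set_output_embeddings()` goes back to swapping only the decoder module instead of also copying the new embedding's bias. A rough sketch of the post-revert `set_output_embeddings` behaviour, reusing the illustrative head sketched above (again, not the library's exact code):

    class TinyLMModel(nn.Module):
        """Illustrative model wrapping the TinyLMPredictionHead sketched above."""

        def __init__(self):
            super().__init__()
            self.predictions = TinyLMPredictionHead()

        def get_output_embeddings(self):
            return self.predictions.decoder

        def set_output_embeddings(self, new_embeddings):
            # Post-revert: only the decoder module is replaced here; the
            # reverted change had additionally copied `new_embeddings.bias`
            # onto the head.
            self.predictions.decoder = new_embeddings
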
@@ -1707,9 +1707,6 @@ class BigBirdLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -2269,7 +2266,6 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=BigBirdForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -2382,7 +2378,6 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
@@ -2524,7 +2519,6 @@ class BigBirdForCausalLM(BigBirdPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
......
@@ -523,9 +523,6 @@ class BlipTextLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -820,7 +817,6 @@ class BlipTextLMHeadModel(BlipTextPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     def forward(
         self,
......
@@ -608,9 +608,6 @@ class ErnieLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -998,7 +995,6 @@ class ErnieForPreTraining(ErniePreTrainedModel):
     # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.set_output_embeddings
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=ErnieForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -1113,7 +1109,6 @@ class ErnieForCausalLM(ErniePreTrainedModel):
     # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.set_output_embeddings
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -1274,7 +1269,6 @@ class ErnieForMaskedLM(ErniePreTrainedModel):
     # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.set_output_embeddings
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
......
@@ -589,9 +589,6 @@ class LayoutLMLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -872,7 +869,6 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
......
@@ -318,9 +318,6 @@ class MarkupLMLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
......
@@ -659,9 +659,6 @@ class MegatronBertLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1026,7 +1023,6 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=MegatronBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -1136,7 +1132,6 @@ class MegatronBertForCausalLM(MegatronBertPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
@@ -1295,7 +1290,6 @@ class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
......
@@ -587,7 +587,6 @@ class MPNetForMaskedLM(MPNetPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.lm_head.decoder = new_embeddings
-        self.lm_head.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -660,9 +659,6 @@ class MPNetLMHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, features, **kwargs):
         x = self.dense(features)
         x = gelu(x)
......
@@ -810,9 +810,6 @@ class MraLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1046,7 +1043,6 @@ class MraForMaskedLM(MraPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
......
@@ -679,9 +679,6 @@ class NezhaLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1047,7 +1044,6 @@ class NezhaForPreTraining(NezhaPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=NezhaForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -1156,7 +1152,6 @@ class NezhaForMaskedLM(NezhaPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
......
@@ -428,9 +428,6 @@ class NystromformerLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -669,7 +666,6 @@ class NystromformerForMaskedLM(NystromformerPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(NYSTROMFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
......
@@ -683,9 +683,6 @@ class QDQBertLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1027,7 +1024,6 @@ class QDQBertLMHeadModel(QDQBertPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
@@ -1194,7 +1190,6 @@ class QDQBertForMaskedLM(QDQBertPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
......
@@ -744,9 +744,6 @@ class RoCBertLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1093,7 +1090,6 @@ class RoCBertForPreTraining(RoCBertPreTrainedModel):
     # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.set_output_embeddings
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
@@ -1286,7 +1282,6 @@ class RoCBertForMaskedLM(RoCBertPreTrainedModel):
     # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.set_output_embeddings
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     def forward(
@@ -1424,7 +1419,6 @@ class RoCBertForCausalLM(RoCBertPreTrainedModel):
     # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.set_output_embeddings
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
......
@@ -729,9 +729,6 @@ class TapasLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1011,7 +1008,6 @@ class TapasForMaskedLM(TapasPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
......
@@ -896,7 +896,6 @@ class ViltForMaskedLM(ViltPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.mlm_score.decoder = new_embeddings
-        self.mlm_score.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
@@ -1043,9 +1042,6 @@ class ViltMLMHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, x):
         x = self.transform(x)
         x = self.decoder(x)
......
@@ -499,9 +499,6 @@ class VisualBertLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -882,7 +879,6 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=VisualBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
......
@@ -626,9 +626,6 @@ class YosoLMPredictionHead(nn.Module):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias
 
-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -867,7 +864,6 @@ class YosoForMaskedLM(YosoPreTrainedModel):
 
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias
 
     @add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
......
@@ -305,12 +305,6 @@ class BertGenerationEncoderTest(ModelTesterMixin, GenerationTesterMixin, Pipelin
         model = BertGenerationEncoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
         self.assertIsNotNone(model)
 
-    @unittest.skip(
-        "Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!"
-    )
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
 
 @require_torch
 class BertGenerationEncoderIntegrationTest(unittest.TestCase):
......
@@ -564,10 +564,6 @@ class DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT
                         msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                     )
 
-    @unittest.skip("Cannot be initialized on meta device as some weights are modified during the initialization")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
     def test_two_stage_training(self):
         model_class = DeformableDetrForObjectDetection
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
......
@@ -520,10 +520,6 @@ class DetaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
                         msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                     )
 
-    @unittest.skip("Cannot be initialized on meta device as some weights are modified during the initialization")
-    def test_save_load_low_cpu_mem_usage(self):
-        pass
-
 
 TOLERANCE = 1e-4
......
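
The three test files above had skipped `test_save_load_low_cpu_mem_usage` while the tie-weights change was in; the revert removes those skips. Roughly, that test saves a model and reloads it with `low_cpu_mem_usage=True`, which stages weights on the meta device before materializing them. A hedged sketch of that kind of round trip, using `bert-base-uncased` purely as an example checkpoint rather than the test suite's tiny configs:

    import tempfile

    import torch
    from transformers import AutoModel

    model = AutoModel.from_pretrained("bert-base-uncased")

    with tempfile.TemporaryDirectory() as tmp_dir:
        model.save_pretrained(tmp_dir)
        # low_cpu_mem_usage=True initializes the model on the "meta" device and
        # fills in weights as they are loaded; the reverted tie_weights logic
        # tripped over this ("Cannot copy out of meta tensor; no data!").
        reloaded = AutoModel.from_pretrained(tmp_dir, low_cpu_mem_usage=True)

    # The round trip should preserve every parameter.
    for (name, p1), (_, p2) in zip(model.named_parameters(), reloaded.named_parameters()):
        torch.testing.assert_close(p1, p2, msg=f"mismatch in {name}")
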