Unverified Commit d83b0e0c authored by Sylvain Gugger, committed by GitHub

Add a post init method to all models (#14431)

* Add a post init method to all models

* Fix tests

* Fix last tests

* Fix templates

* Add comment

* Forgot to save
parent 08816de1
@@ -412,17 +412,6 @@ class ModuleUtilsMixin:
return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings)
def gradient_checkpointing_hook(module, _):
# Hook to enable backward compatibility for gradient checkpointing. Will be removed once all models have a
# proper post_init method.
if getattr(module.config, "gradient_checkpointing", False):
module.gradient_checkpointing_enable()
# Remove the attribute now that it has been consumed, so it's not saved in the config.
delattr(module.config, "gradient_checkpointing")
# The hook will remove itself after the first execution
module._gradient_checkpointing_hook.remove()
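The hook removed above relies on PyTorch's forward pre-hook mechanism: a function registered with register_forward_pre_hook runs just before forward and can unregister itself through the handle that registration returned, so it only ever fires once. A minimal, self-contained sketch of that one-shot pattern (the ToyModule class and _one_shot_handle name are illustrative, not taken from the commit):

import torch
import torch.nn as nn


class ToyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)
        # Register a pre-forward hook that runs once and then removes itself,
        # mirroring the backward-compatibility hook deleted in this commit.
        self._one_shot_handle = self.register_forward_pre_hook(self._one_shot_hook)

    @staticmethod
    def _one_shot_hook(module, _inputs):
        print("one-time setup before the first forward pass")
        # The RemovableHandle returned by register_forward_pre_hook unregisters the hook.
        module._one_shot_handle.remove()

    def forward(self, x):
        return self.linear(x)


m = ToyModule()
m(torch.randn(2, 4))  # prints the setup message
m(torch.randn(2, 4))  # hook already removed, nothing printed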
class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMixin):
r"""
Base class for all models.
@@ -490,8 +479,20 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
# Save config and origin of the pretrained weights if given in model
self.config = config
self.name_or_path = config.name_or_path
if self.supports_gradient_checkpointing:
self._gradient_checkpointing_hook = self.register_forward_pre_hook(gradient_checkpointing_hook)
def post_init(self):
"""
A method executed at the end of each Transformer model initialization, to execute code that needs the model's
modules properly initialized (such as weight initialization).
"""
self.init_weights()
self._backward_compatibility_gradient_checkpointing()
def _backward_compatibility_gradient_checkpointing(self):
if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False):
self.gradient_checkpointing_enable()
# Remove the attribute now that it has been consumed, so it's not saved in the config.
delattr(self.config, "gradient_checkpointing")
@classmethod
def _from_config(cls, config, **kwargs):
...
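To illustrate the pattern this commit introduces, here is a minimal sketch of a custom head built on the library's pretrained-model machinery: the constructor ends with self.post_init() instead of calling self.init_weights() directly, so weight initialization and the gradient-checkpointing backward-compatibility check run once all submodules exist. The MyBertForTagging class below is hypothetical, shown only as an example, and assumes a transformers version that includes this commit.

import torch.nn as nn
from transformers import BertConfig, BertModel, BertPreTrainedModel


class MyBertForTagging(BertPreTrainedModel):
    """Hypothetical token-tagging head, shown only to illustrate the post_init() pattern."""

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialize weights and apply final processing (replaces the old self.init_weights())
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None):
        sequence_output = self.bert(input_ids, attention_mask=attention_mask).last_hidden_state
        return self.classifier(self.dropout(sequence_output))


# Random initialization from a config; post_init() has already initialized the weights.
model = MyBertForTagging(BertConfig(num_labels=5))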
@@ -638,7 +638,8 @@ class AlbertModel(AlbertPreTrainedModel):
self.pooler = None
self.pooler_activation = None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -757,7 +758,8 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
self.predictions = AlbertMLMHead(config)
self.sop_classifier = AlbertSOPHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.predictions.decoder
@@ -903,7 +905,8 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
self.albert = AlbertModel(config, add_pooling_layer=False)
self.predictions = AlbertMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.predictions.decoder
@@ -991,7 +994,8 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
self.dropout = nn.Dropout(config.classifier_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1097,7 +1101,8 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
self.dropout = nn.Dropout(classifier_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1187,7 +1192,8 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
self.albert = AlbertModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1286,7 +1292,8 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
self.dropout = nn.Dropout(config.classifier_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
...
@@ -699,8 +699,9 @@ class BartEncoder(BartPretrainedModel):
self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
@@ -870,8 +871,9 @@ class BartDecoder(BartPretrainedModel):
self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
@@ -1130,7 +1132,8 @@ class BartModel(BartPretrainedModel):
self.encoder = BartEncoder(config, self.shared)
self.decoder = BartDecoder(config, self.shared)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.shared
@@ -1248,7 +1251,8 @@ class BartForConditionalGeneration(BartPretrainedModel):
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
@@ -1666,7 +1670,8 @@ class BartForCausalLM(BartPretrainedModel):
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
...
@@ -598,7 +598,8 @@ class BeitModel(BeitPreTrainedModel):
)
self.pooler = BeitPooler(config) if add_pooling_layer else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
@@ -715,7 +716,8 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel):
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
@@ -805,7 +807,8 @@ class BeitForImageClassification(BeitPreTrainedModel):
# Classifier head
self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
@@ -1121,7 +1124,8 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
self.decode_head = BeitUperHead(config)
self.auxiliary_head = BeitFCNHead(config) if config.use_auxiliary_head else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def compute_loss(self, logits, auxiliary_logits, labels):
# upsample logits to the images' original size
...
@@ -870,7 +870,8 @@ class BertModel(BertPreTrainedModel):
self.pooler = BertPooler(config) if add_pooling_layer else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -1037,7 +1038,8 @@ class BertForPreTraining(BertPreTrainedModel):
self.bert = BertModel(config)
self.cls = BertPreTrainingHeads(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -1145,7 +1147,8 @@ class BertLMHeadModel(BertPreTrainedModel):
self.bert = BertModel(config, add_pooling_layer=False)
self.cls = BertOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -1294,7 +1297,8 @@ class BertForMaskedLM(BertPreTrainedModel):
self.bert = BertModel(config, add_pooling_layer=False)
self.cls = BertOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -1394,7 +1398,8 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
self.bert = BertModel(config)
self.cls = BertOnlyNSPHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
@@ -1501,7 +1506,8 @@ class BertForSequenceClassification(BertPreTrainedModel):
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1600,7 +1606,8 @@ class BertForMultipleChoice(BertPreTrainedModel):
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
@@ -1698,7 +1705,8 @@ class BertForTokenClassification(BertPreTrainedModel):
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1788,7 +1796,8 @@ class BertForQuestionAnswering(BertPreTrainedModel):
self.bert = BertModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
...
@@ -282,7 +282,8 @@ class BertGenerationEncoder(BertGenerationPreTrainedModel):
self.embeddings = BertGenerationEmbeddings(config)
self.encoder = BertEncoder(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -456,7 +457,8 @@ class BertGenerationDecoder(BertGenerationPreTrainedModel):
self.bert = BertGenerationEncoder(config)
self.lm_head = BertGenerationOnlyLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.lm_head.decoder
...
@@ -1953,7 +1953,8 @@ class BigBirdModel(BigBirdPreTrainedModel):
)
self.set_attention_type("original_full")
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -2262,7 +2263,8 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
self.bert = BigBirdModel(config, add_pooling_layer=True)
self.cls = BigBirdPreTrainingHeads(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -2370,7 +2372,8 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
self.bert = BigBirdModel(config)
self.cls = BigBirdOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -2472,7 +2475,8 @@ class BigBirdForCausalLM(BigBirdPreTrainedModel):
self.bert = BigBirdModel(config)
self.cls = BigBirdOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -2642,7 +2646,8 @@ class BigBirdForSequenceClassification(BigBirdPreTrainedModel):
self.bert = BigBirdModel(config)
self.classifier = BigBirdClassificationHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -2737,7 +2742,8 @@ class BigBirdForMultipleChoice(BigBirdPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(
BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
@@ -2834,7 +2840,8 @@ class BigBirdForTokenClassification(BigBirdPreTrainedModel):
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -2942,7 +2949,8 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
self.bert = BigBirdModel(config, add_pooling_layer=add_pooling_layer)
self.qa_classifier = BigBirdForQuestionAnsweringHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
...
@@ -1775,8 +1775,9 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
self.layers = nn.ModuleList([BigBirdPegasusEncoderLayer(config, seed=i) for i in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
@@ -2066,8 +2067,9 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
self.layers = nn.ModuleList([BigBirdPegasusDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
@@ -2327,7 +2329,8 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
self.encoder = BigBirdPegasusEncoder(config, self.shared)
self.decoder = BigBirdPegasusDecoder(config, self.shared)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.shared
@@ -2447,7 +2450,8 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
@@ -2869,7 +2873,8 @@ class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel):
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
...
@@ -656,8 +656,9 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel):
self.layers = nn.ModuleList([BlenderbotEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layer_norm = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
@@ -821,8 +822,9 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
self.layers = nn.ModuleList([BlenderbotDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layer_norm = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
@@ -1083,7 +1085,8 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
self.encoder = BlenderbotEncoder(config, self.shared)
self.decoder = BlenderbotDecoder(config, self.shared)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
@@ -1220,7 +1223,8 @@ class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel):
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
@@ -1404,7 +1408,8 @@ class BlenderbotForCausalLM(BlenderbotPreTrainedModel):
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
...
@@ -657,8 +657,9 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
self.layers = nn.ModuleList([BlenderbotSmallEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
@@ -821,8 +822,9 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
self.layers = nn.ModuleList([BlenderbotSmallDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
@@ -1081,7 +1083,8 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
self.encoder = BlenderbotSmallEncoder(config, self.shared)
self.decoder = BlenderbotSmallDecoder(config, self.shared)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.shared
@@ -1208,7 +1211,8 @@ class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel):
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
@@ -1379,7 +1383,8 @@ class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel):
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
...
@@ -1015,7 +1015,8 @@ class CanineModel(CaninePreTrainedModel):
self.pooler = CaninePooler(config) if add_pooling_layer else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
@@ -1273,7 +1274,8 @@ class CanineForSequenceClassification(CaninePreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1369,7 +1371,8 @@ class CanineForMultipleChoice(CaninePreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
@@ -1461,7 +1464,8 @@ class CanineForTokenClassification(CaninePreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1548,7 +1552,8 @@ class CanineForQuestionAnswering(CaninePreTrainedModel):
self.canine = CanineModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
...
@@ -683,7 +683,8 @@ class CLIPTextModel(CLIPPreTrainedModel):
def __init__(self, config: CLIPTextConfig):
super().__init__(config)
self.text_model = CLIPTextTransformer(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.text_model.embeddings.token_embedding
@@ -792,7 +793,8 @@ class CLIPVisionModel(CLIPPreTrainedModel):
def __init__(self, config: CLIPVisionConfig):
super().__init__(config)
self.vision_model = CLIPVisionTransformer(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.vision_model.embeddings.patch_embedding
@@ -866,7 +868,8 @@ class CLIPModel(CLIPPreTrainedModel):
self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
def get_text_features(
...
@@ -775,7 +775,8 @@ class ConvBertModel(ConvBertPreTrainedModel):
self.encoder = ConvBertEncoder(config)
self.config = config
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -886,7 +887,8 @@ class ConvBertForMaskedLM(ConvBertPreTrainedModel):
self.generator_predictions = ConvBertGeneratorPredictions(config)
self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.generator_lm_head
@@ -995,7 +997,8 @@ class ConvBertForSequenceClassification(ConvBertPreTrainedModel):
self.convbert = ConvBertModel(config)
self.classifier = ConvBertClassificationHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1090,7 +1093,8 @@ class ConvBertForMultipleChoice(ConvBertPreTrainedModel):
self.sequence_summary = SequenceSummary(config)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(
CONVBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
@@ -1187,7 +1191,8 @@ class ConvBertForTokenClassification(ConvBertPreTrainedModel):
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1274,7 +1279,8 @@ class ConvBertForQuestionAnswering(ConvBertPreTrainedModel):
self.convbert = ConvBertModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
...
@@ -338,7 +338,8 @@ class CTRLModel(CTRLPreTrainedModel):
)
self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.w
@@ -499,7 +500,8 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
self.transformer = CTRLModel(config)
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.lm_head
@@ -615,7 +617,8 @@ class CTRLForSequenceClassification(CTRLPreTrainedModel):
self.transformer = CTRLModel(config)
self.classifier = nn.Linear(config.n_embd, self.num_labels, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
...
@@ -888,7 +888,8 @@ class DebertaModel(DebertaPreTrainedModel):
self.encoder = DebertaEncoder(config)
self.z_steps = 0
self.config = config
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -1001,7 +1002,8 @@ class DebertaForMaskedLM(DebertaPreTrainedModel):
self.deberta = DebertaModel(config)
self.cls = DebertaOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -1141,7 +1143,8 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.deberta.get_input_embeddings()
@@ -1254,7 +1257,8 @@ class DebertaForTokenClassification(DebertaPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1338,7 +1342,8 @@ class DebertaForQuestionAnswering(DebertaPreTrainedModel):
self.deberta = DebertaModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
...
@@ -996,7 +996,8 @@ class DebertaV2Model(DebertaV2PreTrainedModel):
self.encoder = DebertaV2Encoder(config)
self.z_steps = 0
self.config = config
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -1110,7 +1111,8 @@ class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
self.deberta = DebertaV2Model(config)
self.cls = DebertaV2OnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -1251,7 +1253,8 @@ class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.deberta.get_input_embeddings()
@@ -1365,7 +1368,8 @@ class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1450,7 +1454,8 @@ class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
self.deberta = DebertaV2Model(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
...
@@ -458,7 +458,8 @@ class DeiTModel(DeiTPreTrainedModel):
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.pooler = DeiTPooler(config) if add_pooling_layer else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
@@ -574,7 +575,8 @@ class DeiTForImageClassification(DeiTPreTrainedModel):
# Classifier head
self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
@@ -711,7 +713,8 @@ class DeiTForImageClassificationWithTeacher(DeiTPreTrainedModel):
nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DeiTForImageClassificationWithTeacherOutput, config_class=_CONFIG_FOR_DOC)
...
...@@ -894,7 +894,8 @@ class DetrEncoder(DetrPreTrainedModel): ...@@ -894,7 +894,8 @@ class DetrEncoder(DetrPreTrainedModel):
# in the original DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default # in the original DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def forward( def forward(
self, self,
...@@ -1001,8 +1002,9 @@ class DetrDecoder(DetrPreTrainedModel): ...@@ -1001,8 +1002,9 @@ class DetrDecoder(DetrPreTrainedModel):
# in DETR, the decoder uses layernorm after the last decoder layer output # in DETR, the decoder uses layernorm after the last decoder layer output
self.layernorm = nn.LayerNorm(config.d_model) self.layernorm = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward( def forward(
self, self,
...@@ -1179,7 +1181,8 @@ class DetrModel(DetrPreTrainedModel): ...@@ -1179,7 +1181,8 @@ class DetrModel(DetrPreTrainedModel):
self.encoder = DetrEncoder(config) self.encoder = DetrEncoder(config)
self.decoder = DetrDecoder(config) self.decoder = DetrDecoder(config)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.encoder
...@@ -1333,7 +1336,8 @@ class DetrForObjectDetection(DetrPreTrainedModel):
input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
@torch.jit.unused
...@@ -1494,7 +1498,8 @@ class DetrForSegmentation(DetrPreTrainedModel):
hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std
)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DetrSegmentationOutput, config_class=_CONFIG_FOR_DOC)
...
...@@ -441,7 +441,8 @@ class DistilBertModel(DistilBertPreTrainedModel):
self.embeddings = Embeddings(config)  # Embeddings
self.transformer = Transformer(config)  # Encoder
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
...@@ -571,7 +572,8 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
self.vocab_projector = nn.Linear(config.dim, config.vocab_size)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
self.mlm_loss_fct = nn.CrossEntropyLoss()
...@@ -677,7 +679,8 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
self.classifier = nn.Linear(config.dim, config.num_labels)
self.dropout = nn.Dropout(config.seq_classif_dropout)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
...@@ -793,7 +796,8 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
assert config.num_labels == 2
self.dropout = nn.Dropout(config.qa_dropout)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
...@@ -910,7 +914,8 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
self.dropout = nn.Dropout(config.dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
...@@ -1015,7 +1020,8 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
self.classifier = nn.Linear(config.dim, 1)
self.dropout = nn.Dropout(config.seq_classif_dropout)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
...
...@@ -180,7 +180,8 @@ class DPREncoder(DPRPreTrainedModel):
self.projection_dim = config.projection_dim
if self.projection_dim > 0:
self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def forward(
self,
...@@ -232,7 +233,8 @@ class DPRSpanPredictor(DPRPreTrainedModel):
self.encoder = DPREncoder(config)
self.qa_outputs = nn.Linear(self.encoder.embeddings_size, 2)
self.qa_classifier = nn.Linear(self.encoder.embeddings_size, 1)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def forward(
self,
...@@ -447,7 +449,8 @@ class DPRContextEncoder(DPRPretrainedContextEncoder):
super().__init__(config)
self.config = config
self.ctx_encoder = DPREncoder(config)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DPRContextEncoderOutput, config_class=_CONFIG_FOR_DOC)
...@@ -525,7 +528,8 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
super().__init__(config)
self.config = config
self.question_encoder = DPREncoder(config)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DPRQuestionEncoderOutput, config_class=_CONFIG_FOR_DOC)
...@@ -602,7 +606,8 @@ class DPRReader(DPRPretrainedReader):
super().__init__(config)
self.config = config
self.span_predictor = DPRSpanPredictor(config)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DPR_READER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DPRReaderOutput, config_class=_CONFIG_FOR_DOC)
...
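
Every hunk above applies the same mechanical change: the trailing self.init_weights() call in each model's __init__ is replaced by self.post_init(), preceded by the comment "# Initialize weights and apply final processing". The sketch below illustrates the before/after pattern in a single constructor; MyModel and MyPreTrainedModel are hypothetical placeholder names, not classes from this diff.

import torch.nn as nn

class MyModel(MyPreTrainedModel):  # hypothetical subclass of PreTrainedModel
    def __init__(self, config):
        super().__init__(config)
        # Build all submodules first, exactly as before
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Old pattern, removed by this diff:
        #     self.init_weights()

        # New pattern: initialize weights and apply final processing
        self.post_init()

Note that in DetrDecoder the call is also moved below self.gradient_checkpointing = False, presumably so that every attribute is already set by the time post_init() runs.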