"examples/vscode:/vscode.git/clone" did not exist on "c7058d822431438fd5d07303266bfa70e4e278f2"
Unverified commit d83b0e0c authored by Sylvain Gugger, committed by GitHub

Add a post init method to all models (#14431)

* Add a post init method to all models

* Fix tests

* Fix last tests

* Fix templates

* Add comment

* Forgot to save
parent 08816de1
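For context, a minimal sketch of the pattern this commit rolls out to every model's __init__. MySmallModel is a hypothetical stand-in, not a class from the library; it only relies on the post_init method introduced in the diff below.

from torch import nn
from transformers import PretrainedConfig, PreTrainedModel

class MySmallModel(PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Before this commit, every model ended its __init__ with:
        #     self.init_weights()
        # It now calls post_init(), which runs init_weights() and the
        # gradient checkpointing backward compatibility in one place.
        self.post_init()

model = MySmallModel(PretrainedConfig(hidden_size=8))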
......@@ -412,17 +412,6 @@ class ModuleUtilsMixin:
return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings)
def gradient_checkpointing_hook(module, _):
# Hook to enable backward compatibility for gradient checkpointing. Will be removed once all models have a
# proper post_init method.
if getattr(module.config, "gradient_checkpointing", False):
module.gradient_checkpointing_enable()
# Remove the attribute now that it has been consumed, so it is not saved in the config.
delattr(module.config, "gradient_checkpointing")
# The hook will remove itself after the first execution
module._gradient_checkpointing_hook.remove()
class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMixin):
r"""
Base class for all models.
......@@ -490,8 +479,20 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
# Save config and origin of the pretrained weights if given in model
self.config = config
self.name_or_path = config.name_or_path
if self.supports_gradient_checkpointing:
self._gradient_checkpointing_hook = self.register_forward_pre_hook(gradient_checkpointing_hook)
def post_init(self):
"""
A method executed at the end of each Transformer model initialization, to execute code that needs the model's
modules properly initialized (such as weight initialization).
"""
self.init_weights()
self._backward_compatibility_gradient_checkpointing()
def _backward_compatibility_gradient_checkpointing(self):
if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False):
self.gradient_checkpointing_enable()
# Remove the attribute now that it has been consumed, so it is not saved in the config.
delattr(self.config, "gradient_checkpointing")
@classmethod
def _from_config(cls, config, **kwargs):
......
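The backward compatibility kept above can be exercised directly. A quick sketch, assuming a small BERT config built the legacy way with gradient_checkpointing=True (the config may emit a deprecation warning for this kwarg; the exact warning text is not part of this diff): post_init consumes the flag at the end of __init__, enables checkpointing on the model, and deletes the attribute so it is not written back when the config is saved.

from transformers import BertConfig, BertModel

# Legacy usage: the flag rides along on the config.
config = BertConfig(
    hidden_size=32,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=64,
    gradient_checkpointing=True,
)
model = BertModel(config)

# post_init() called gradient_checkpointing_enable() on the submodules and
# removed the attribute from the config, so it will not be serialized again.
assert not hasattr(model.config, "gradient_checkpointing")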
......@@ -638,7 +638,8 @@ class AlbertModel(AlbertPreTrainedModel):
self.pooler = None
self.pooler_activation = None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
......@@ -757,7 +758,8 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
self.predictions = AlbertMLMHead(config)
self.sop_classifier = AlbertSOPHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.predictions.decoder
......@@ -903,7 +905,8 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
self.albert = AlbertModel(config, add_pooling_layer=False)
self.predictions = AlbertMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.predictions.decoder
......@@ -991,7 +994,8 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
self.dropout = nn.Dropout(config.classifier_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......@@ -1097,7 +1101,8 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
self.dropout = nn.Dropout(classifier_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......@@ -1187,7 +1192,8 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
self.albert = AlbertModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......@@ -1286,7 +1292,8 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
self.dropout = nn.Dropout(config.classifier_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
......
......@@ -699,8 +699,9 @@ class BartEncoder(BartPretrainedModel):
self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
......@@ -870,8 +871,9 @@ class BartDecoder(BartPretrainedModel):
self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
......@@ -1130,7 +1132,8 @@ class BartModel(BartPretrainedModel):
self.encoder = BartEncoder(config, self.shared)
self.decoder = BartDecoder(config, self.shared)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.shared
......@@ -1248,7 +1251,8 @@ class BartForConditionalGeneration(BartPretrainedModel):
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
......@@ -1666,7 +1670,8 @@ class BartForCausalLM(BartPretrainedModel):
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
......
......@@ -598,7 +598,8 @@ class BeitModel(BeitPreTrainedModel):
)
self.pooler = BeitPooler(config) if add_pooling_layer else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
......@@ -715,7 +716,8 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel):
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
......@@ -805,7 +807,8 @@ class BeitForImageClassification(BeitPreTrainedModel):
# Classifier head
self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
......@@ -1121,7 +1124,8 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
self.decode_head = BeitUperHead(config)
self.auxiliary_head = BeitFCNHead(config) if config.use_auxiliary_head else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def compute_loss(self, logits, auxiliary_logits, labels):
# upsample logits to the images' original size
......
......@@ -870,7 +870,8 @@ class BertModel(BertPreTrainedModel):
self.pooler = BertPooler(config) if add_pooling_layer else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
......@@ -1037,7 +1038,8 @@ class BertForPreTraining(BertPreTrainedModel):
self.bert = BertModel(config)
self.cls = BertPreTrainingHeads(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
......@@ -1145,7 +1147,8 @@ class BertLMHeadModel(BertPreTrainedModel):
self.bert = BertModel(config, add_pooling_layer=False)
self.cls = BertOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
......@@ -1294,7 +1297,8 @@ class BertForMaskedLM(BertPreTrainedModel):
self.bert = BertModel(config, add_pooling_layer=False)
self.cls = BertOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
......@@ -1394,7 +1398,8 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
self.bert = BertModel(config)
self.cls = BertOnlyNSPHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
......@@ -1501,7 +1506,8 @@ class BertForSequenceClassification(BertPreTrainedModel):
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......@@ -1600,7 +1606,8 @@ class BertForMultipleChoice(BertPreTrainedModel):
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
......@@ -1698,7 +1705,8 @@ class BertForTokenClassification(BertPreTrainedModel):
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......@@ -1788,7 +1796,8 @@ class BertForQuestionAnswering(BertPreTrainedModel):
self.bert = BertModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......
......@@ -282,7 +282,8 @@ class BertGenerationEncoder(BertGenerationPreTrainedModel):
self.embeddings = BertGenerationEmbeddings(config)
self.encoder = BertEncoder(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
......@@ -456,7 +457,8 @@ class BertGenerationDecoder(BertGenerationPreTrainedModel):
self.bert = BertGenerationEncoder(config)
self.lm_head = BertGenerationOnlyLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.lm_head.decoder
......
......@@ -1953,7 +1953,8 @@ class BigBirdModel(BigBirdPreTrainedModel):
)
self.set_attention_type("original_full")
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
......@@ -2262,7 +2263,8 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
self.bert = BigBirdModel(config, add_pooling_layer=True)
self.cls = BigBirdPreTrainingHeads(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
......@@ -2370,7 +2372,8 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
self.bert = BigBirdModel(config)
self.cls = BigBirdOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
......@@ -2472,7 +2475,8 @@ class BigBirdForCausalLM(BigBirdPreTrainedModel):
self.bert = BigBirdModel(config)
self.cls = BigBirdOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
......@@ -2642,7 +2646,8 @@ class BigBirdForSequenceClassification(BigBirdPreTrainedModel):
self.bert = BigBirdModel(config)
self.classifier = BigBirdClassificationHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......@@ -2737,7 +2742,8 @@ class BigBirdForMultipleChoice(BigBirdPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(
BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
......@@ -2834,7 +2840,8 @@ class BigBirdForTokenClassification(BigBirdPreTrainedModel):
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......@@ -2942,7 +2949,8 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
self.bert = BigBirdModel(config, add_pooling_layer=add_pooling_layer)
self.qa_classifier = BigBirdForQuestionAnsweringHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......
......@@ -1775,8 +1775,9 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
self.layers = nn.ModuleList([BigBirdPegasusEncoderLayer(config, seed=i) for i in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
......@@ -2066,8 +2067,9 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
self.layers = nn.ModuleList([BigBirdPegasusDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
......@@ -2327,7 +2329,8 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
self.encoder = BigBirdPegasusEncoder(config, self.shared)
self.decoder = BigBirdPegasusDecoder(config, self.shared)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.shared
......@@ -2447,7 +2450,8 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
......@@ -2869,7 +2873,8 @@ class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel):
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
......
......@@ -656,8 +656,9 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel):
self.layers = nn.ModuleList([BlenderbotEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layer_norm = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
......@@ -821,8 +822,9 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
self.layers = nn.ModuleList([BlenderbotDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layer_norm = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
......@@ -1083,7 +1085,8 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
self.encoder = BlenderbotEncoder(config, self.shared)
self.decoder = BlenderbotDecoder(config, self.shared)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
......@@ -1220,7 +1223,8 @@ class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel):
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
......@@ -1404,7 +1408,8 @@ class BlenderbotForCausalLM(BlenderbotPreTrainedModel):
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
......
......@@ -657,8 +657,9 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
self.layers = nn.ModuleList([BlenderbotSmallEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
......@@ -821,8 +822,9 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
self.layers = nn.ModuleList([BlenderbotSmallDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
......@@ -1081,7 +1083,8 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
self.encoder = BlenderbotSmallEncoder(config, self.shared)
self.decoder = BlenderbotSmallDecoder(config, self.shared)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.shared
......@@ -1208,7 +1211,8 @@ class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel):
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
......@@ -1379,7 +1383,8 @@ class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel):
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
......
......@@ -1015,7 +1015,8 @@ class CanineModel(CaninePreTrainedModel):
self.pooler = CaninePooler(config) if add_pooling_layer else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
......@@ -1273,7 +1274,8 @@ class CanineForSequenceClassification(CaninePreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......@@ -1369,7 +1371,8 @@ class CanineForMultipleChoice(CaninePreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
......@@ -1461,7 +1464,8 @@ class CanineForTokenClassification(CaninePreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......@@ -1548,7 +1552,8 @@ class CanineForQuestionAnswering(CaninePreTrainedModel):
self.canine = CanineModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......
......@@ -683,7 +683,8 @@ class CLIPTextModel(CLIPPreTrainedModel):
def __init__(self, config: CLIPTextConfig):
super().__init__(config)
self.text_model = CLIPTextTransformer(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.text_model.embeddings.token_embedding
......@@ -792,7 +793,8 @@ class CLIPVisionModel(CLIPPreTrainedModel):
def __init__(self, config: CLIPVisionConfig):
super().__init__(config)
self.vision_model = CLIPVisionTransformer(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.vision_model.embeddings.patch_embedding
......@@ -866,7 +868,8 @@ class CLIPModel(CLIPPreTrainedModel):
self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
def get_text_features(
......
......@@ -775,7 +775,8 @@ class ConvBertModel(ConvBertPreTrainedModel):
self.encoder = ConvBertEncoder(config)
self.config = config
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
......@@ -886,7 +887,8 @@ class ConvBertForMaskedLM(ConvBertPreTrainedModel):
self.generator_predictions = ConvBertGeneratorPredictions(config)
self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.generator_lm_head
......@@ -995,7 +997,8 @@ class ConvBertForSequenceClassification(ConvBertPreTrainedModel):
self.convbert = ConvBertModel(config)
self.classifier = ConvBertClassificationHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......@@ -1090,7 +1093,8 @@ class ConvBertForMultipleChoice(ConvBertPreTrainedModel):
self.sequence_summary = SequenceSummary(config)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(
CONVBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
......@@ -1187,7 +1191,8 @@ class ConvBertForTokenClassification(ConvBertPreTrainedModel):
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......@@ -1274,7 +1279,8 @@ class ConvBertForQuestionAnswering(ConvBertPreTrainedModel):
self.convbert = ConvBertModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......
......@@ -338,7 +338,8 @@ class CTRLModel(CTRLPreTrainedModel):
)
self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.w
......@@ -499,7 +500,8 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
self.transformer = CTRLModel(config)
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.lm_head
......@@ -615,7 +617,8 @@ class CTRLForSequenceClassification(CTRLPreTrainedModel):
self.transformer = CTRLModel(config)
self.classifier = nn.Linear(config.n_embd, self.num_labels, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
......
......@@ -888,7 +888,8 @@ class DebertaModel(DebertaPreTrainedModel):
self.encoder = DebertaEncoder(config)
self.z_steps = 0
self.config = config
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
......@@ -1001,7 +1002,8 @@ class DebertaForMaskedLM(DebertaPreTrainedModel):
self.deberta = DebertaModel(config)
self.cls = DebertaOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
......@@ -1141,7 +1143,8 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.deberta.get_input_embeddings()
......@@ -1254,7 +1257,8 @@ class DebertaForTokenClassification(DebertaPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......@@ -1338,7 +1342,8 @@ class DebertaForQuestionAnswering(DebertaPreTrainedModel):
self.deberta = DebertaModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......
......@@ -996,7 +996,8 @@ class DebertaV2Model(DebertaV2PreTrainedModel):
self.encoder = DebertaV2Encoder(config)
self.z_steps = 0
self.config = config
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
......@@ -1110,7 +1111,8 @@ class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
self.deberta = DebertaV2Model(config)
self.cls = DebertaV2OnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
......@@ -1251,7 +1253,8 @@ class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.deberta.get_input_embeddings()
......@@ -1365,7 +1368,8 @@ class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......@@ -1450,7 +1454,8 @@ class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
self.deberta = DebertaV2Model(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
......
......@@ -458,7 +458,8 @@ class DeiTModel(DeiTPreTrainedModel):
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.pooler = DeiTPooler(config) if add_pooling_layer else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
......@@ -574,7 +575,8 @@ class DeiTForImageClassification(DeiTPreTrainedModel):
# Classifier head
self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
......@@ -711,7 +713,8 @@ class DeiTForImageClassificationWithTeacher(DeiTPreTrainedModel):
nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DeiTForImageClassificationWithTeacherOutput, config_class=_CONFIG_FOR_DOC)
......
......@@ -894,7 +894,8 @@ class DetrEncoder(DetrPreTrainedModel):
# in the original DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
......@@ -1001,8 +1002,9 @@ class DetrDecoder(DetrPreTrainedModel):
# in DETR, the decoder uses layernorm after the last decoder layer output
self.layernorm = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
......@@ -1179,7 +1181,8 @@ class DetrModel(DetrPreTrainedModel):
self.encoder = DetrEncoder(config)
self.decoder = DetrDecoder(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.encoder
......@@ -1333,7 +1336,8 @@ class DetrForObjectDetection(DetrPreTrainedModel):
input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
@torch.jit.unused
......@@ -1494,7 +1498,8 @@ class DetrForSegmentation(DetrPreTrainedModel):
hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std
)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DetrSegmentationOutput, config_class=_CONFIG_FOR_DOC)
......
......@@ -441,7 +441,8 @@ class DistilBertModel(DistilBertPreTrainedModel):
self.embeddings = Embeddings(config) # Embeddings
self.transformer = Transformer(config) # Encoder
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
......@@ -571,7 +572,8 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
self.vocab_projector = nn.Linear(config.dim, config.vocab_size)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
self.mlm_loss_fct = nn.CrossEntropyLoss()
......@@ -677,7 +679,8 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
self.classifier = nn.Linear(config.dim, config.num_labels)
self.dropout = nn.Dropout(config.seq_classif_dropout)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
......@@ -793,7 +796,8 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
assert config.num_labels == 2
self.dropout = nn.Dropout(config.qa_dropout)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
......@@ -910,7 +914,8 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
self.dropout = nn.Dropout(config.dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
......@@ -1015,7 +1020,8 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
self.classifier = nn.Linear(config.dim, 1)
self.dropout = nn.Dropout(config.seq_classif_dropout)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
......
......@@ -180,7 +180,8 @@ class DPREncoder(DPRPreTrainedModel):
self.projection_dim = config.projection_dim
if self.projection_dim > 0:
self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
......@@ -232,7 +233,8 @@ class DPRSpanPredictor(DPRPreTrainedModel):
self.encoder = DPREncoder(config)
self.qa_outputs = nn.Linear(self.encoder.embeddings_size, 2)
self.qa_classifier = nn.Linear(self.encoder.embeddings_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
......@@ -447,7 +449,8 @@ class DPRContextEncoder(DPRPretrainedContextEncoder):
super().__init__(config)
self.config = config
self.ctx_encoder = DPREncoder(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DPRContextEncoderOutput, config_class=_CONFIG_FOR_DOC)
......@@ -525,7 +528,8 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
super().__init__(config)
self.config = config
self.question_encoder = DPREncoder(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DPRQuestionEncoderOutput, config_class=_CONFIG_FOR_DOC)
......@@ -602,7 +606,8 @@ class DPRReader(DPRPretrainedReader):
super().__init__(config)
self.config = config
self.span_predictor = DPRSpanPredictor(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DPR_READER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DPRReaderOutput, config_class=_CONFIG_FOR_DOC)
......