Unverified Commit d83b0e0c authored by Sylvain Gugger, committed by GitHub

Add a post init method to all models (#14431)

* Add a post init method to all models

* Fix tests

* Fix last tests

* Fix templates

* Add comment

* Forgot to save
parent 08816de1
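
For context: post_init() itself is not defined in this diff. Below is a minimal sketch of what the new base-class hook presumably does, inferred from the init_weights() calls it replaces and from the _backward_compatibility_gradient_checkpointing override visible in the MBartEncoder hunk further down; the actual body in modeling_utils.py may differ.

# Hypothetical sketch of the hook on PreTrainedModel; not taken from this diff.
def post_init(self):
    """Run at the end of every model __init__, once all submodules exist."""
    # The weight initialization each model previously triggered itself.
    self.init_weights()
    # Honor a legacy `config.gradient_checkpointing` flag from older configs.
    self._backward_compatibility_gradient_checkpointing()

def _backward_compatibility_gradient_checkpointing(self):
    if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False):
        self.gradient_checkpointing_enable()
        # Presumably drops the consumed flag; MBartEncoder overrides this below to keep it.
        delattr(self.config, "gradient_checkpointing")
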
@@ -817,7 +817,8 @@ class ElectraModel(ElectraPreTrainedModel):
self.encoder = ElectraEncoder(config)
self.config = config
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -939,7 +940,8 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
self.electra = ElectraModel(config)
self.classifier = ElectraClassificationHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1033,7 +1035,8 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
self.electra = ElectraModel(config)
self.discriminator_predictions = ElectraDiscriminatorPredictions(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=ElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -1128,7 +1131,8 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
self.generator_predictions = ElectraGeneratorPredictions(config)
self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.generator_lm_head
@@ -1216,7 +1220,8 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1305,7 +1310,8 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
self.electra = ElectraModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1406,7 +1412,8 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
self.sequence_summary = SequenceSummary(config)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
@@ -336,7 +336,8 @@ class FlaubertWithLMHeadModel(XLMWithLMHeadModel):
def __init__(self, config):
super().__init__(config)
self.transformer = FlaubertModel(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings(
@@ -357,7 +358,8 @@ class FlaubertForSequenceClassification(XLMForSequenceClassification):
def __init__(self, config):
super().__init__(config)
self.transformer = FlaubertModel(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings(
@@ -378,7 +380,8 @@ class FlaubertForTokenClassification(XLMForTokenClassification):
def __init__(self, config):
super().__init__(config)
self.transformer = FlaubertModel(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings(
@@ -399,7 +402,8 @@ class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple):
def __init__(self, config):
super().__init__(config)
self.transformer = FlaubertModel(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings(
@@ -420,7 +424,8 @@ class FlaubertForQuestionAnswering(XLMForQuestionAnswering):
def __init__(self, config):
super().__init__(config)
self.transformer = FlaubertModel(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings(
@@ -441,4 +446,5 @@ class FlaubertForMultipleChoice(XLMForMultipleChoice):
def __init__(self, config):
super().__init__(config)
self.transformer = FlaubertModel(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@@ -535,7 +535,8 @@ class FNetModel(FNetPreTrainedModel):
self.pooler = FNetPooler(config) if add_pooling_layer else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -633,7 +634,8 @@ class FNetForPreTraining(FNetPreTrainedModel):
self.fnet = FNetModel(config)
self.cls = FNetPreTrainingHeads(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -723,7 +725,8 @@ class FNetForMaskedLM(FNetPreTrainedModel):
self.fnet = FNetModel(config)
self.cls = FNetOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -791,7 +794,8 @@ class FNetForNextSentencePrediction(FNetPreTrainedModel):
self.fnet = FNetModel(config)
self.cls = FNetOnlyNSPHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
@@ -885,7 +889,8 @@ class FNetForSequenceClassification(FNetPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -969,7 +974,8 @@ class FNetForMultipleChoice(FNetPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
@@ -1050,7 +1056,8 @@ class FNetForTokenClassification(FNetPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1119,7 +1126,8 @@ class FNetForQuestionAnswering(FNetPreTrainedModel):
self.fnet = FNetModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1003,7 +1003,8 @@ class FSMTModel(PretrainedFSMTModel):
self.encoder = FSMTEncoder(config, encoder_embed_tokens)
self.decoder = FSMTDecoder(config, decoder_embed_tokens)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(FSMT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
@@ -900,7 +900,8 @@ class FunnelBaseModel(FunnelPreTrainedModel):
self.embeddings = FunnelEmbeddings(config)
self.encoder = FunnelEncoder(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -977,7 +978,8 @@ class FunnelModel(FunnelPreTrainedModel):
self.encoder = FunnelEncoder(config)
self.decoder = FunnelDecoder(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -1082,7 +1084,8 @@ class FunnelForPreTraining(FunnelPreTrainedModel):
self.funnel = FunnelModel(config)
self.discriminator_predictions = FunnelDiscriminatorPredictions(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=FunnelForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -1164,7 +1167,8 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):
self.funnel = FunnelModel(config)
self.lm_head = nn.Linear(config.d_model, config.vocab_size)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.lm_head
@@ -1244,7 +1248,8 @@ class FunnelForSequenceClassification(FunnelPreTrainedModel):
self.funnel = FunnelBaseModel(config)
self.classifier = FunnelClassificationHead(config, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1334,7 +1339,8 @@ class FunnelForMultipleChoice(FunnelPreTrainedModel):
self.funnel = FunnelBaseModel(config)
self.classifier = FunnelClassificationHead(config, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
@@ -1420,7 +1426,8 @@ class FunnelForTokenClassification(FunnelPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1502,7 +1509,8 @@ class FunnelForQuestionAnswering(FunnelPreTrainedModel):
self.funnel = FunnelModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -677,13 +677,14 @@ class GPT2Model(GPT2PreTrainedModel):
self.h = nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)])
self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
self.init_weights()
# Model parallel
self.model_parallel = False
self.device_map = None
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
# Check validity of device_map
@@ -947,12 +948,13 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
self.transformer = GPT2Model(config)
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.init_weights()
# Model parallel
self.model_parallel = False
self.device_map = None
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
self.device_map = (
@@ -1117,12 +1119,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.multiple_choice_head = SequenceSummary(config)
self.init_weights()
# Model parallel
self.model_parallel = False
self.device_map = None
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
self.device_map = (
@@ -1330,12 +1333,13 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel):
self.transformer = GPT2Model(config)
self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
self.init_weights()
# Model parallel
self.model_parallel = False
self.device_map = None
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
@@ -1461,12 +1465,13 @@ class GPT2ForTokenClassification(GPT2PreTrainedModel):
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Model parallel
self.model_parallel = False
self.device_map = None
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
@@ -486,8 +486,9 @@ class GPTNeoModel(GPTNeoPreTrainedModel):
self.h = nn.ModuleList([GPTNeoBlock(config, layer_id=i) for i in range(config.num_layers)])
self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.wte
@@ -675,7 +676,8 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
self.transformer = GPTNeoModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.lm_head
@@ -823,7 +825,8 @@ class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel):
self.transformer = GPTNeoModel(config)
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
@@ -444,13 +444,15 @@ class GPTJModel(GPTJPreTrainedModel):
self.drop = nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList([GPTJBlock(config) for _ in range(config.n_layer)])
self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
self.init_weights()
# Model parallel
self.model_parallel = False
self.device_map = None
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
# Check validity of device_map
@@ -680,12 +682,14 @@ class GPTJForCausalLM(GPTJPreTrainedModel):
super().__init__(config)
self.transformer = GPTJModel(config)
self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
self.init_weights()
# Model parallel
self.model_parallel = False
self.device_map = None
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
self.device_map = (
@@ -855,12 +859,13 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel):
self.transformer = GPTJModel(config)
self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
self.init_weights()
# Model parallel
self.model_parallel = False
self.device_map = None
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
@@ -899,7 +899,8 @@ class HubertModel(HubertPreTrainedModel):
else:
self.encoder = HubertEncoder(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states
def _mask_hidden_states(
@@ -1039,7 +1040,8 @@ class HubertForCTC(HubertPreTrainedModel):
)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def freeze_feature_extractor(self):
"""
@@ -1147,7 +1149,8 @@ class HubertForSequenceClassification(HubertPreTrainedModel):
self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def freeze_feature_extractor(self):
"""
@@ -754,7 +754,8 @@ class IBertModel(IBertPreTrainedModel):
self.pooler = IBertPooler(config) if add_pooling_layer else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -865,7 +866,8 @@ class IBertForMaskedLM(IBertPreTrainedModel):
self.ibert = IBertModel(config, add_pooling_layer=False)
self.lm_head = IBertLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.lm_head.decoder
@@ -979,7 +981,8 @@ class IBertForSequenceClassification(IBertPreTrainedModel):
self.ibert = IBertModel(config, add_pooling_layer=False)
self.classifier = IBertClassificationHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1074,7 +1077,8 @@ class IBertForMultipleChoice(IBertPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
@@ -1168,7 +1172,8 @@ class IBertForTokenClassification(IBertPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1277,7 +1282,8 @@ class IBertForQuestionAnswering(IBertPreTrainedModel):
self.ibert = IBertModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -714,7 +714,8 @@ class LayoutLMModel(LayoutLMPreTrainedModel):
self.encoder = LayoutLMEncoder(config)
self.pooler = LayoutLMPooler(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -856,7 +857,8 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
self.layoutlm = LayoutLMModel(config)
self.cls = LayoutLMOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.layoutlm.embeddings.word_embeddings
@@ -979,7 +981,8 @@ class LayoutLMForSequenceClassification(LayoutLMPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.layoutlm.embeddings.word_embeddings
@@ -1109,7 +1112,8 @@ class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.layoutlm.embeddings.word_embeddings
@@ -724,7 +724,8 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
self.encoder = LayoutLMv2Encoder(config)
self.pooler = LayoutLMv2Pooler(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -957,7 +958,8 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size * 3, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.layoutlmv2.embeddings.word_embeddings
@@ -1124,7 +1126,8 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.layoutlmv2.embeddings.word_embeddings
@@ -1239,7 +1242,8 @@ class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):
self.layoutlmv2 = LayoutLMv2Model(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.layoutlmv2.embeddings.word_embeddings
@@ -1629,8 +1629,9 @@ class LEDEncoder(LEDPreTrainedModel):
self.layers = nn.ModuleList([LEDEncoderLayer(config, i) for i in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def _merge_to_attention_mask(self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor):
# longformer self attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn)
@@ -1904,8 +1905,9 @@ class LEDDecoder(LEDPreTrainedModel):
self.layers = nn.ModuleList([LEDDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
@@ -2156,7 +2158,8 @@ class LEDModel(LEDPreTrainedModel):
self.encoder = LEDEncoder(config, self.shared)
self.decoder = LEDDecoder(config, self.shared)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.shared
@@ -2283,7 +2286,8 @@ class LEDForConditionalGeneration(LEDPreTrainedModel):
self.register_buffer("final_logits_bias", torch.zeros((1, self.led.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.led.shared.num_embeddings, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.led.get_encoder()
@@ -1511,7 +1511,8 @@ class LongformerModel(LongformerPreTrainedModel):
self.encoder = LongformerEncoder(config)
self.pooler = LongformerPooler(config) if add_pooling_layer else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -1713,7 +1714,8 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
self.longformer = LongformerModel(config, add_pooling_layer=False)
self.lm_head = LongformerLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.lm_head.decoder
@@ -1818,7 +1820,8 @@ class LongformerForSequenceClassification(LongformerPreTrainedModel):
self.longformer = LongformerModel(config, add_pooling_layer=False)
self.classifier = LongformerClassificationHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1943,7 +1946,8 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel):
self.longformer = LongformerModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=LongformerQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
@@ -2080,7 +2084,8 @@ class LongformerForTokenClassification(LongformerPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -2170,7 +2175,8 @@ class LongformerForMultipleChoice(LongformerPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(
LONGFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
@@ -818,7 +818,8 @@ class LukeModel(LukePreTrainedModel):
self.pooler = LukePooler(config) if add_pooling_layer else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -1029,7 +1030,8 @@ class LukeForEntityClassification(LukePreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=EntityClassificationOutput, config_class=_CONFIG_FOR_DOC)
@@ -1142,7 +1144,8 @@ class LukeForEntityPairClassification(LukePreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size * 2, config.num_labels, False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=EntityPairClassificationOutput, config_class=_CONFIG_FOR_DOC)
@@ -1257,7 +1260,8 @@ class LukeForEntitySpanClassification(LukePreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size * 3, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=EntitySpanClassificationOutput, config_class=_CONFIG_FOR_DOC)
@@ -891,7 +891,8 @@ class LxmertModel(LxmertPreTrainedModel):
self.embeddings = LxmertEmbeddings(config)
self.encoder = LxmertEncoder(config)
self.pooler = LxmertPooler(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -1048,7 +1049,8 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
self.answer_head = LxmertVisualAnswerHead(config, self.num_qa_labels)
# Weight initialization
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
# Loss functions
self.loss_fcts = {
@@ -1303,7 +1305,8 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
self.answer_head = LxmertVisualAnswerHead(config, self.num_qa_labels)
# Weight initialization
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
# Loss function
self.loss = CrossEntropyLoss()
@@ -705,8 +705,9 @@ class M2M100Encoder(M2M100PreTrainedModel):
self.layers = nn.ModuleList([M2M100EncoderLayer(config) for _ in range(config.encoder_layers)])
self.layer_norm = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
@@ -870,8 +871,9 @@ class M2M100Decoder(M2M100PreTrainedModel):
self.layers = nn.ModuleList([M2M100DecoderLayer(config) for _ in range(config.decoder_layers)])
self.layer_norm = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
@@ -1113,7 +1115,8 @@ class M2M100Model(M2M100PreTrainedModel):
self.encoder = M2M100Encoder(config, self.shared)
self.decoder = M2M100Decoder(config, self.shared)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.shared
@@ -1232,7 +1235,8 @@ class M2M100ForConditionalGeneration(M2M100PreTrainedModel):
self.model = M2M100Model(config)
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
@@ -668,8 +668,10 @@ class MarianEncoder(MarianPreTrainedModel):
self.padding_idx,
)
self.layers = nn.ModuleList([MarianEncoderLayer(config) for _ in range(config.encoder_layers)])
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
@@ -829,8 +831,10 @@ class MarianDecoder(MarianPreTrainedModel):
self.padding_idx,
)
self.layers = nn.ModuleList([MarianDecoderLayer(config) for _ in range(config.decoder_layers)])
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
@@ -1087,7 +1091,8 @@ class MarianModel(MarianPreTrainedModel):
self.encoder = MarianEncoder(config, self.shared)
self.decoder = MarianDecoder(config, self.shared)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.shared
@@ -1220,7 +1225,8 @@ class MarianMTModel(MarianPreTrainedModel):
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
@@ -1399,7 +1405,8 @@ class MarianForCausalLM(MarianPreTrainedModel):
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
@@ -696,8 +696,14 @@ class MBartEncoder(MBartPreTrainedModel):
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.layer_norm = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def _backward_compatibility_gradient_checkpointing(self):
# Override to not delete the attribute from the config
if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False):
self.gradient_checkpointing_enable()
def forward(
self,
@@ -862,8 +868,9 @@ class MBartDecoder(MBartPreTrainedModel):
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.layer_norm = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
@@ -1123,7 +1130,8 @@ class MBartModel(MBartPreTrainedModel):
self.encoder = MBartEncoder(config, self.shared)
self.decoder = MBartDecoder(config, self.shared)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.shared
@@ -1243,7 +1251,8 @@ class MBartForConditionalGeneration(MBartPreTrainedModel):
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
@@ -1664,7 +1673,8 @@ class MBartForCausalLM(MBartPreTrainedModel):
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
@@ -857,7 +857,8 @@ class MegatronBertModel(MegatronBertPreTrainedModel):
self.pooler = MegatronBertPooler(config) if add_pooling_layer else None
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
@@ -1018,7 +1019,8 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
self.bert = MegatronBertModel(config)
self.cls = MegatronBertPreTrainingHeads(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -1127,7 +1129,8 @@ class MegatronBertForCausalLM(MegatronBertPreTrainedModel):
self.bert = MegatronBertModel(config, add_pooling_layer=False)
self.cls = MegatronBertOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -1274,7 +1277,8 @@ class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
self.bert = MegatronBertModel(config, add_pooling_layer=False)
self.cls = MegatronBertOnlyMLMHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -1375,7 +1379,8 @@ class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel):
self.bert = MegatronBertModel(config)
self.cls = MegatronBertOnlyNSPHead(config)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
@@ -1478,7 +1483,8 @@ class MegatronBertForSequenceClassification(MegatronBertPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1574,7 +1580,8 @@ class MegatronBertForMultipleChoice(MegatronBertPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(
MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
@@ -1671,7 +1678,8 @@ class MegatronBertForTokenClassification(MegatronBertPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
@@ -1761,7 +1769,8 @@ class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel):
self.bert = MegatronBertModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
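
Every hunk in this commit applies the same mechanical substitution. For a model built on these base classes outside the library, the corresponding change would look roughly like the following sketch (the class MyElectraClassifier is purely illustrative and does not appear in this diff):

from torch import nn
from transformers.models.electra.modeling_electra import ElectraModel, ElectraPreTrainedModel

# Illustrative only: a custom head following the new convention.
class MyElectraClassifier(ElectraPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.electra = ElectraModel(config)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Before this commit: self.init_weights()
        # Initialize weights and apply final processing
        self.post_init()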