Unverified Commit d83b0e0c authored by Sylvain Gugger, committed by GitHub

Add a post init method to all models (#14431)

* Add a post init method to all models

* Fix tests

* Fix last tests

* Fix templates

* Add comment

* Forgot to save
parent 08816de1
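Every hunk below makes the same substitution: the direct call to `self.init_weights()` at the end of a model's `__init__` is replaced by `self.post_init()`, a hook on the base class that runs weight initialization plus any final processing (the new comment spells this out, and the MBart hunk shows related gradient-checkpointing backward-compatibility handling). The following is a minimal, self-contained sketch of the pattern only, not the actual `transformers.PreTrainedModel` implementation; `TinyPreTrainedModel` and `TinyModel` are hypothetical stand-ins.

import torch.nn as nn


class TinyPreTrainedModel(nn.Module):
    """Hypothetical stand-in for the library's base class, for illustration only."""

    supports_gradient_checkpointing = True

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.gradient_checkpointing = False

    def init_weights(self):
        # Placeholder for the real per-model weight initialization.
        for module in self.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=0.02)

    def gradient_checkpointing_enable(self):
        self.gradient_checkpointing = True

    def post_init(self):
        # One hook executed at the end of every subclass __init__: weight
        # initialization plus final processing, here honoring a legacy
        # `gradient_checkpointing` flag that may be set on the config.
        self.init_weights()
        if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False):
            self.gradient_checkpointing_enable()


class TinyModel(TinyPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.encoder = nn.Linear(4, 4)
        # Initialize weights and apply final processing
        self.post_init()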
@@ -817,7 +817,8 @@ class ElectraModel(ElectraPreTrainedModel):
         self.encoder = ElectraEncoder(config)
         self.config = config

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.embeddings.word_embeddings
@@ -939,7 +940,8 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
         self.electra = ElectraModel(config)
         self.classifier = ElectraClassificationHead(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -1033,7 +1035,8 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
         self.electra = ElectraModel(config)
         self.discriminator_predictions = ElectraDiscriminatorPredictions(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=ElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -1128,7 +1131,8 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
         self.generator_predictions = ElectraGeneratorPredictions(config)
         self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_output_embeddings(self):
         return self.generator_lm_head
@@ -1216,7 +1220,8 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
         )
         self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -1305,7 +1310,8 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
         self.electra = ElectraModel(config)
         self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -1406,7 +1412,8 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
         self.sequence_summary = SequenceSummary(config)
         self.classifier = nn.Linear(config.hidden_size, 1)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
     @add_code_sample_docstrings(
......
@@ -336,7 +336,8 @@ class FlaubertWithLMHeadModel(XLMWithLMHeadModel):
     def __init__(self, config):
         super().__init__(config)
         self.transformer = FlaubertModel(config)
-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()


 @add_start_docstrings(
@@ -357,7 +358,8 @@ class FlaubertForSequenceClassification(XLMForSequenceClassification):
     def __init__(self, config):
         super().__init__(config)
         self.transformer = FlaubertModel(config)
-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()


 @add_start_docstrings(
@@ -378,7 +380,8 @@ class FlaubertForTokenClassification(XLMForTokenClassification):
     def __init__(self, config):
         super().__init__(config)
         self.transformer = FlaubertModel(config)
-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()


 @add_start_docstrings(
@@ -399,7 +402,8 @@ class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple):
     def __init__(self, config):
         super().__init__(config)
         self.transformer = FlaubertModel(config)
-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()


 @add_start_docstrings(
@@ -420,7 +424,8 @@ class FlaubertForQuestionAnswering(XLMForQuestionAnswering):
     def __init__(self, config):
         super().__init__(config)
         self.transformer = FlaubertModel(config)
-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()


 @add_start_docstrings(
@@ -441,4 +446,5 @@ class FlaubertForMultipleChoice(XLMForMultipleChoice):
     def __init__(self, config):
         super().__init__(config)
         self.transformer = FlaubertModel(config)
-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()
@@ -535,7 +535,8 @@ class FNetModel(FNetPreTrainedModel):
         self.pooler = FNetPooler(config) if add_pooling_layer else None

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.embeddings.word_embeddings
@@ -633,7 +634,8 @@ class FNetForPreTraining(FNetPreTrainedModel):
         self.fnet = FNetModel(config)
         self.cls = FNetPreTrainingHeads(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_output_embeddings(self):
         return self.cls.predictions.decoder
@@ -723,7 +725,8 @@ class FNetForMaskedLM(FNetPreTrainedModel):
         self.fnet = FNetModel(config)
         self.cls = FNetOnlyMLMHead(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_output_embeddings(self):
         return self.cls.predictions.decoder
@@ -791,7 +794,8 @@ class FNetForNextSentencePrediction(FNetPreTrainedModel):
         self.fnet = FNetModel(config)
         self.cls = FNetOnlyNSPHead(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
@@ -885,7 +889,8 @@ class FNetForSequenceClassification(FNetPreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -969,7 +974,8 @@ class FNetForMultipleChoice(FNetPreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, 1)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
     @add_code_sample_docstrings(
@@ -1050,7 +1056,8 @@ class FNetForTokenClassification(FNetPreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -1119,7 +1126,8 @@ class FNetForQuestionAnswering(FNetPreTrainedModel):
         self.fnet = FNetModel(config)
         self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(FNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
......
@@ -1003,7 +1003,8 @@ class FSMTModel(PretrainedFSMTModel):
         self.encoder = FSMTEncoder(config, encoder_embed_tokens)
         self.decoder = FSMTDecoder(config, decoder_embed_tokens)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(FSMT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
......
@@ -900,7 +900,8 @@ class FunnelBaseModel(FunnelPreTrainedModel):
         self.embeddings = FunnelEmbeddings(config)
         self.encoder = FunnelEncoder(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.embeddings.word_embeddings
@@ -977,7 +978,8 @@ class FunnelModel(FunnelPreTrainedModel):
         self.encoder = FunnelEncoder(config)
         self.decoder = FunnelDecoder(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.embeddings.word_embeddings
@@ -1082,7 +1084,8 @@ class FunnelForPreTraining(FunnelPreTrainedModel):
         self.funnel = FunnelModel(config)
         self.discriminator_predictions = FunnelDiscriminatorPredictions(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=FunnelForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -1164,7 +1167,8 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):
         self.funnel = FunnelModel(config)
         self.lm_head = nn.Linear(config.d_model, config.vocab_size)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_output_embeddings(self):
         return self.lm_head
@@ -1244,7 +1248,8 @@ class FunnelForSequenceClassification(FunnelPreTrainedModel):
         self.funnel = FunnelBaseModel(config)
         self.classifier = FunnelClassificationHead(config, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -1334,7 +1339,8 @@ class FunnelForMultipleChoice(FunnelPreTrainedModel):
         self.funnel = FunnelBaseModel(config)
         self.classifier = FunnelClassificationHead(config, 1)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
     @add_code_sample_docstrings(
@@ -1420,7 +1426,8 @@ class FunnelForTokenClassification(FunnelPreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -1502,7 +1509,8 @@ class FunnelForQuestionAnswering(FunnelPreTrainedModel):
         self.funnel = FunnelModel(config)
         self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
......
@@ -677,13 +677,14 @@ class GPT2Model(GPT2PreTrainedModel):
         self.h = nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)])
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

-        self.init_weights()
-
         # Model parallel
         self.model_parallel = False
         self.device_map = None
         self.gradient_checkpointing = False

+        # Initialize weights and apply final processing
+        self.post_init()
+
     @add_start_docstrings(PARALLELIZE_DOCSTRING)
     def parallelize(self, device_map=None):
         # Check validity of device_map
@@ -947,12 +948,13 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         self.transformer = GPT2Model(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

-        self.init_weights()
-
         # Model parallel
         self.model_parallel = False
         self.device_map = None

+        # Initialize weights and apply final processing
+        self.post_init()
+
     @add_start_docstrings(PARALLELIZE_DOCSTRING)
     def parallelize(self, device_map=None):
         self.device_map = (
@@ -1117,12 +1119,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.multiple_choice_head = SequenceSummary(config)

-        self.init_weights()
-
         # Model parallel
         self.model_parallel = False
         self.device_map = None

+        # Initialize weights and apply final processing
+        self.post_init()
+
     @add_start_docstrings(PARALLELIZE_DOCSTRING)
     def parallelize(self, device_map=None):
         self.device_map = (
@@ -1330,12 +1333,13 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel):
         self.transformer = GPT2Model(config)
         self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)

-        self.init_weights()
-
         # Model parallel
         self.model_parallel = False
         self.device_map = None

+        # Initialize weights and apply final processing
+        self.post_init()
+
     @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
@@ -1461,12 +1465,13 @@ class GPT2ForTokenClassification(GPT2PreTrainedModel):
         self.dropout = nn.Dropout(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
-
         # Model parallel
         self.model_parallel = False
         self.device_map = None

+        # Initialize weights and apply final processing
+        self.post_init()
+
     @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
......
@@ -486,8 +486,9 @@ class GPTNeoModel(GPTNeoPreTrainedModel):
         self.h = nn.ModuleList([GPTNeoBlock(config, layer_id=i) for i in range(config.num_layers)])
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

-        self.init_weights()
         self.gradient_checkpointing = False

+        # Initialize weights and apply final processing
+        self.post_init()
+
     def get_input_embeddings(self):
         return self.wte
@@ -675,7 +676,8 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel):
         self.transformer = GPTNeoModel(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_output_embeddings(self):
         return self.lm_head
@@ -823,7 +825,8 @@ class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel):
         self.transformer = GPTNeoModel(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
......
@@ -444,13 +444,15 @@ class GPTJModel(GPTJPreTrainedModel):
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([GPTJBlock(config) for _ in range(config.n_layer)])
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

-        self.init_weights()
-
         # Model parallel
         self.model_parallel = False
         self.device_map = None
         self.gradient_checkpointing = False

+        # Initialize weights and apply final processing
+        self.post_init()
+
     @add_start_docstrings(PARALLELIZE_DOCSTRING)
     def parallelize(self, device_map=None):
         # Check validity of device_map
@@ -680,12 +682,14 @@ class GPTJForCausalLM(GPTJPreTrainedModel):
         super().__init__(config)
         self.transformer = GPTJModel(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size)

-        self.init_weights()
-
         # Model parallel
         self.model_parallel = False
         self.device_map = None

+        # Initialize weights and apply final processing
+        self.post_init()
+
     @add_start_docstrings(PARALLELIZE_DOCSTRING)
     def parallelize(self, device_map=None):
         self.device_map = (
@@ -855,12 +859,13 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel):
         self.transformer = GPTJModel(config)
         self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)

-        self.init_weights()
-
         # Model parallel
         self.model_parallel = False
         self.device_map = None

+        # Initialize weights and apply final processing
+        self.post_init()
+
     @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
......
@@ -899,7 +899,8 @@ class HubertModel(HubertPreTrainedModel):
         else:
             self.encoder = HubertEncoder(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states
     def _mask_hidden_states(
@@ -1039,7 +1040,8 @@ class HubertForCTC(HubertPreTrainedModel):
             )
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def freeze_feature_extractor(self):
         """
@@ -1147,7 +1149,8 @@ class HubertForSequenceClassification(HubertPreTrainedModel):
         self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
         self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def freeze_feature_extractor(self):
         """
......
@@ -754,7 +754,8 @@ class IBertModel(IBertPreTrainedModel):
         self.pooler = IBertPooler(config) if add_pooling_layer else None

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.embeddings.word_embeddings
@@ -865,7 +866,8 @@ class IBertForMaskedLM(IBertPreTrainedModel):
         self.ibert = IBertModel(config, add_pooling_layer=False)
         self.lm_head = IBertLMHead(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_output_embeddings(self):
         return self.lm_head.decoder
@@ -979,7 +981,8 @@ class IBertForSequenceClassification(IBertPreTrainedModel):
         self.ibert = IBertModel(config, add_pooling_layer=False)
         self.classifier = IBertClassificationHead(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -1074,7 +1077,8 @@ class IBertForMultipleChoice(IBertPreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, 1)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
     @add_code_sample_docstrings(
@@ -1168,7 +1172,8 @@ class IBertForTokenClassification(IBertPreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -1277,7 +1282,8 @@ class IBertForQuestionAnswering(IBertPreTrainedModel):
         self.ibert = IBertModel(config, add_pooling_layer=False)
         self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
......
@@ -714,7 +714,8 @@ class LayoutLMModel(LayoutLMPreTrainedModel):
         self.encoder = LayoutLMEncoder(config)
         self.pooler = LayoutLMPooler(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.embeddings.word_embeddings
@@ -856,7 +857,8 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
         self.layoutlm = LayoutLMModel(config)
         self.cls = LayoutLMOnlyMLMHead(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.layoutlm.embeddings.word_embeddings
@@ -979,7 +981,8 @@ class LayoutLMForSequenceClassification(LayoutLMPreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.layoutlm.embeddings.word_embeddings
@@ -1109,7 +1112,8 @@ class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.layoutlm.embeddings.word_embeddings
......
@@ -724,7 +724,8 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
         self.encoder = LayoutLMv2Encoder(config)
         self.pooler = LayoutLMv2Pooler(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.embeddings.word_embeddings
@@ -957,7 +958,8 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size * 3, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.layoutlmv2.embeddings.word_embeddings
@@ -1124,7 +1126,8 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.layoutlmv2.embeddings.word_embeddings
@@ -1239,7 +1242,8 @@ class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):
         self.layoutlmv2 = LayoutLMv2Model(config)
         self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.layoutlmv2.embeddings.word_embeddings
......
@@ -1629,8 +1629,9 @@ class LEDEncoder(LEDPreTrainedModel):
         self.layers = nn.ModuleList([LEDEncoderLayer(config, i) for i in range(config.encoder_layers)])
         self.layernorm_embedding = nn.LayerNorm(embed_dim)

-        self.init_weights()
         self.gradient_checkpointing = False

+        # Initialize weights and apply final processing
+        self.post_init()
+
     def _merge_to_attention_mask(self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor):
         # longformer self attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn)
@@ -1904,8 +1905,9 @@ class LEDDecoder(LEDPreTrainedModel):
         self.layers = nn.ModuleList([LEDDecoderLayer(config) for _ in range(config.decoder_layers)])
         self.layernorm_embedding = nn.LayerNorm(config.d_model)

-        self.init_weights()
         self.gradient_checkpointing = False

+        # Initialize weights and apply final processing
+        self.post_init()
+
     def forward(
         self,
@@ -2156,7 +2158,8 @@ class LEDModel(LEDPreTrainedModel):
         self.encoder = LEDEncoder(config, self.shared)
         self.decoder = LEDDecoder(config, self.shared)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.shared
@@ -2283,7 +2286,8 @@ class LEDForConditionalGeneration(LEDPreTrainedModel):
         self.register_buffer("final_logits_bias", torch.zeros((1, self.led.shared.num_embeddings)))
         self.lm_head = nn.Linear(config.d_model, self.led.shared.num_embeddings, bias=False)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_encoder(self):
         return self.led.get_encoder()
......
@@ -1511,7 +1511,8 @@ class LongformerModel(LongformerPreTrainedModel):
         self.encoder = LongformerEncoder(config)
         self.pooler = LongformerPooler(config) if add_pooling_layer else None

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.embeddings.word_embeddings
@@ -1713,7 +1714,8 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
         self.longformer = LongformerModel(config, add_pooling_layer=False)
         self.lm_head = LongformerLMHead(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_output_embeddings(self):
         return self.lm_head.decoder
@@ -1818,7 +1820,8 @@ class LongformerForSequenceClassification(LongformerPreTrainedModel):
         self.longformer = LongformerModel(config, add_pooling_layer=False)
         self.classifier = LongformerClassificationHead(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -1943,7 +1946,8 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel):
         self.longformer = LongformerModel(config, add_pooling_layer=False)
         self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=LongformerQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
@@ -2080,7 +2084,8 @@ class LongformerForTokenClassification(LongformerPreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -2170,7 +2175,8 @@ class LongformerForMultipleChoice(LongformerPreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, 1)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(
         LONGFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
......
@@ -818,7 +818,8 @@ class LukeModel(LukePreTrainedModel):
         self.pooler = LukePooler(config) if add_pooling_layer else None

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.embeddings.word_embeddings
@@ -1029,7 +1030,8 @@ class LukeForEntityClassification(LukePreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=EntityClassificationOutput, config_class=_CONFIG_FOR_DOC)
@@ -1142,7 +1144,8 @@ class LukeForEntityPairClassification(LukePreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size * 2, config.num_labels, False)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=EntityPairClassificationOutput, config_class=_CONFIG_FOR_DOC)
@@ -1257,7 +1260,8 @@ class LukeForEntitySpanClassification(LukePreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size * 3, config.num_labels)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     @add_start_docstrings_to_model_forward(LUKE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=EntitySpanClassificationOutput, config_class=_CONFIG_FOR_DOC)
......
@@ -891,7 +891,8 @@ class LxmertModel(LxmertPreTrainedModel):
         self.embeddings = LxmertEmbeddings(config)
         self.encoder = LxmertEncoder(config)
         self.pooler = LxmertPooler(config)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.embeddings.word_embeddings
@@ -1048,7 +1049,8 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
         self.answer_head = LxmertVisualAnswerHead(config, self.num_qa_labels)

         # Weight initialization
-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

         # Loss functions
         self.loss_fcts = {
@@ -1303,7 +1305,8 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
         self.answer_head = LxmertVisualAnswerHead(config, self.num_qa_labels)

         # Weight initialization
-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

         # Loss function
         self.loss = CrossEntropyLoss()
......
@@ -705,8 +705,9 @@ class M2M100Encoder(M2M100PreTrainedModel):
         self.layers = nn.ModuleList([M2M100EncoderLayer(config) for _ in range(config.encoder_layers)])
         self.layer_norm = nn.LayerNorm(config.d_model)

-        self.init_weights()
         self.gradient_checkpointing = False

+        # Initialize weights and apply final processing
+        self.post_init()
+
     def forward(
         self,
@@ -870,8 +871,9 @@ class M2M100Decoder(M2M100PreTrainedModel):
         self.layers = nn.ModuleList([M2M100DecoderLayer(config) for _ in range(config.decoder_layers)])
         self.layer_norm = nn.LayerNorm(config.d_model)

-        self.init_weights()
         self.gradient_checkpointing = False

+        # Initialize weights and apply final processing
+        self.post_init()
+
     def forward(
         self,
@@ -1113,7 +1115,8 @@ class M2M100Model(M2M100PreTrainedModel):
         self.encoder = M2M100Encoder(config, self.shared)
         self.decoder = M2M100Decoder(config, self.shared)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.shared
@@ -1232,7 +1235,8 @@ class M2M100ForConditionalGeneration(M2M100PreTrainedModel):
         self.model = M2M100Model(config)
         self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_encoder(self):
         return self.model.get_encoder()
......
@@ -668,8 +668,10 @@ class MarianEncoder(MarianPreTrainedModel):
             self.padding_idx,
         )
         self.layers = nn.ModuleList([MarianEncoderLayer(config) for _ in range(config.encoder_layers)])

-        self.init_weights()
         self.gradient_checkpointing = False

+        # Initialize weights and apply final processing
+        self.post_init()
+
     def forward(
         self,
@@ -829,8 +831,10 @@ class MarianDecoder(MarianPreTrainedModel):
             self.padding_idx,
         )
         self.layers = nn.ModuleList([MarianDecoderLayer(config) for _ in range(config.decoder_layers)])

-        self.init_weights()
         self.gradient_checkpointing = False

+        # Initialize weights and apply final processing
+        self.post_init()
+
     def get_input_embeddings(self):
         return self.embed_tokens
@@ -1087,7 +1091,8 @@ class MarianModel(MarianPreTrainedModel):
         self.encoder = MarianEncoder(config, self.shared)
         self.decoder = MarianDecoder(config, self.shared)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.shared
@@ -1220,7 +1225,8 @@ class MarianMTModel(MarianPreTrainedModel):
         self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
         self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_encoder(self):
         return self.model.get_encoder()
@@ -1399,7 +1405,8 @@ class MarianForCausalLM(MarianPreTrainedModel):
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

-        self.init_weights()
+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.model.decoder.embed_tokens
......
...@@ -696,8 +696,14 @@ class MBartEncoder(MBartPreTrainedModel): ...@@ -696,8 +696,14 @@ class MBartEncoder(MBartPreTrainedModel):
self.layernorm_embedding = nn.LayerNorm(embed_dim) self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.layer_norm = nn.LayerNorm(config.d_model) self.layer_norm = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def _backward_compatibility_gradient_checkpointing(self):
# Override to not delete the attribute from the config
if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False):
self.gradient_checkpointing_enable()
def forward( def forward(
self, self,
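Note: unlike the other classes in this diff, `MBartEncoder` also overrides `_backward_compatibility_gradient_checkpointing` so the legacy `gradient_checkpointing` flag stays on the config instead of being consumed. Assuming the default hook deletes the attribute after enabling the feature (as the override's comment implies), the observable difference would be roughly the following; the tiny config values are arbitrary and the asserts are a hedged behavior check, not guaranteed by this diff alone.

```python
from transformers import MBartConfig
from transformers.models.mbart.modeling_mbart import MBartEncoder

# Tiny config so the encoder is cheap to build; set the legacy flag directly,
# as an old serialized config might carry it.
config = MBartConfig(
    vocab_size=64, d_model=16, encoder_layers=1, encoder_attention_heads=2,
    encoder_ffn_dim=32, max_position_embeddings=32,
)
config.gradient_checkpointing = True

encoder = MBartEncoder(config)

assert encoder.gradient_checkpointing                    # feature turned on by post_init()
assert getattr(config, "gradient_checkpointing", False)  # flag kept on the config, not deleted
```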
...@@ -862,8 +868,9 @@ class MBartDecoder(MBartPreTrainedModel): ...@@ -862,8 +868,9 @@ class MBartDecoder(MBartPreTrainedModel):
self.layernorm_embedding = nn.LayerNorm(config.d_model) self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.layer_norm = nn.LayerNorm(config.d_model) self.layer_norm = nn.LayerNorm(config.d_model)
self.init_weights()
self.gradient_checkpointing = False self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embed_tokens return self.embed_tokens
...@@ -1123,7 +1130,8 @@ class MBartModel(MBartPreTrainedModel): ...@@ -1123,7 +1130,8 @@ class MBartModel(MBartPreTrainedModel):
self.encoder = MBartEncoder(config, self.shared) self.encoder = MBartEncoder(config, self.shared)
self.decoder = MBartDecoder(config, self.shared) self.decoder = MBartDecoder(config, self.shared)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self): def get_input_embeddings(self):
return self.shared return self.shared
...@@ -1243,7 +1251,8 @@ class MBartForConditionalGeneration(MBartPreTrainedModel): ...@@ -1243,7 +1251,8 @@ class MBartForConditionalGeneration(MBartPreTrainedModel):
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def get_encoder(self): def get_encoder(self):
return self.model.get_encoder() return self.model.get_encoder()
...@@ -1664,7 +1673,8 @@ class MBartForCausalLM(MBartPreTrainedModel): ...@@ -1664,7 +1673,8 @@ class MBartForCausalLM(MBartPreTrainedModel):
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self): def get_input_embeddings(self):
return self.model.decoder.embed_tokens return self.model.decoder.embed_tokens
......
...@@ -857,7 +857,8 @@ class MegatronBertModel(MegatronBertPreTrainedModel): ...@@ -857,7 +857,8 @@ class MegatronBertModel(MegatronBertPreTrainedModel):
self.pooler = MegatronBertPooler(config) if add_pooling_layer else None self.pooler = MegatronBertPooler(config) if add_pooling_layer else None
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings.word_embeddings return self.embeddings.word_embeddings
...@@ -1018,7 +1019,8 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel): ...@@ -1018,7 +1019,8 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
self.bert = MegatronBertModel(config) self.bert = MegatronBertModel(config)
self.cls = MegatronBertPreTrainingHeads(config) self.cls = MegatronBertPreTrainingHeads(config)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self): def get_output_embeddings(self):
return self.cls.predictions.decoder return self.cls.predictions.decoder
...@@ -1127,7 +1129,8 @@ class MegatronBertForCausalLM(MegatronBertPreTrainedModel): ...@@ -1127,7 +1129,8 @@ class MegatronBertForCausalLM(MegatronBertPreTrainedModel):
self.bert = MegatronBertModel(config, add_pooling_layer=False) self.bert = MegatronBertModel(config, add_pooling_layer=False)
self.cls = MegatronBertOnlyMLMHead(config) self.cls = MegatronBertOnlyMLMHead(config)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self): def get_output_embeddings(self):
return self.cls.predictions.decoder return self.cls.predictions.decoder
...@@ -1274,7 +1277,8 @@ class MegatronBertForMaskedLM(MegatronBertPreTrainedModel): ...@@ -1274,7 +1277,8 @@ class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
self.bert = MegatronBertModel(config, add_pooling_layer=False) self.bert = MegatronBertModel(config, add_pooling_layer=False)
self.cls = MegatronBertOnlyMLMHead(config) self.cls = MegatronBertOnlyMLMHead(config)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self): def get_output_embeddings(self):
return self.cls.predictions.decoder return self.cls.predictions.decoder
...@@ -1375,7 +1379,8 @@ class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel): ...@@ -1375,7 +1379,8 @@ class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel):
self.bert = MegatronBertModel(config) self.bert = MegatronBertModel(config)
self.cls = MegatronBertOnlyNSPHead(config) self.cls = MegatronBertOnlyNSPHead(config)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
...@@ -1478,7 +1483,8 @@ class MegatronBertForSequenceClassification(MegatronBertPreTrainedModel): ...@@ -1478,7 +1483,8 @@ class MegatronBertForSequenceClassification(MegatronBertPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob) self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings( @add_code_sample_docstrings(
...@@ -1574,7 +1580,8 @@ class MegatronBertForMultipleChoice(MegatronBertPreTrainedModel): ...@@ -1574,7 +1580,8 @@ class MegatronBertForMultipleChoice(MegatronBertPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob) self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1) self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward( @add_start_docstrings_to_model_forward(
MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
...@@ -1671,7 +1678,8 @@ class MegatronBertForTokenClassification(MegatronBertPreTrainedModel): ...@@ -1671,7 +1678,8 @@ class MegatronBertForTokenClassification(MegatronBertPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob) self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings( @add_code_sample_docstrings(
...@@ -1761,7 +1769,8 @@ class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel): ...@@ -1761,7 +1769,8 @@ class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel):
self.bert = MegatronBertModel(config, add_pooling_layer=False) self.bert = MegatronBertModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights() # Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings( @add_code_sample_docstrings(
......
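Note: for downstream code that defines its own heads on top of these models, the pattern to replicate is the one shown in every hunk above — build all sub-modules first, then end the constructor with `post_init()`. A hypothetical head (class name and layers are illustrative only):

```python
from torch import nn
from transformers.models.mbart.modeling_mbart import MBartModel, MBartPreTrainedModel


class MBartForMyTask(MBartPreTrainedModel):
    """Hypothetical task head, only to show where the new call sits in __init__."""

    def __init__(self, config):
        super().__init__(config)
        self.model = MBartModel(config)
        self.score = nn.Linear(config.d_model, config.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()
```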