"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "4b3ee9cbc53c6cf6cee6bfae86cc2c6ec0778ee5"
Commit c4403006 authored by Lysandre, committed by Lysandre Debut

External MLM head

parent b21402fc
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" ALBERT model configuration """
from .configuration_utils import PretrainedConfig


class AlbertConfig(PretrainedConfig):
...
@@ -401,6 +401,26 @@ class AlbertModel(BertModel):
        outputs = (sequence_output, pooled_output) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
        return outputs

+class AlbertMLMHead(nn.Module):
+    def __init__(self, config):
+        super(AlbertMLMHead, self).__init__()
+
+        self.LayerNorm = nn.LayerNorm(config.embedding_size)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
+        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
+        self.activation = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+
+        prediction_scores = hidden_states + self.bias
+
+        return prediction_scores
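The new AlbertMLMHead wraps the masked-language-modeling projection that previously lived inline in AlbertForMaskedLM: a dense projection from hidden_size down to embedding_size, the configured activation, LayerNorm, and a decoder back up to vocab_size with its own bias. Below is a minimal, self-contained sketch of that same pipeline; the dimensions are illustrative assumptions rather than ALBERT's real config values, and plain GELU stands in for the ACT2FN lookup.

import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy sizes for illustration only (real values come from AlbertConfig).
hidden_size, embedding_size, vocab_size = 768, 128, 30000

dense = nn.Linear(hidden_size, embedding_size)    # project down to the embedding size
layer_norm = nn.LayerNorm(embedding_size)
decoder = nn.Linear(embedding_size, vocab_size)   # project back up to the vocabulary
bias = nn.Parameter(torch.zeros(vocab_size))      # separate output bias, as in AlbertMLMHead

hidden_states = torch.randn(2, 16, hidden_size)   # (batch, seq_len, hidden_size)
x = layer_norm(F.gelu(dense(hidden_states)))      # dense -> activation -> LayerNorm
prediction_scores = decoder(x) + bias             # (batch, seq_len, vocab_size)
print(prediction_scores.shape)                    # torch.Size([2, 16, 30000])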
@add_start_docstrings("Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) @add_start_docstrings("Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class AlbertForMaskedLM(BertPreTrainedModel): class AlbertForMaskedLM(BertPreTrainedModel):
...@@ -433,13 +453,8 @@ class AlbertForMaskedLM(BertPreTrainedModel): ...@@ -433,13 +453,8 @@ class AlbertForMaskedLM(BertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super(AlbertForMaskedLM, self).__init__(config) super(AlbertForMaskedLM, self).__init__(config)
self.config = config
self.albert = AlbertModel(config) self.albert = AlbertModel(config)
self.LayerNorm = nn.LayerNorm(config.embedding_size) self.predictions = AlbertMLMHead(config)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.dense = nn.Linear(config.hidden_size, config.embedding_size)
self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
self.activation = ACT2FN[config.hidden_act]
self.init_weights() self.init_weights()
self.tie_weights() self.tie_weights()
...@@ -448,17 +463,15 @@ class AlbertForMaskedLM(BertPreTrainedModel): ...@@ -448,17 +463,15 @@ class AlbertForMaskedLM(BertPreTrainedModel):
""" Make sure we are sharing the input and output embeddings. """ Make sure we are sharing the input and output embeddings.
Export to TorchScript can't handle parameter sharing so we are cloning them instead. Export to TorchScript can't handle parameter sharing so we are cloning them instead.
""" """
self._tie_or_clone_weights(self.decoder, self._tie_or_clone_weights(self.predictions.decoder,
self.albert.embeddings.word_embeddings) self.albert.embeddings.word_embeddings)
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
masked_lm_labels=None): masked_lm_labels=None):
outputs = self.albert(input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None) outputs = self.albert(input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None)
sequence_outputs = outputs[0] sequence_outputs = outputs[0]
hidden_states = self.dense(sequence_outputs)
hidden_states = self.activation(hidden_states) prediction_scores = self.predictions(sequence_outputs)
hidden_states = self.LayerNorm(hidden_states)
prediction_scores = self.decoder(hidden_states)
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
if masked_lm_labels is not None: if masked_lm_labels is not None:
......
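With the head factored out, tie_weights now ties the head's decoder to the input word embeddings instead of an attribute on the model itself. As a rough sketch of what that sharing amounts to in the plain (non-TorchScript) case, the snippet below shares one weight matrix between an embedding and a linear output projection; the sizes are toy assumptions and the real _tie_or_clone_weights helper (which clones instead of sharing when exporting to TorchScript) is not reproduced here.

import torch.nn as nn

# Toy sizes for illustration only.
vocab_size, embedding_size = 30000, 128

word_embeddings = nn.Embedding(vocab_size, embedding_size)  # weight shape (vocab_size, embedding_size)
decoder = nn.Linear(embedding_size, vocab_size)             # weight shape (vocab_size, embedding_size) as well

# Share one Parameter between the input embedding and the output projection
# (only the weight matrix is tied; the decoder keeps its own bias).
decoder.weight = word_embeddings.weight

assert decoder.weight.data_ptr() == word_embeddings.weight.data_ptr()

Because both modules now hold the same Parameter object, a gradient update through either of them moves the shared matrix, which keeps the input and output vocabulary representations consistent.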