"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "5c766ecb50a075163efda6e3ff8f1138aedcf31b"
Commit 75feacf1 authored by Rémi Louf

add general structure for Bert2Bert class

parent 15a2fc88
@@ -1310,3 +1310,63 @@ class BertForQuestionAnswering(BertPreTrainedModel):
            outputs = (total_loss,) + outputs

        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)


@add_start_docstrings("Bert encoder-decoder model",
                      BERT_START_DOCSTRING,
                      BERT_INPUTS_DOCSTRING)
class Bert2Bert(BertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:

    Examples::

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = Bert2Bert.from_pretrained('bert-base-uncased')
        input_ids = torch.tensor([tokenizer.encode("Hello, how are you?")])
        outputs = model(input_ids)
        output_text = tokenizer.decode(outputs[0])
        print(output_text)

    """
    def __init__(self, config):
        super(Bert2Bert, self).__init__(config)

        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.decoder = BertDecoder(config)
        self.pooler = BertPooler(config)  # used in forward() to pool the decoder output

    def forward(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        if attention_mask is None:
            attention_mask = torch.ones_like(inputs)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(inputs)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # so we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length].
        # This attention mask is simpler than the triangular masking of causal attention
        # used in OpenAI GPT; we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        embedding_output = self.embeddings(inputs, position_ids=position_ids, token_type_ids=token_type_ids)
        encoder_outputs = self.encoder(embedding_output,
                                       extended_attention_mask,
                                       head_mask=head_mask)
        # The decoder attends over its own inputs and over the encoder's final hidden states.
        decoder_outputs = self.decoder(embedding_output,
                                       encoder_outputs[0],
                                       extended_attention_mask,
                                       head_mask=head_mask)
        sequence_output = decoder_outputs[0]
        pooled_output = self.pooler(sequence_output)

        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
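
As a side note, here is a minimal standalone sketch of the attention-mask transformation performed in forward() above; the example tensor and the printed values are illustrative only and are not part of this commit.

import torch

# 2D padding mask: 1.0 = attend, 0.0 = padding (batch_size=1, seq_len=4)
attention_mask = torch.tensor([[1.0, 1.0, 1.0, 0.0]])

# Broadcast to [batch_size, 1, 1, to_seq_length] so it can be added to attention
# scores of shape [batch_size, num_heads, from_seq_length, to_seq_length].
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

# Turn keep/mask values into additive biases: 0.0 for kept positions,
# -10000.0 for masked positions, so the softmax drives masked weights to ~0.
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

print(extended_attention_mask.shape)  # torch.Size([1, 1, 1, 4])
print(extended_attention_mask)        # 0.0 for the three kept positions, -10000.0 for the padded one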