"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "b25cec13c57656941aac3b920eeb488c1915df18"
Commit 8aa22af0 authored by thomwolf

fixing model

parent 38f740a1
@@ -268,29 +268,31 @@ def main():
         input_mask = input_mask.float().to(device)

         all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
+        all_encoder_layers = all_encoder_layers
-        for enc_layers, example_index in zip(all_encoder_layers, example_indices):
+        for b, example_index in enumerate(example_indices):
             feature = features[example_index.item()]
             unique_id = int(feature.unique_id)
             # feature = unique_id_to_feature[unique_id]
             output_json = collections.OrderedDict()
             output_json["linex_index"] = unique_id
-            all_features = []
+            all_out_features = []
             for (i, token) in enumerate(feature.tokens):
                 all_layers = []
                 for (j, layer_index) in enumerate(layer_indexes):
-                    layer_output = enc_layers[int(layer_index)].detach().cpu().numpy()
+                    layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
+                    layer_output = layer_output[b]
                     layers = collections.OrderedDict()
                     layers["index"] = layer_index
                     layers["values"] = [
-                        round(float(x), 6) for x in layer_output[i:(i + 1)].flat
+                        round(x.item(), 6) for x in layer_output[i]
                     ]
                     all_layers.append(layers)
-                features = collections.OrderedDict()
-                features["token"] = token
-                features["layers"] = all_layers
-                all_features.append(features)
-            output_json["features"] = all_features
+                out_features = collections.OrderedDict()
+                out_features["token"] = token
+                out_features["layers"] = all_layers
+                all_out_features.append(out_features)
+            output_json["features"] = all_out_features
             writer.write(json.dumps(output_json) + "\n")
......
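For reference, here is a minimal standalone sketch (not code from the commit) of what the rewritten loop produces per input line: it mirrors the indexing above, where `all_encoder_layers` is a list of `[batch, seq_len, hidden]` tensors, `b` is the example's position in the batch, and `layer_indexes` selects which layers to export. All names and sizes below are illustrative stand-ins.

```python
import collections
import json

import torch

# Illustrative stand-ins: two encoder layers, batch of one sequence with
# three tokens, hidden size four.
all_encoder_layers = [torch.randn(1, 3, 4) for _ in range(2)]
layer_indexes = [-1, -2]
tokens = ["[CLS]", "hello", "[SEP]"]
b = 0  # position of this example inside the batch

output_json = collections.OrderedDict()
output_json["linex_index"] = 0
all_out_features = []
for i, token in enumerate(tokens):
    all_layers = []
    for layer_index in layer_indexes:
        # Same indexing as the new code: pick the layer, then the example, then the token.
        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()[b]
        layers = collections.OrderedDict()
        layers["index"] = layer_index
        layers["values"] = [round(x.item(), 6) for x in layer_output[i]]
        all_layers.append(layers)
    out_features = collections.OrderedDict()
    out_features["token"] = token
    out_features["layers"] = all_layers
    all_out_features.append(out_features)
output_json["features"] = all_out_features
print(json.dumps(output_json)[:120])  # one JSON record per input line
```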
@@ -27,8 +27,9 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss


 def gelu(x):
-    return 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
-    # OpenAI GPT gelu version was : 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+    # OpenAI GPT gelu version :
+    # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


 class BertConfig(object):
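The fixed `gelu` now computes the exact erf-based definition x·Φ(x) (the old line was missing the leading `x` factor), while the comment keeps the OpenAI GPT tanh approximation. A small self-contained check, my own sketch rather than part of the diff, of how close the two are:

```python
import math

import torch

def gelu(x):
    # Exact GELU, x * Phi(x), as in the fixed line above.
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_openai(x):
    # The tanh approximation kept in the comment above (OpenAI GPT version).
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

x = torch.linspace(-4.0, 4.0, steps=801)
# The two agree closely on this range; the gap should stay well below 1e-2.
print(torch.max(torch.abs(gelu(x) - gelu_openai(x))).item())
```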
@@ -157,7 +158,7 @@ class BERTEmbeddings(nn.Module):
         words_embeddings = self.word_embeddings(input_ids)
         position_embeddings = self.position_embeddings(position_ids)
         token_type_embeddings = self.token_type_embeddings(token_type_ids)

         embeddings = words_embeddings + position_embeddings + token_type_embeddings
         embeddings = self.LayerNorm(embeddings)
         embeddings = self.dropout(embeddings)
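As context for the hunk above, a simplified standalone sketch of the embedding pattern it touches: word, position, and token-type embeddings are summed, then layer-normalized and dropped out. The sizes and the use of `nn.LayerNorm` here are my assumptions for illustration; the real file defines its own layer-norm class.

```python
import torch
import torch.nn as nn

class ToyBertEmbeddings(nn.Module):
    # Hypothetical, simplified stand-in for BERTEmbeddings.
    def __init__(self, vocab_size=100, max_len=32, type_vocab=2, hidden=8, dropout=0.1):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden)
        self.position_embeddings = nn.Embedding(max_len, hidden)
        self.token_type_embeddings = nn.Embedding(type_vocab, hidden)
        self.LayerNorm = nn.LayerNorm(hidden)  # stand-in for the file's own layer norm
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, token_type_ids):
        seq_len = input_ids.size(1)
        position_ids = torch.arange(seq_len, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        embeddings = (self.word_embeddings(input_ids)
                      + self.position_embeddings(position_ids)
                      + self.token_type_embeddings(token_type_ids))
        embeddings = self.LayerNorm(embeddings)
        return self.dropout(embeddings)

ids = torch.randint(0, 100, (2, 5))
print(ToyBertEmbeddings()(ids, torch.zeros_like(ids)).shape)  # torch.Size([2, 5, 8])
```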
@@ -196,19 +197,19 @@ class BERTSelfAttention(nn.Module):
         # T = `to_tensor` sequence length
         # N = `num_attention_heads`
         # H = `size_per_head`
-        query_layer = self.query(hidden_states)
-        key_layer = self.key(hidden_states)
-        value_layer = self.value(hidden_states)
+        mixed_query_layer = self.query(hidden_states)
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)

-        query_layer = self.transpose_for_scores(query_layer)
-        key_layer = self.transpose_for_scores(key_layer, is_key_tensor=True)
-        value_layer = self.transpose_for_scores(value_layer)
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+        key_layer = self.transpose_for_scores(mixed_key_layer) #, is_key_tensor=True)
+        value_layer = self.transpose_for_scores(mixed_value_layer)

         # Take the dot product between "query" and "key" to get the raw
         # attention scores.
         # `attention_scores` = [B, N, F, T]
-        attention_scores = torch.matmul(query_layer, key_layer)
-        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        attention_scores_no_norm = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores_no_mask = attention_scores_no_norm / math.sqrt(self.attention_head_size)

         # TODO clean up this (precompute)
         # MY PYTORCH: w = w * self.b + -1e9 * (1 - self.b)  # TF implem method: mask_attn_weights
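The renamed `mixed_*` tensors are the flat `[B, F, N*H]` projections before the heads are split; the key now stays in `[B, N, T, H]` and is transposed inside the matmul rather than via the dropped `is_key_tensor` flag. A hedged sketch of that reshape-and-score step with made-up sizes (the helper below only mirrors the reshape, it is not the file's method):

```python
import math

import torch

B, F, N, H = 2, 5, 4, 8          # batch, seq length, heads, size per head (illustrative)
hidden = torch.randn(B, F, N * H)

def transpose_for_scores(x, num_heads=N, head_size=H):
    # [B, F, N*H] -> [B, N, F, H]
    x = x.view(x.size(0), x.size(1), num_heads, head_size)
    return x.permute(0, 2, 1, 3)

# Stand-ins for the query/key projections (normally separate nn.Linear layers);
# the same tensor is reused here purely to demonstrate the shapes.
query_layer = transpose_for_scores(hidden)
key_layer = transpose_for_scores(hidden)

# Scaled dot-product scores, [B, N, F, T]; the key is transposed in the matmul,
# as in the new line above, instead of being pre-transposed.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) / math.sqrt(H)
print(attention_scores.shape)  # torch.Size([2, 4, 5, 5])
```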
@@ -220,21 +221,26 @@ class BERTSelfAttention(nn.Module):
         # adder = (1.0 - attention_mask) * -10000.0
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
-        attention_scores += attention_mask
+        attention_scores = attention_scores_no_mask + attention_mask

         # Normalize the attention scores to probabilities.
         # `attention_probs` = [B, N, F, T]
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        attention_probs_no_drop = nn.Softmax(dim=-1)(attention_scores)

         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
-        attention_probs = self.dropout(attention_probs)
+        attention_probs = self.dropout(attention_probs_no_drop)

         context_layer = torch.matmul(attention_probs, value_layer)
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         context_layer = context_layer.view(*new_context_layer_shape)
+        # aux_attention = attention_probs[0, 0, 0, :].view(1, 128, 1)
+        # aux_attention = aux_attention.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
+        # aux_attention = key_layer.permute(0, 2, 3, 1).contiguous().view(1, 128, 768)
+        # aux_attention = key_layer.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)

         return context_layer
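Downstream of the scores, the additive mask, softmax, dropout, and reshape back to `[B, F, N*H]` work as sketched below. This is my own toy continuation of the previous sketch (same illustrative sizes), showing the -10000 mask trick described in the comments above; it is not the class's code.

```python
import torch
import torch.nn as nn

B, N, F, H = 2, 4, 5, 8                       # toy sizes, as before
attention_scores = torch.randn(B, N, F, F)    # stand-in for the scaled scores
value_layer = torch.randn(B, N, F, H)

# 1/0 padding mask -> additive mask: 0 where attended, -10000 where padded.
attention_mask = torch.tensor([[1, 1, 1, 1, 0], [1, 1, 1, 0, 0]], dtype=torch.float)
extended_attention_mask = (1.0 - attention_mask.unsqueeze(1).unsqueeze(2)) * -10000.0

attention_probs = nn.Softmax(dim=-1)(attention_scores + extended_attention_mask)
attention_probs = nn.Dropout(0.1)(attention_probs)

context_layer = torch.matmul(attention_probs, value_layer)          # [B, N, F, H]
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()      # [B, F, N, H]
context_layer = context_layer.view(B, F, N * H)                     # [B, F, N*H]
print(context_layer.shape)  # torch.Size([2, 5, 32])
```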
@@ -246,7 +252,7 @@ class BERTSelfOutput(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

     def forward(self, hidden_states, input_tensor):
-        hidden_states = self.dense(input_tensor)
+        hidden_states = self.dense(hidden_states)
         hidden_states = self.dropout(hidden_states)
         hidden_states = self.LayerNorm(hidden_states + input_tensor)
         return hidden_states
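The one-line change above is the substantive bug fix in this hunk: the output projection must be applied to the attention output (`hidden_states`), not to the residual input, before the residual add and layer norm. A minimal sketch of the corrected sublayer, using `nn.LayerNorm` in place of the file's own layer-norm class as an assumption:

```python
import torch
import torch.nn as nn

class ToySelfOutput(nn.Module):
    # Simplified stand-in for BERTSelfOutput.
    def __init__(self, hidden=32, dropout=0.1):
        super().__init__()
        self.dense = nn.Linear(hidden, hidden)
        self.LayerNorm = nn.LayerNorm(hidden)
        self.dropout = nn.Dropout(dropout)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)   # project the attention output (the fix)
        hidden_states = self.dropout(hidden_states)
        # Residual connection to the sublayer input, then layer norm.
        return self.LayerNorm(hidden_states + input_tensor)

x = torch.randn(2, 5, 32)
print(ToySelfOutput()(x, x).shape)  # torch.Size([2, 5, 32])
```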
@@ -259,8 +265,8 @@ class BERTAttention(nn.Module):
         self.output = BERTSelfOutput(config)

     def forward(self, input_tensor, attention_mask):
-        attention_output = self.self(input_tensor, attention_mask)
-        attention_output = self.output(attention_output, input_tensor)
+        self_output = self.self(input_tensor, attention_mask)
+        attention_output = self.output(self_output, input_tensor)
         return attention_output
@@ -388,13 +394,16 @@ class BertModel(nn.Module):
         if token_type_ids is None:
             token_type_ids = torch.zeros_like(input_ids)

-        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-        attention_mask = (1.0 - attention_mask) * -10000.0
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

         embedding_output = self.embeddings(input_ids, token_type_ids)
-        all_encoder_layers = self.encoder(embedding_output, attention_mask)
+        all_encoder_layers = self.encoder(embedding_output, extended_attention_mask)
         sequence_output = all_encoder_layers[-1]
         pooled_output = self.pooler(sequence_output)
+        # TODO DEbugging
+        # all_encoder_layers = [attention_mask, embeddings_sum, embedding_output] + all_encoder_layers
         return all_encoder_layers, pooled_output


 class BertForSequenceClassification(nn.Module):
......
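As a closing illustration (my own sketch, not code from the commit): after the rename, the caller's 1/0 `attention_mask` is no longer overwritten in place, and `forward` still returns the full list of encoder layers plus the pooled output that the extract_features.py hunk indexes into.

```python
import torch

# Caller-side 1/0 padding mask; after the rename it is left untouched by forward().
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=torch.float)
extended_attention_mask = (1.0 - attention_mask.unsqueeze(1).unsqueeze(2)) * -10000.0
print(extended_attention_mask.shape)  # torch.Size([2, 1, 1, 4]), broadcast over [B, N, F, T] scores

# Hypothetical stand-in for the return contract extract_features.py relies on:
# one [B, T, hidden] tensor per encoder layer, plus a [B, hidden] pooled vector.
num_layers, B, T, hidden = 12, 2, 4, 768
all_encoder_layers = [torch.randn(B, T, hidden) for _ in range(num_layers)]
sequence_output = all_encoder_layers[-1]   # what BertModel.forward pools from
print(sequence_output.shape)               # torch.Size([2, 4, 768])
```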