Commit 1b925643 authored by Lysandre's avatar Lysandre Committed by Lysandre Debut

Reorganize and cleanup

parent 12290c0d
@@ -100,77 +100,6 @@ class AlbertEmbeddings(BertEmbeddings):
        self.LayerNorm = torch.nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)


class AlbertModel(BertModel):
    def __init__(self, config):
        super(AlbertModel, self).__init__(config)

        self.config = config
        self.embeddings = AlbertEmbeddings(config)
        self.encoder = AlbertTransformer(config)
        self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
        self.pooler_activation = nn.Tanh()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
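        # The result is an additive mask: positions with attention_mask == 1 contribute
        # 0.0 and padded positions contribute -10000.0 when added to the raw attention
        # scores, so the softmax assigns them near-zero weight. The [batch, seq_len]
        # mask is broadcast as [batch, 1, 1, seq_len] across all heads and query positions.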
        if head_mask is not None:
            if head_mask.dim() == 1:
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to float if needed + fp16 compatibility
        else:
            head_mask = [None] * self.config.num_hidden_layers
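        # At this point head_mask either broadcasts to
        # [num_hidden_layers, batch, num_heads, seq_len, seq_len] or is a per-layer
        # list of Nones; 1.0 keeps a head's output, 0.0 masks it inside attention.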
        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
        encoder_outputs = self.encoder(embedding_output,
                                       extended_attention_mask,
                                       head_mask=head_mask)

        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))

        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
        return outputs


class AlbertForMaskedLM(nn.Module):
    def __init__(self, config):
        super(AlbertForMaskedLM, self).__init__()

        self.config = config
        self.bert = AlbertModel(config)
        self.LayerNorm = nn.LayerNorm(config.embedding_size)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.word_embeddings = nn.Linear(config.embedding_size, config.vocab_size)
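        # ALBERT's factorized embedding parameterization: dense maps hidden_size back
        # down to embedding_size, and word_embeddings maps embedding_size to vocab_size,
        # so the output projection can be tied to the input embedding matrix below.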
    def tie_weights(self):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
        self._tie_or_clone_weights(self.word_embeddings,
                                   self.bert.embeddings.word_embeddings)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        hidden_states = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                                  position_ids=position_ids, head_mask=head_mask)[0]

        hidden_states = self.dense(hidden_states)
        hidden_states = gelu_new(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)

        logits = self.word_embeddings(hidden_states)
        return logits


class AlbertAttention(BertSelfAttention):
    def __init__(self, config):
        super(AlbertAttention, self).__init__(config)
@@ -238,7 +167,9 @@ class AlbertAttention(BertSelfAttention):
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        reshaped_context_layer = context_layer.view(*new_context_layer_shape)

        # Should find a better way to do this
        w = self.dense.weight.T.view(self.num_attention_heads, self.attention_head_size, self.hidden_size)
        b = self.dense.bias
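        # Presumably this [num_heads, head_size, hidden_size] view of the dense weight
        # lets the per-head context be projected straight back to
        # [batch, seq, hidden_size] in one contraction; the actual use of w and b
        # falls outside this hunk, so that reading is an assumption.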
@@ -301,26 +232,72 @@ class AlbertTransformer(nn.Module):
        return (hidden_states,)
# model_size = 'base'
# hidden_groups = 1
# inner_groups = 2
# config = AlbertConfig.from_json_file("/home/hf/google-research/albert/config_{}-{}-hg-{}-ig.json".format(model_size, hidden_groups, inner_groups))
# model = AlbertModel(config)


class AlbertModel(BertModel):
    def __init__(self, config):
        super(AlbertModel, self).__init__(config)

        self.config = config
        self.embeddings = AlbertEmbeddings(config)
        self.encoder = AlbertTransformer(config)
        self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
        self.pooler_activation = nn.Tanh()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        if head_mask is not None:
            if head_mask.dim() == 1:
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to float if needed + fp16 compatibility
        else:
            head_mask = [None] * self.config.num_hidden_layers

        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
        encoder_outputs = self.encoder(embedding_output,
                                       extended_attention_mask,
                                       head_mask=head_mask)

        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))

        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
        return outputs


class AlbertForMaskedLM(nn.Module):
    def __init__(self, config):
        super(AlbertForMaskedLM, self).__init__()

# # print(model)
# model = load_tf_weights_in_albert(model, config, "/home/hf/transformers/albert-{}-{}-hg-{}-ig/albert-{}-{}-hg-{}-ig".format(model_size, hidden_groups, inner_groups, model_size, hidden_groups, inner_groups))
# # model.eval()
# # print(sum(p.numel() for p in model.parameters() if p.requires_grad))

        self.config = config
        self.bert = AlbertModel(config)
        self.LayerNorm = nn.LayerNorm(config.embedding_size)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.word_embeddings = nn.Linear(config.embedding_size, config.vocab_size)

    def tie_weights(self):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
        self._tie_or_clone_weights(self.word_embeddings,
                                   self.bert.embeddings.word_embeddings)
# input_ids = [[31, 51, 99, 88, 54, 34, 23, 23, 12], [15, 5, 0, 88, 54, 34, 23, 23, 12]]
# input_mask = [[1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0]]
# segment_ids = [[0, 0, 1, 0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0]]
    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        hidden_states = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                                  position_ids=position_ids, head_mask=head_mask)[0]

        hidden_states = self.dense(hidden_states)
        hidden_states = gelu_new(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)

# pt_input_ids = torch.tensor(input_ids)
# pt_input_mask = torch.tensor(input_mask)
# pt_segment_ids = torch.tensor(segment_ids)

        logits = self.word_embeddings(hidden_states)

# pt_dict = {"input_ids": pt_input_ids, "attention_mask": pt_input_mask, "token_type_ids": pt_segment_ids}
# pt_output = model(**pt_dict)
# print(pt_output)
\ No newline at end of file

        return logits
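
For orientation, here is a minimal usage sketch of the two reorganized classes; it is not part of the commit. It assumes this module is importable as modeling_albert and that AlbertConfig (referenced in the deleted scratch code above) accepts BERT-style keyword arguments such as vocab_size, embedding_size, and hidden_size. The dummy inputs mirror the commented-out scratch values removed by this commit.

# Minimal sketch, not from the commit. The import path and the AlbertConfig
# constructor arguments are assumptions.
import torch

from modeling_albert import AlbertConfig, AlbertModel, AlbertForMaskedLM  # hypothetical import path

config = AlbertConfig(vocab_size=30000, embedding_size=128, hidden_size=768)  # assumed signature

input_ids = torch.tensor([[31, 51, 99, 88, 54, 34, 23, 23, 12]])
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0]])

model = AlbertModel(config)
model.eval()
with torch.no_grad():
    sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)[:2]
    print(sequence_output.shape)  # torch.Size([1, 9, 768])

mlm = AlbertForMaskedLM(config)
mlm.eval()
with torch.no_grad():
    logits = mlm(input_ids, attention_mask=attention_mask)
    print(logits.shape)  # torch.Size([1, 9, 30000])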