chenpangpang / transformers / Commits

Commit 1b925643, authored Oct 29, 2019 by Lysandre, committed by Lysandre Debut on Nov 26, 2019
parent 12290c0d

Reorganize and cleanup

Showing 1 changed file with 67 additions and 90 deletions:
transformers/modeling_albert.py (+67, -90)
@@ -100,77 +100,6 @@ class AlbertEmbeddings(BertEmbeddings):
         self.LayerNorm = torch.nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
 
-
-class AlbertModel(BertModel):
-    def __init__(self, config):
-        super(AlbertModel, self).__init__(config)
-
-        self.config = config
-        self.embeddings = AlbertEmbeddings(config)
-        self.encoder = AlbertTransformer(config)
-        self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
-        self.pooler_activation = nn.Tanh()
-
-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
-        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids)
-        if token_type_ids is None:
-            token_type_ids = torch.zeros_like(input_ids)
-
-        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.num_hidden_layers
-
-        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
-        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
-
-        sequence_output = encoder_outputs[0]
-        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))
-
-        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
-        return outputs
-
-
-class AlbertForMaskedLM(nn.Module):
-    def __init__(self, config):
-        super(AlbertForMaskedLM, self).__init__()
-
-        self.config = config
-        self.bert = AlbertModel(config)
-        self.LayerNorm = nn.LayerNorm(config.embedding_size)
-        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
-        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
-        self.word_embeddings = nn.Linear(config.embedding_size, config.vocab_size)
-
-    def tie_weights(self):
-        """ Make sure we are sharing the input and output embeddings.
-            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
-        """
-        self._tie_or_clone_weights(self.classifier.word_embeddings, self.transformer.embeddings.word_embeddings)
-
-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
-        hidden_states = self.bert(input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None)[0]
-        hidden_states = self.dense(hidden_states)
-        hidden_states = gelu_new(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states)
-        logits = self.word_embeddings(hidden_states)
-        return logits
-
-
 class AlbertAttention(BertSelfAttention):
     def __init__(self, config):
         super(AlbertAttention, self).__init__(config)
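The forward pass deleted here (and re-added lower in the file) builds an additive attention mask with (1.0 - mask) * -10000.0, so padded positions receive a large negative bias before the softmax. A minimal standalone sketch of that arithmetic, with made-up tensor values and no dependence on the rest of this file:

import torch

# Toy padding mask for 2 sequences of length 4 (1 = real token, 0 = padding).
attention_mask = torch.tensor([[1., 1., 1., 0.],
                               [1., 1., 0., 0.]])

# Broadcast over the head and query dimensions, as in AlbertModel.forward.
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)   # shape (2, 1, 1, 4)

# Real tokens map to 0.0 and padded positions to -10000.0; adding this to the raw
# attention scores drives padded positions toward zero weight after the softmax.
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
print(extended_attention_mask[0, 0, 0])   # zeros for real tokens, -10000. for the padded slot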
@@ -238,7 +167,9 @@ class AlbertAttention(BertSelfAttention):
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         reshaped_context_layer = context_layer.view(*new_context_layer_shape)
 
-        print(self.dense.weight.T.shape)
+        # Should find a better way to do this
         w = self.dense.weight.T.view(self.num_attention_heads, self.attention_head_size, self.hidden_size)
         b = self.dense.bias
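The hunk above reshapes the output projection's weight into per-head blocks (w has shape (num_attention_heads, attention_head_size, hidden_size)) so the dense projection can be applied head by head, e.g. via an einsum. The sketch below is not from the commit; it only checks, on random tensors with illustrative sizes, that such a per-head einsum matches the usual "concatenate heads, then nn.Linear" formulation:

import torch
from torch import nn

# Illustrative sizes only, not taken from any ALBERT config.
batch, seq_len, num_heads, head_size = 2, 5, 4, 8
hidden_size = num_heads * head_size

dense = nn.Linear(hidden_size, hidden_size)
context_layer = torch.randn(batch, seq_len, num_heads, head_size)

# Reshape the projection weight per head, mirroring w and b in the hunk above.
w = dense.weight.T.view(num_heads, head_size, hidden_size)
b = dense.bias

# Per-head projection via einsum ...
projected = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b

# ... equals flattening the heads and applying the linear layer directly.
flat = context_layer.reshape(batch, seq_len, hidden_size)
assert torch.allclose(projected, dense(flat), atol=1e-5)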
@@ -301,26 +232,72 @@ class AlbertTransformer(nn.Module):
         return (hidden_states,)
 
-# model_size = 'base'
-# hidden_groups = 1
-# inner_groups = 2
-# config = AlbertConfig.from_json_file("/home/hf/google-research/albert/config_{}-{}-hg-{}-ig.json".format(model_size, hidden_groups, inner_groups))
-# model = AlbertModel(config)
-# # print(model)
-# model = load_tf_weights_in_albert(model, config, "/home/hf/transformers/albert-{}-{}-hg-{}-ig/albert-{}-{}-hg-{}-ig".format(model_size, hidden_groups, inner_groups, model_size, hidden_groups, inner_groups))
-# # model.eval()
-# # print(sum(p.numel() for p in model.parameters() if p.requires_grad))
-# input_ids = [[31, 51, 99, 88, 54, 34, 23, 23, 12], [15, 5, 0, 88, 54, 34, 23, 23, 12]]
-# input_mask = [[1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0]]
-# segment_ids = [[0, 0, 1, 0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0]]
-# pt_input_ids = torch.tensor(input_ids)
-# pt_input_mask = torch.tensor(input_mask)
-# pt_segment_ids = torch.tensor(segment_ids)
-# pt_dict = {"input_ids": pt_input_ids, "attention_mask": pt_input_mask, "token_type_ids": pt_segment_ids}
-# pt_output = model(**pt_dict)
-# print(pt_output)
\ No newline at end of file
+
+class AlbertModel(BertModel):
+    def __init__(self, config):
+        super(AlbertModel, self).__init__(config)
+
+        self.config = config
+        self.embeddings = AlbertEmbeddings(config)
+        self.encoder = AlbertTransformer(config)
+        self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
+        self.pooler_activation = nn.Tanh()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to fload if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
+        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
+
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))
+
+        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        return outputs
+
+
+class AlbertForMaskedLM(nn.Module):
+    def __init__(self, config):
+        super(AlbertForMaskedLM, self).__init__()
+
+        self.config = config
+        self.bert = AlbertModel(config)
+        self.LayerNorm = nn.LayerNorm(config.embedding_size)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
+        self.word_embeddings = nn.Linear(config.embedding_size, config.vocab_size)
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.classifier.word_embeddings, self.transformer.embeddings.word_embeddings)
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+        hidden_states = self.bert(input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None)[0]
+        hidden_states = self.dense(hidden_states)
+        hidden_states = gelu_new(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        logits = self.word_embeddings(hidden_states)
+        return logits
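After the move, AlbertForMaskedLM's prediction head is a hidden_size -> embedding_size -> vocab_size chain with gelu_new and LayerNorm in between, and tie_weights is meant to share (or, for TorchScript export, clone) the output projection with the input word embeddings. A standalone shape-level sketch of that head, with illustrative sizes and torch.nn.functional.gelu standing in for gelu_new:

import torch
from torch import nn
import torch.nn.functional as F

# Illustrative sizes only; real ALBERT configs differ (ALBERT keeps embedding_size
# smaller than hidden_size, e.g. 128 vs. 768).
batch, seq_len, hidden_size, embedding_size, vocab_size = 2, 7, 64, 16, 100

dense = nn.Linear(hidden_size, embedding_size)
layer_norm = nn.LayerNorm(embedding_size)
word_embeddings = nn.Linear(embedding_size, vocab_size)

# Stand-in for AlbertModel's sequence_output.
hidden_states = torch.randn(batch, seq_len, hidden_size)

# Same chain as AlbertForMaskedLM.forward: dense -> activation -> LayerNorm -> vocab projection.
hidden_states = dense(hidden_states)       # (batch, seq_len, embedding_size)
hidden_states = F.gelu(hidden_states)      # the file uses gelu_new; F.gelu is a close stand-in
hidden_states = layer_norm(hidden_states)
logits = word_embeddings(hidden_states)    # (batch, seq_len, vocab_size)
print(logits.shape)                        # torch.Size([2, 7, 100])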