Commit 51261167 authored by Rémi Louf

prune both attention and self-attention heads

parent 17177e73
@@ -633,7 +633,7 @@ class BertModel(BertPreTrainedModel):
             See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
-            self.encoder.layer[layer].attention.prune_heads(heads)
+            self.encoder.layer[layer].self_attention.prune_heads(heads)
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if attention_mask is None:
@@ -736,7 +736,8 @@ class BertDecoderModel(BertPreTrainedModel):
             See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
-            self.encoder.layer[layer].attention.prune_heads(heads)
+            self.decoder.layer[layer].attention.prune_heads(heads)
+            self.decoder.layer[layer].self_attention.prune_heads(heads)
 
     def forward(self, input_ids, encoder_outputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if attention_mask is None:
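For context on what these loops receive: heads_to_prune maps a layer index to the head indices to drop in that layer, and after this commit each decoder layer prunes the same heads in both its self-attention block and its attention block over the encoder outputs, which is what the commit message refers to. Below is a minimal usage sketch, assuming the branch keeps the prune_heads entry point inherited from PreTrainedModel (the base class the docstrings above point to); the checkpoint name and head indices are illustrative, not taken from the commit.

from transformers import BertModel

# Load a stock encoder. prune_heads is inherited from PreTrainedModel.
model = BertModel.from_pretrained("bert-base-uncased")

# Maps layer index -> list of head indices to prune in that layer
# (illustrative values, not taken from the commit).
heads_to_prune = {0: [0, 2], 5: [7]}

# With this commit, each entry reaches
#   self.encoder.layer[0].self_attention.prune_heads([0, 2])
#   self.encoder.layer[5].self_attention.prune_heads([7])
# and a BertDecoderModel from this branch would additionally prune the same
# heads in each listed layer's attention over the encoder outputs.
model.prune_heads(heads_to_prune)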