"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "f8208fa456039b46873a2e497b6318d30a4fc84e"
Unverified commit 734b7e2a, authored by Had, committed by GitHub

Mask T5 relative position bias when heads are pruned (#17968)



* add position bias head masking if heads pruned

* fix pruning function in t5 encoder

* make style

* make fix-copies

* Revert added folder
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
parent d4dbd7ca
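For context, this change makes head pruning usable on T5/LongT5 encoders: before it, prune_heads shrank the query/key/value/output projections but left the relative position bias with one row per original head, so adding the bias to the attention scores failed with a shape mismatch. A minimal end-to-end sketch of the patched behavior, assuming a transformers build that contains this commit and the "t5-small" checkpoint (checkpoint, layer, and head choices are illustrative):

import torch
from transformers import AutoTokenizer, T5EncoderModel

model = T5EncoderModel.from_pretrained("t5-small")   # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Prune heads 0 and 2 of layer 0's self-attention; the dict maps {layer: [head indices]}.
model.prune_heads({0: [0, 2]})

inputs = tokenizer("Pruning heads should not break the forward pass.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)   # raised a size-mismatch error before this patch

print(outputs.last_hidden_state.shape)   # (1, seq_len, 512) for t5-small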
@@ -518,7 +518,14 @@ class LongT5Attention(nn.Module):
             if mask is not None:
                 position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)
 
-        scores += position_bias
+        if self.pruned_heads:
+            mask = torch.ones(position_bias.shape[1])
+            mask[list(self.pruned_heads)] = 0
+            position_bias_masked = position_bias[:, mask.bool()]
+        else:
+            position_bias_masked = position_bias
+
+        scores += position_bias_masked
         attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(
             scores
         )  # (batch_size, n_heads, seq_length, key_length)
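The masking step introduced above can be reproduced in isolation. A standalone sketch with illustrative shapes (no model required), showing how the boolean mask built from pruned_heads drops the bias rows of pruned heads so the bias matches the reduced scores tensor:

import torch

n_heads, q_len, k_len = 8, 5, 5
pruned_heads = {1, 4}                        # heads that prune_heads() removed
position_bias = torch.randn(1, n_heads, q_len, k_len)

mask = torch.ones(position_bias.shape[1])    # one entry per original head
mask[list(pruned_heads)] = 0
position_bias_masked = position_bias[:, mask.bool()]

print(position_bias_masked.shape)            # torch.Size([1, 6, 5, 5]), matching scores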
@@ -528,7 +528,14 @@ class T5Attention(nn.Module):
             if mask is not None:
                 position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)
 
-        scores += position_bias
+        if self.pruned_heads:
+            mask = torch.ones(position_bias.shape[1])
+            mask[list(self.pruned_heads)] = 0
+            position_bias_masked = position_bias[:, mask.bool()]
+        else:
+            position_bias_masked = position_bias
+
+        scores += position_bias_masked
         attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(
             scores
         )  # (batch_size, n_heads, seq_length, key_length)
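This second hunk mirrors the same logic into T5Attention (kept in sync via make fix-copies). The effect can be checked directly at the module level; a sketch assuming a transformers build with this commit, using a deliberately small, illustrative T5Config:

import torch
from transformers import T5Config
from transformers.models.t5.modeling_t5 import T5Attention

config = T5Config(d_model=64, d_kv=8, num_heads=8)           # illustrative sizes
attn = T5Attention(config, has_relative_attention_bias=True)
attn.prune_heads([0, 3])                                      # 8 heads -> 6 heads

hidden_states = torch.randn(1, 5, config.d_model)
out = attn(hidden_states)   # works because the 8-head bias is masked down to 6 heads
print(out[0].shape)         # torch.Size([1, 5, 64])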
@@ -1802,7 +1809,7 @@ class T5EncoderModel(T5PreTrainedModel):
         class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
-            self.encoder.layer[layer].attention.prune_heads(heads)
+            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)
 
     @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
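The encoder fix is needed because T5EncoderModel has no encoder.layer attribute; its self-attention modules live at encoder.block[layer].layer[0].SelfAttention, which is what the corrected line now reaches. A quick check of that path, again assuming the illustrative "t5-small" checkpoint and arbitrary layer/head indices:

from transformers import T5EncoderModel

model = T5EncoderModel.from_pretrained("t5-small")
model.prune_heads({0: [0, 1], 2: [5]})       # {layer index: [head indices]}

# prune_heads() dispatches to the fixed _prune_heads(), which resolves this module path:
attn = model.encoder.block[0].layer[0].SelfAttention
print(attn.n_heads, attn.pruned_heads)       # 6 {0, 1} (t5-small starts with 8 heads)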