Unverified commit cb47293e authored by Shanmugam Ramasamy, committed by GitHub

Patching clip model to create mask tensor on the device (#22711)



* Patching clip model to create mask tensor on the device

* Addressing PR's comments

* Addressing PR's comments

* Addressing PR's comments

---------
Co-authored-by: Shanmugam Ramasamy <shanmugamr@shanmugamr-mlt.client.nvidia.com>
parent 2da73f63
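
For context: previously the causal mask was allocated on the CPU and then moved with .to(hidden_states.device); the patch passes the device straight to torch.empty so the mask is created where it is needed, skipping the extra host allocation and copy. A minimal standalone sketch of the new pattern (the free function build_causal_attention_mask below is illustrative, not the exact transformers code):

import torch

def build_causal_attention_mask(bsz, seq_len, dtype, device=None):
    # Allocate the mask directly on the target device instead of on the CPU.
    mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype, device=device)
    mask.fill_(torch.finfo(dtype).min)  # additive mask: large negative value everywhere
    mask.triu_(1)  # keep the negative values only above the diagonal (future positions); zero the rest
    return mask.unsqueeze(1)  # add a broadcastable head dimension

device = "cuda" if torch.cuda.is_available() else "cpu"
mask = build_causal_attention_mask(2, 8, torch.float32, device=device)
print(mask.shape, mask.device)  # torch.Size([2, 1, 8, 8]) on the chosen device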
@@ -714,8 +714,8 @@ class CLIPTextTransformer(nn.Module):
         bsz, seq_len = input_shape
         # CLIP's text model uses causal mask, prepare it here.
         # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
-        causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
-            hidden_states.device
+        causal_attention_mask = self._build_causal_attention_mask(
+            bsz, seq_len, hidden_states.dtype, device=hidden_states.device
         )
         # expand attention_mask
         if attention_mask is not None:
@@ -752,11 +752,11 @@ class CLIPTextTransformer(nn.Module):
             attentions=encoder_outputs.attentions,
         )

-    def _build_causal_attention_mask(self, bsz, seq_len, dtype):
+    def _build_causal_attention_mask(self, bsz, seq_len, dtype, device=None):
         # lazily create causal attention mask, with full attention between the vision tokens
         # pytorch uses additive attention mask; fill with -inf
-        mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
-        mask.fill_(torch.tensor(torch.finfo(dtype).min))
+        mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype, device=device)
+        mask.fill_(torch.finfo(dtype).min)
         mask.triu_(1)  # zero out the lower diagonal
         mask = mask.unsqueeze(1)  # expand mask
         return mask
...
@@ -726,8 +726,8 @@ class CLIPSegTextTransformer(nn.Module):
         bsz, seq_len = input_shape
         # CLIPSeg's text model uses causal mask, prepare it here.
         # https://github.com/openai/CLIPSeg/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clipseg/model.py#L324
-        causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
-            hidden_states.device
+        causal_attention_mask = self._build_causal_attention_mask(
+            bsz, seq_len, hidden_states.dtype, device=hidden_states.device
         )
         # expand attention_mask
         if attention_mask is not None:
@@ -764,11 +764,11 @@ class CLIPSegTextTransformer(nn.Module):
             attentions=encoder_outputs.attentions,
         )

-    def _build_causal_attention_mask(self, bsz, seq_len, dtype):
+    def _build_causal_attention_mask(self, bsz, seq_len, dtype, device=None):
         # lazily create causal attention mask, with full attention between the vision tokens
         # pytorch uses additive attention mask; fill with -inf
-        mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
-        mask.fill_(torch.tensor(torch.finfo(dtype).min))
+        mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype, device=device)
+        mask.fill_(torch.finfo(dtype).min)
         mask.triu_(1)  # zero out the lower diagonal
         mask = mask.unsqueeze(1)  # expand mask
         return mask
...
@@ -1108,8 +1108,8 @@ class GroupViTTextTransformer(nn.Module):
         bsz, seq_len = input_shape
         # CLIP's text model uses causal mask, prepare it here.
         # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
-        causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
-            hidden_states.device
+        causal_attention_mask = self._build_causal_attention_mask(
+            bsz, seq_len, hidden_states.dtype, device=hidden_states.device
         )
         # expand attention_mask
         if attention_mask is not None:
@@ -1146,11 +1146,11 @@ class GroupViTTextTransformer(nn.Module):
             attentions=encoder_outputs.attentions,
         )

-    def _build_causal_attention_mask(self, bsz, seq_len, dtype):
+    def _build_causal_attention_mask(self, bsz, seq_len, dtype, device=None):
         # lazily create causal attention mask, with full attention between the vision tokens
         # pytorch uses additive attention mask; fill with -inf
-        mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
-        mask.fill_(torch.tensor(torch.finfo(dtype).min))
+        mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype, device=device)
+        mask.fill_(torch.finfo(dtype).min)
         mask.triu_(1)  # zero out the lower diagonal
         mask = mask.unsqueeze(1)  # expand mask
         return mask
...
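
A secondary cleanup repeated in each hunk: mask.fill_(torch.tensor(torch.finfo(dtype).min)) becomes mask.fill_(torch.finfo(dtype).min). Tensor.fill_ accepts a plain Python number, so wrapping the value in torch.tensor(...) only built a throwaway CPU scalar tensor on every call. A small sketch checking the two forms agree (illustrative shapes and dtype):

import torch

dtype = torch.float16
old = torch.empty(2, 4, 4, dtype=dtype)
new = torch.empty(2, 4, 4, dtype=dtype)

old.fill_(torch.tensor(torch.finfo(dtype).min))  # old form: creates an extra scalar tensor first
new.fill_(torch.finfo(dtype).min)                # new form: fills from the Python float directly

assert torch.equal(old, new)  # same values, one fewer allocation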