Unverified Commit 29294b0e authored by zspo, committed by GitHub

Fix tensor device while attention_mask is not None (#23538)

* Fix tensor device while attention_mask is not None

* Fix tensor device while attention_mask is not None
parent 12ec7f0c
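
The change is identical in all four attention implementations touched below (LlamaAttention, OpenLlamaAttention, OPTAttention, XGLMAttention): torch.tensor(scalar) allocates on the default device (normally CPU), so when attn_weights lives on a GPU, torch.max raises a device-mismatch RuntimeError; passing device=attn_weights.device creates the floor tensor alongside attn_weights. A minimal repro-and-fix sketch (illustrative only, not part of the commit; it assumes a CUDA device is available, since on a CPU-only machine both tensors share a device and the bug never surfaces):

import torch

device = "cuda"  # assumption: a CUDA device is present; on CPU-only machines the mismatch cannot occur
attn_weights = torch.randn(2, 4, device=device)

# Before the fix: the scalar tensor defaults to CPU, so the elementwise max raises
# "Expected all tensors to be on the same device" when attn_weights is on CUDA.
# floor = torch.tensor(torch.finfo(attn_weights.dtype).min)

# After the fix: the floor tensor is created on attn_weights' own device,
# so the comparison is always valid regardless of where the model runs.
floor = torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
attn_weights = torch.max(attn_weights, floor)
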
@@ -226,7 +226,9 @@ class LlamaAttention(nn.Module):
                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
             )
         attn_weights = attn_weights + attention_mask
-        attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
+        attn_weights = torch.max(
+            attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
+        )
         # upcast attention to fp32
         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
@@ -249,7 +249,9 @@ class OpenLlamaAttention(nn.Module):
                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
             )
         attn_weights = attn_weights + attention_mask
-        attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
+        attn_weights = torch.max(
+            attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
+        )
         # upcast attention to fp32
         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
@@ -222,7 +222,9 @@ class OPTAttention(nn.Module):
                 f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
             )
         attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
-        attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
+        attn_weights = torch.max(
+            attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
+        )
         attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
         # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437
@@ -315,7 +315,9 @@ class XGLMAttention(nn.Module):
                 f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
             )
         attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
-        attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
+        attn_weights = torch.max(
+            attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
+        )
         attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
         # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437
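
As a side note (an alternative not taken in this commit): torch.clamp accepts a plain Python number for its min bound, so the same flooring can be written without allocating a companion tensor, which sidesteps the device question entirely. A hedged sketch:

import torch

attn_weights = torch.randn(2, 4)

# clamp with a Python float needs no scalar tensor and therefore no device argument
attn_weights = torch.clamp(attn_weights, min=torch.finfo(attn_weights.dtype).min)
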