"examples/vscode:/vscode.git/clone" did not exist on "6f72e71f97fd7bd114b10b4322e72ecbda283f3b"
Unverified commit 7c4999e4 authored by Prathik Rao, committed by GitHub

t5 remove data dependency (#22097)

* t5 remove data dependency
* make style
* make fix-copies

---------
Co-authored-by: Prathik Rao <prathikrao@microsoft.com>
parent 16121bae
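The change is identical in every hunk below: the Python-level branch on `torch.isinf(hidden_states).any()` becomes a `torch.where`, so the inf check stays a tensor op inside the graph instead of data-dependent control flow. A Python `if` that reads a tensor value forces a host sync, and `torch.jit.trace`/ONNX export bake in whichever branch the example input happened to take. A minimal sketch of the before/after pattern (the helper names are illustrative, not from the repo):

```python
import torch

def clamp_fp16_branchy(hidden_states: torch.Tensor) -> torch.Tensor:
    # Pre-patch pattern: the Python `if` reads a tensor value, so control
    # flow depends on the data. torch.jit.trace records only the branch
    # taken for the example input.
    if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
        clamp_value = torch.finfo(hidden_states.dtype).max - 1000
        hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
    return hidden_states

def clamp_fp16_branchless(hidden_states: torch.Tensor) -> torch.Tensor:
    # Post-patch pattern: the dtype check is static (safe to branch on in
    # Python), and the isinf check moves into a torch.where that stays in
    # the traced graph, selecting the clamp bound as a tensor value.
    if hidden_states.dtype == torch.float16:
        clamp_value = torch.where(
            torch.isinf(hidden_states).any(),
            torch.finfo(hidden_states.dtype).max - 1000,
            torch.finfo(hidden_states.dtype).max,
        )
        hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
    return hidden_states
```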
@@ -566,8 +566,12 @@ class MT5Block(nn.Module):
         attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights

         # clamp inf values to enable fp16 training
-        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+        if hidden_states.dtype == torch.float16:
+            clamp_value = torch.where(
+                torch.isinf(hidden_states).any(),
+                torch.finfo(hidden_states.dtype).max - 1000,
+                torch.finfo(hidden_states.dtype).max,
+            )
             hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

         do_cross_attention = self.is_decoder and encoder_hidden_states is not None
@@ -593,8 +597,12 @@ class MT5Block(nn.Module):
             hidden_states = cross_attention_outputs[0]

             # clamp inf values to enable fp16 training
-            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            if hidden_states.dtype == torch.float16:
+                clamp_value = torch.where(
+                    torch.isinf(hidden_states).any(),
+                    torch.finfo(hidden_states.dtype).max - 1000,
+                    torch.finfo(hidden_states.dtype).max,
+                )
                 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

             # Combine self attn and cross attn key value states
@@ -608,8 +616,12 @@ class MT5Block(nn.Module):
         hidden_states = self.layer[-1](hidden_states)

         # clamp inf values to enable fp16 training
-        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+        if hidden_states.dtype == torch.float16:
+            clamp_value = torch.where(
+                torch.isinf(hidden_states).any(),
+                torch.finfo(hidden_states.dtype).max - 1000,
+                torch.finfo(hidden_states.dtype).max,
+            )
             hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

         outputs = (hidden_states,)
@@ -703,8 +703,12 @@ class T5Block(nn.Module):
         attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights

         # clamp inf values to enable fp16 training
-        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+        if hidden_states.dtype == torch.float16:
+            clamp_value = torch.where(
+                torch.isinf(hidden_states).any(),
+                torch.finfo(hidden_states.dtype).max - 1000,
+                torch.finfo(hidden_states.dtype).max,
+            )
             hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

         do_cross_attention = self.is_decoder and encoder_hidden_states is not None
@@ -730,8 +734,12 @@ class T5Block(nn.Module):
             hidden_states = cross_attention_outputs[0]

             # clamp inf values to enable fp16 training
-            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            if hidden_states.dtype == torch.float16:
+                clamp_value = torch.where(
+                    torch.isinf(hidden_states).any(),
+                    torch.finfo(hidden_states.dtype).max - 1000,
+                    torch.finfo(hidden_states.dtype).max,
+                )
                 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

             # Combine self attn and cross attn key value states
@@ -745,8 +753,12 @@ class T5Block(nn.Module):
         hidden_states = self.layer[-1](hidden_states)

         # clamp inf values to enable fp16 training
-        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+        if hidden_states.dtype == torch.float16:
+            clamp_value = torch.where(
+                torch.isinf(hidden_states).any(),
+                torch.finfo(hidden_states.dtype).max - 1000,
+                torch.finfo(hidden_states.dtype).max,
+            )
             hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

         outputs = (hidden_states,)
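As a hedged illustration (not part of the commit), the branchless variant from the sketch above traces cleanly; with the old pattern, torch.jit.trace would warn about converting a tensor to a Python boolean and freeze whichever branch the example input took:

```python
# Illustrative only: trace the branchless helper with an inf-free input.
x = torch.randn(2, 8, dtype=torch.float16)
traced = torch.jit.trace(clamp_fp16_branchless, (x,))

# Because the isinf check is a tensor op inside the graph, the same
# traced function also clamps inputs that do contain infs.
has_inf = torch.full((2, 8), float("inf"), dtype=torch.float16)
assert torch.isfinite(traced(has_inf)).all()
```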