Unverified Commit 521da651 authored by Jingya HUANG, committed by GitHub

Fix gpt2 fp16 training when tracing is enabled (#20656)

* ONNX tracing fix

* Remove conditional
parent 93b54368
@@ -186,7 +186,7 @@ class DecisionTransformerGPT2Attention(nn.Module):
             # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
             # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
             mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
-            attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+            attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)
 
         if attention_mask is not None:
             # Apply the attention mask
@@ -198,7 +198,7 @@ class GPT2Attention(nn.Module):
             # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
             # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
             mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
-            attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+            attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)
 
         if attention_mask is not None:
             # Apply the attention mask
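For context, here is a minimal standalone sketch of the masking step patched above (the helper name `masked_attn_scores` and the tensor shapes are illustrative, not part of the PR). In eager mode the added `attn_weights.to(attn_weights.dtype)` is a no-op; presumably the cast matters once the module is traced, where it keeps both `torch.where` operands in a consistent dtype so fp16 training on the traced graph does not hit a float/half mismatch.

```python
import torch

def masked_attn_scores(attn_weights: torch.Tensor, causal_mask: torch.Tensor) -> torch.Tensor:
    """Sketch of the causal-masking step only, not the full _attn method."""
    # Smallest representable value in the scores' dtype, built as a 0-d tensor on the
    # same device/dtype, mirroring the comments in the diff above.
    mask_value = torch.finfo(attn_weights.dtype).min
    mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
    # `.to(attn_weights.dtype)` is a no-op in eager mode; under tracing it pins the dtype
    # of the first `torch.where` branch so it matches `mask_value` when running in fp16.
    return torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)

if __name__ == "__main__":
    scores = torch.randn(1, 1, 4, 4, dtype=torch.float16)
    causal = torch.tril(torch.ones(4, 4, dtype=torch.bool)).view(1, 1, 4, 4)
    print(masked_attn_scores(scores, causal).dtype)  # torch.float16
```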