Unverified Commit 2cf87e2b authored by Nora Belrose, committed by GitHub

Prevent Dynamo graph fragmentation in GPTNeoX with torch.baddbmm fix (#24941)



* Pass a Python scalar for alpha in torch.baddbmm

* fixup

---------
Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>
parent b413e061
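
For context, a minimal sketch (not part of this patch, shapes and the `fullgraph=True` check are illustrative assumptions) of the pattern the commit adopts: `alpha` in `torch.baddbmm` is passed as a plain Python float instead of a 0-dim tensor, so TorchDynamo never has to convert a tensor to a Python number mid-trace.

```python
import torch

batch, heads, q_len, k_len, head_size = 2, 4, 8, 8, 16

query = torch.randn(batch * heads, q_len, head_size)
key = torch.randn(batch * heads, k_len, head_size)
bias = torch.zeros(batch * heads, q_len, k_len)

norm_factor = head_size**-0.5  # plain Python float, as in the patched __init__

def attn_scores(query, key, bias):
    # bias + alpha * (query @ key.transpose(1, 2)); beta scales the bias term
    return torch.baddbmm(bias, query, key.transpose(1, 2), beta=1.0, alpha=norm_factor)

# fullgraph=True makes torch.compile raise if Dynamo has to split the graph,
# which is what the tensor-valued alpha previously forced it to do.
compiled = torch.compile(attn_scores, fullgraph=True)
print(compiled(query, key, bias).shape)  # torch.Size([8, 8, 8])
```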
@@ -100,11 +100,7 @@ class GPTNeoXAttention(nn.Module):
         self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)
         self._init_rope()
-        self.register_buffer(
-            "norm_factor",
-            torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype()),
-            persistent=False,
-        )
+        self.norm_factor = self.head_size**-0.5
         self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size)
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.attention_dropout = nn.Dropout(config.attention_dropout)
@@ -258,7 +254,7 @@ class GPTNeoXAttention(nn.Module):
             query,
             key.transpose(1, 2),
             beta=1.0,
-            alpha=(torch.tensor(1.0, dtype=self.norm_factor.dtype, device=self.norm_factor.device) / self.norm_factor),
+            alpha=self.norm_factor,
         )
         attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length)