Unverified Commit 2b760996 authored by Aryan, committed by GitHub

[refactor] apply qk norm in attention processors (#9071)

* apply qk norm in attention processors

* revert attention processor

* qk-norm in only attention proc 2.0 and fused variant
parent 4f0d01d3
@@ -1785,6 +1785,11 @@ class AttnProcessor2_0:
         key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
         value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
 
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1
         hidden_states = F.scaled_dot_product_attention(
@@ -2314,6 +2319,11 @@ class FusedAttnProcessor2_0:
         key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
         value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
 
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1
         hidden_states = F.scaled_dot_product_attention(
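For reference, here is a minimal, self-contained sketch of the pattern both hunks apply: normalize the per-head query and key tensors (QK norm) after the (batch, heads, seq_len, head_dim) reshape and just before scaled dot-product attention. The SketchAttention module and its constructor are illustrative assumptions, not the diffusers Attention class; only the norm_q/norm_k attribute names and their placement mirror the diff, and LayerNorm over head_dim is just one plausible choice of norm.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SketchAttention(nn.Module):
    # Illustrative stand-in for an attention module with optional QK norm;
    # not the diffusers Attention implementation.
    def __init__(self, dim: int, heads: int = 8, qk_norm: bool = True):
        super().__init__()
        self.heads = heads
        self.head_dim = dim // heads
        self.to_q = nn.Linear(dim, dim)
        self.to_k = nn.Linear(dim, dim)
        self.to_v = nn.Linear(dim, dim)
        # LayerNorm over head_dim is an assumption here, not taken from the diff
        self.norm_q = nn.LayerNorm(self.head_dim) if qk_norm else None
        self.norm_k = nn.LayerNorm(self.head_dim) if qk_norm else None

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, _ = hidden_states.shape
        query = self.to_q(hidden_states)
        key = self.to_k(hidden_states)
        value = self.to_v(hidden_states)

        # reshape to (batch, heads, seq_len, head_dim), as in the processors above
        query = query.view(batch_size, -1, self.heads, self.head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, self.heads, self.head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, self.heads, self.head_dim).transpose(1, 2)

        # QK norm, placed where the diff puts it: after the per-head reshape,
        # before scaled_dot_product_attention
        if self.norm_q is not None:
            query = self.norm_q(query)
        if self.norm_k is not None:
            key = self.norm_k(key)

        out = F.scaled_dot_product_attention(query, key, value)
        return out.transpose(1, 2).reshape(batch_size, seq_len, -1)

x = torch.randn(2, 16, 64)
print(SketchAttention(dim=64)(x).shape)  # torch.Size([2, 16, 64])

Because the norm runs after the transpose to (batch, heads, seq_len, head_dim), each head's queries and keys are normalized independently over head_dim, which is why the change lives inside the processors rather than on the flat q/k projections.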