Unverified commit aad41340, authored by Hongkun Yu and committed by GitHub

Merged commit includes the following changes: (#7324)

260601376 by hongkuny <hongkuny@google.com>:

    Reorder the Q, K einsum operands so attention runs faster on TPU.

--

PiperOrigin-RevId: 260601376
parent d65af7d8
@@ -365,7 +365,7 @@ class Attention(tf.keras.layers.Layer):
     Q:[BFNH] = einsum('BFD,DNH->BFNH', Input_tensor, Wq)
     K:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wk)
     V:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wv)
-    attention_scores:[BNFT] = einsum('BFNH,BTNH->BNFT', Q, K) / sqrt(H)
+    attention_scores:[BNFT] = einsum('BTNH,BFNH->BNFT', K, Q) / sqrt(H)
     attention_probs:[BNFT] = softmax(attention_scores)
     context_layer:[BFNH] = einsum('BNFT,BTNH->BFNH', attention_probs, V)
     Wout:[DNH]
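The docstring above maps directly onto tf.einsum calls. Below is a minimal sketch of the projection/attention flow it describes, with the reordered (K, Q) operands from this change; the dimension values B, F, T, N, H, D are hypothetical placeholders, not values taken from the model.

import math
import tensorflow as tf

# Hypothetical sizes: B=batch, F=query length, T=key length
# (self-attention, so F == T), D=hidden width, N=heads, H=size per head.
B, F, T, N, H = 2, 4, 4, 3, 8
D = N * H

input_tensor = tf.random.normal([B, F, D])
Wq = tf.random.normal([D, N, H])
Wk = tf.random.normal([D, N, H])
Wv = tf.random.normal([D, N, H])

q = tf.einsum("BFD,DNH->BFNH", input_tensor, Wq)  # Q:[BFNH]
k = tf.einsum("BTD,DNH->BTNH", input_tensor, Wk)  # K:[BTNH]
v = tf.einsum("BTD,DNH->BTNH", input_tensor, Wv)  # V:[BTNH]

# Attention scores with the reordered (K, Q) operands from this commit.
scores = tf.einsum("BTNH,BFNH->BNFT", k, q) / math.sqrt(float(H))
probs = tf.nn.softmax(scores, axis=-1)            # attention_probs:[BNFT]
context = tf.einsum("BNFT,BTNH->BFNH", probs, v)  # context_layer:[BFNH]
print(context.shape)  # (2, 4, 3, 8)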
@@ -433,7 +433,7 @@ class Attention(tf.keras.layers.Layer):
     # Take the dot product between "query" and "key" to get the raw
     # attention scores.
-    attention_scores = tf.einsum("BFNH,BTNH->BNFT", query_tensor, key_tensor)
+    attention_scores = tf.einsum("BTNH,BFNH->BNFT", key_tensor, query_tensor)
     attention_scores = tf.multiply(attention_scores,
                                    1.0 / math.sqrt(float(self.size_per_head)))
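Swapping the einsum operands changes only how the contraction is scheduled (the commit message reports this is faster on TPU), not the result. A quick sketch, using hypothetical shapes, that checks numerically that both orderings produce identical [B, N, F, T] scores:

import numpy as np
import tensorflow as tf

B, F, T, N, H = 2, 3, 5, 4, 8  # hypothetical dimensions
query_tensor = tf.random.normal([B, F, N, H])  # [BFNH]
key_tensor = tf.random.normal([B, T, N, H])    # [BTNH]

old_scores = tf.einsum("BFNH,BTNH->BNFT", query_tensor, key_tensor)
new_scores = tf.einsum("BTNH,BFNH->BNFT", key_tensor, query_tensor)
np.testing.assert_allclose(
    old_scores.numpy(), new_scores.numpy(), rtol=1e-5, atol=1e-5)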