bug fix

7d4ad51e · Vijay Korthikanti · 37ae6646 · 7d4ad51e
Commit 7d4ad51e authored Sep 29, 2020 by Vijay Korthikanti
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 9 deletions

megatron/model/transformer.py megatron/model/transformer.py +9 -9

No files found.
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -197,23 +197,23 @@ class ParallelSelfAttention(MegatronModule):
        # Query, Key, and Value
        # =====================

-        # Attention heads [s, b, hp] --> [s, b, 3 * hp]
+        # Attention heads [s, b, hp] --> [s, b, hp * 3]
        mixed_x_layer, _ = self.query_key_value(hidden_states)
 
        if self.old_checkpoint_format:
-            self._transpose_last_dim(mixed_x_layer)
+            # [s, b, 3 * hp] --> [s, b, hp * 3]
+            mixed_x_layer = self._transpose_last_dim(mixed_x_layer)

-        # [s, b, 3 * hp] --> [s, b, np, 3 * hn]  
+        # [s, b, hp * 3] --> [s, b, np, hn, 3]  
        new_tensor_shape = mixed_x_layer.size()[:-1] + \
            (self.num_attention_heads_per_partition,
-             3 * self.hidden_size_per_attention_head)
+             self.hidden_size_per_attention_head, 3)
        mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)

-        # [s, b, np, 3 * hn] --> 3 [s, b, np, hn]
-        (query_layer,
-         key_layer,
-         value_layer) = mpu.split_tensor_along_last_dim(mixed_x_layer, 3)
-
+        # [s, b, np, hn, 3] --> 3 [s, b, np, hn]
+        query_layer = mixed_x_layer[:,:,:,:,0]
+        key_layer = mixed_x_layer[:,:,:,:,1]
+        value_layer = mixed_x_layer[:,:,:,:,2]

        # ==================================
        # Adjust key and value for inference