Unverified Commit c965d302 authored by bofeng huang, committed by GitHub

Fix comments for `_merge_heads` (#24855)

* Fix comments

* Fix comments
parent e4a52b6a
@@ -253,7 +253,7 @@ class BloomAttention(nn.Module):
     def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
         """
-        Merge heads together over the last dimenstion
+        Merge heads together over the last dimension
         Args:
             x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
@@ -344,7 +344,7 @@ class BloomAttention(nn.Module):
         # matmul: [batch_size * num_heads, q_length, head_dim]
         context_layer = torch.bmm(attention_probs_reshaped, value_layer)
-        # change view [batch_size, num_heads, q_length, head_dim]
+        # change view [batch_size, q_length, num_heads * head_dim]
         context_layer = self._merge_heads(context_layer)
         # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232
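To see why the corrected comment reads [batch_size, q_length, num_heads * head_dim], here is a minimal, self-contained sketch of the reshape that `_merge_heads` performs. It is written as a free function with `num_heads` and `head_dim` passed explicitly (in the model they are module attributes), so it illustrates the shape bookkeeping rather than reproducing the exact library code.

```python
import torch


def merge_heads(x: torch.Tensor, num_heads: int, head_dim: int) -> torch.Tensor:
    """Merge heads together over the last dimension.

    Input:  [batch_size * num_heads, seq_length, head_dim]
    Output: [batch_size, seq_length, num_heads * head_dim]
    """
    batch_size_and_num_heads, seq_length, _ = x.shape
    batch_size = batch_size_and_num_heads // num_heads
    # Recover an explicit head axis: [batch_size, num_heads, seq_length, head_dim]
    x = x.view(batch_size, num_heads, seq_length, head_dim)
    # Move the head axis next to head_dim: [batch_size, seq_length, num_heads, head_dim]
    x = x.permute(0, 2, 1, 3)
    # Fold the heads into the hidden dimension: [batch_size, seq_length, num_heads * head_dim]
    return x.reshape(batch_size, seq_length, num_heads * head_dim)


# Toy shape check mirroring the Bloom hunk above: the bmm output is
# [batch_size * num_heads, q_length, head_dim], and the merged result is
# [batch_size, q_length, num_heads * head_dim] -- the corrected comment.
batch_size, num_heads, q_length, head_dim = 2, 4, 5, 8
context_layer = torch.rand(batch_size * num_heads, q_length, head_dim)
assert merge_heads(context_layer, num_heads, head_dim).shape == (
    batch_size,
    q_length,
    num_heads * head_dim,
)
```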
@@ -255,7 +255,7 @@ class FalconAttention(nn.Module):
     # Copied from transformers.models.bloom.modeling_bloom.BloomAttention._merge_heads
     def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
         """
-        Merge heads together over the last dimenstion
+        Merge heads together over the last dimension
         Args:
             x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
@@ -384,7 +384,7 @@ class FalconAttention(nn.Module):
         # matmul: [batch_size * num_heads, q_length, head_dim]
         context_layer = (attention_probs_reshaped @ value_layer_).flatten(0, 1)
-        # change view [batch_size, num_heads, q_length, head_dim]
+        # change view [batch_size, q_length, num_heads * head_dim]
         context_layer = self._merge_heads(context_layer)
         output_tensor = self.dense(context_layer)
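The Falcon hunk reaches the same layout by a slightly different route: a 4-D matmul followed by `flatten(0, 1)`. The sketch below (toy sizes and illustrative tensor names, not the model's actual variables) checks that this also hands `_merge_heads` a [batch_size * num_heads, q_length, head_dim] tensor, so the corrected comment applies there as well.

```python
import torch

batch_size, num_heads, q_length, kv_length, head_dim = 2, 4, 5, 6, 8

# Toy attention probabilities and values, kept with an explicit head axis.
attention_probs_reshaped = torch.rand(batch_size, num_heads, q_length, kv_length)
value_layer_ = torch.rand(batch_size, num_heads, kv_length, head_dim)

# 4-D matmul, then fold batch and heads together:
# [batch_size * num_heads, q_length, head_dim]
context_layer = (attention_probs_reshaped @ value_layer_).flatten(0, 1)
assert context_layer.shape == (batch_size * num_heads, q_length, head_dim)

# The same merge as in the Bloom sketch above:
# [batch_size, q_length, num_heads * head_dim], matching the fixed comment.
merged = (
    context_layer.view(batch_size, num_heads, q_length, head_dim)
    .permute(0, 2, 1, 3)
    .reshape(batch_size, q_length, num_heads * head_dim)
)
assert merged.shape == (batch_size, q_length, num_heads * head_dim)
```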