chenpangpang / transformers · Commits

Commit 648d0deb (unverified)
Authored Mar 02, 2023 by Kashif Rasul, committed by GitHub on Mar 02, 2023
fix typo in Bart's attention (#21898)
parent c87654dc

Changes: 22 files in this commit (diff paginated); showing 2 changed files with 4 additions and 4 deletions (+4 −4) on this page:

- src/transformers/models/wav2vec2/modeling_wav2vec2.py (+2 −2)
- src/transformers/models/whisper/modeling_whisper.py (+2 −2)
src/transformers/models/wav2vec2/modeling_wav2vec2.py

```diff
@@ -621,7 +621,7 @@ class Wav2Vec2Attention(nn.Module):
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
             raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                 f" {attn_output.size()}"
             )
@@ -629,7 +629,7 @@ class Wav2Vec2Attention(nn.Module):
         attn_output = attn_output.transpose(1, 2)

         # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned aross GPUs when using tensor-parallelism.
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

         attn_output = self.out_proj(attn_output)
```
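Both files receive the identical two-line fix because `Wav2Vec2Attention` and `WhisperAttention` are copies of Bart's attention implementation, kept in sync by the repository's copy-consistency tooling; that is presumably why a commit titled against Bart's attention touches these files. The first hunk fixes only the error message of the output-shape check: the comparison is against `(bsz * self.num_heads, tgt_len, self.head_dim)`, but the old message interpolated `(bsz, self.num_heads, tgt_len, self.head_dim)`, so a failing check reported a misleading expected shape. Below is a minimal, self-contained sketch of the corrected check; this is not the transformers code itself, and the shape values are made up for illustration:

```python
import torch

# Illustrative shapes only (hypothetical values, not from the commit).
bsz, num_heads, tgt_len, head_dim = 2, 4, 5, 8

# After the attention bmm, the heads are folded into the batch dimension:
# attn_output has shape (bsz * num_heads, tgt_len, head_dim).
attn_output = torch.randn(bsz * num_heads, tgt_len, head_dim)

if attn_output.size() != (bsz * num_heads, tgt_len, head_dim):
    # The fix: interpolate the same tuple the check compares against,
    # i.e. bsz * num_heads, not (bsz, num_heads, ...).
    raise ValueError(
        f"`attn_output` should be of size {(bsz * num_heads, tgt_len, head_dim)}, but is"
        f" {attn_output.size()}"
    )
```

With matching shapes the check passes silently; with a mismatch, the message now names the shape that was actually expected.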
src/transformers/models/whisper/modeling_whisper.py

```diff
@@ -366,7 +366,7 @@ class WhisperAttention(nn.Module):
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
             raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                 f" {attn_output.size()}"
             )
@@ -374,7 +374,7 @@ class WhisperAttention(nn.Module):
         attn_output = attn_output.transpose(1, 2)

         # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned aross GPUs when using tensor-parallelism.
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

         attn_output = self.out_proj(attn_output)
```
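The second hunk in each file only corrects a comment ("aross" → "across"), but the step it documents is worth seeing end to end: the per-head outputs are merged back into `embed_dim = num_heads * head_dim`, and the reshape deliberately uses the module's configured `embed_dim` rather than a dimension read off `hidden_states`, which stays correct when the tensor is partitioned across GPUs under tensor parallelism. A minimal sketch of that recombination, assuming the same illustrative shapes as above:

```python
import torch

bsz, num_heads, tgt_len, head_dim = 2, 4, 5, 8
embed_dim = num_heads * head_dim  # stored on the attention module in transformers

attn_output = torch.randn(bsz * num_heads, tgt_len, head_dim)

# (bsz * num_heads, tgt_len, head_dim) -> (bsz, num_heads, tgt_len, head_dim)
attn_output = attn_output.view(bsz, num_heads, tgt_len, head_dim)
# -> (bsz, tgt_len, num_heads, head_dim)
attn_output = attn_output.transpose(1, 2)
# -> (bsz, tgt_len, embed_dim): the heads are concatenated along the last axis
attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

assert attn_output.shape == (bsz, tgt_len, embed_dim)
```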