Unverified commit 003a7cc6 authored by bofeng huang, committed by GitHub

[Whisper] Fix feature normalization in `WhisperFeatureExtractor` (#21938)

Fix feature normalization in WhisperFeatureExtractor
parent 718e9d77
@@ -334,14 +334,8 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
             max_length=max_length if max_length else self.n_samples,
             truncation=truncation,
             pad_to_multiple_of=pad_to_multiple_of,
-            return_attention_mask=return_attention_mask,
+            return_attention_mask=return_attention_mask or do_normalize,
         )
-        # make sure list is in array format
-        input_features = padded_inputs.get("input_features").transpose(2, 0, 1)
-
-        if return_attention_mask:
-            # rescale from sample (48000) to feature (3000)
-            padded_inputs["attention_mask"] = padded_inputs["attention_mask"][:, :: self.hop_length]
 
         # zero-mean and unit-variance normalization
         if do_normalize:
@@ -350,6 +344,10 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
                 attention_mask=padded_inputs["attention_mask"],
                 padding_value=self.padding_value,
             )
+            padded_inputs["input_features"] = np.stack(padded_inputs["input_features"], axis=0)
+
+        # make sure list is in array format
+        input_features = padded_inputs.get("input_features").transpose(2, 0, 1)
 
         input_features = [self._np_extract_fbank_features(waveform) for waveform in input_features[0]]
...
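For context: before this change, `input_features` was transposed out of `padded_inputs` before the `do_normalize` branch ran, so the values normalized by `zero_mean_unit_var_norm` were never the ones fed to `_np_extract_fbank_features`, and the attention mask the normalizer needs could be missing when the caller did not set `return_attention_mask`. Below is a minimal sketch of mask-aware zero-mean, unit-variance normalization in the spirit of the `zero_mean_unit_var_norm` helper called in the hunk above, assuming 1-D float waveforms and a 0/1 sample-level mask; it illustrates the technique rather than reproducing the upstream implementation.

```python
import numpy as np

def zero_mean_unit_var_norm_sketch(input_values, attention_mask=None, padding_value=0.0):
    """Normalize each waveform to zero mean and unit variance, computing the
    statistics only over the non-padded samples indicated by attention_mask."""
    if attention_mask is not None:
        attention_mask = np.asarray(attention_mask, dtype=np.int32)
        normed = []
        for vector, length in zip(input_values, attention_mask.sum(-1)):
            # Statistics come from the real samples only, not the padding.
            normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
            # Keep the padded tail pinned at the padding value.
            if length < normed_slice.shape[0]:
                normed_slice[length:] = padding_value
            normed.append(normed_slice)
        return normed
    # Without a mask, fall back to normalizing over the full vectors.
    return [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
```

Because the statistics must exclude padding, the padder has to return the attention mask whenever normalization is requested, which is exactly what `return_attention_mask=return_attention_mask or do_normalize` guarantees.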
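A usage sketch of the fixed path (the checkpoint name and dummy waveform are illustrative): with `do_normalize=True`, the raw audio is now normalized before log-mel extraction even if the caller never asks for the attention mask.

```python
import numpy as np
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")

# Dummy 5-second mono waveform at 16 kHz; substitute real audio here.
waveform = np.random.randn(16000 * 5).astype(np.float32)

# do_normalize=True zero-means the raw audio (using the attention mask for
# padding) before the log-mel features are computed; after this fix the
# normalized values are actually the ones passed on to feature extraction.
inputs = feature_extractor(waveform, sampling_rate=16000, do_normalize=True)
print(np.asarray(inputs["input_features"]).shape)  # e.g. (1, 80, 3000)
```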