[pipeline] fix padding for 1-d tensors (#31776)

* [pipeline] fix padding for 1-d tensors
* add test
* make style
* Update tests/pipelines/test_pipelines_automatic_speech_recognition.py

  Co-authored-by: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
* Update tests/pipelines/test_pipelines_automatic_speech_recognition.py

---------

Co-authored-by: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>

[pipeline] fix padding for 1-d tensors (#31776)
* [pipeline] fix padding for 1-d tensors
* add test
* make style
* Update tests/pipelines/test_pipelines_automatic_speech_recognition.py

  Co-authored-by: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
* Update tests/pipelines/test_pipelines_automatic_speech_recognition.py

---------

Co-authored-by: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
7f5d644e · Sanchit Gandhi · GitHub · 3fbaaaa6 · 7f5d644e · 7f5d644e
Unverified Commit 7f5d644e authored Jul 29, 2024 by Sanchit Gandhi Committed by GitHub Jul 29, 2024
Showing with 20 additions and 0 deletions

src/transformers/pipelines/base.py src/transformers/pipelines/base.py +3 -0

tests/pipelines/test_pipelines_automatic_speech_recognition.py tests/pipelines/test_pipelines_automatic_speech_recognition.py +17 -0

No files found.
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -90,6 +90,9 @@ def _pad(items, key, padding_value, padding_side):
        # Others include `attention_mask` etc...
        shape = items[0][key].shape
        dim = len(shape)
+        if dim == 1:
+            # We have a list of 1-dim torch tensors, which can be stacked without padding
+            return torch.cat([item[key] for item in items], dim=0)
        if key in ["pixel_values", "image"]:
            # This is probable image so padding shouldn't be necessary
            # B, C, H, W

--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -549,6 +549,23 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
        output = speech_recognizer([filename], chunk_length_s=5, batch_size=4)
        self.assertEqual(output, [{"text": " A man said to the universe, Sir, I exist."}])
+    @require_torch
+    @slow
+    def test_torch_whisper_batched(self):
+        speech_recognizer = pipeline(
+            task="automatic-speech-recognition",
+            model="openai/whisper-tiny",
+            framework="pt",
+        )
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:2]")
+        EXPECTED_OUTPUT = [
+            {"text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."},
+            {"text": " Nor is Mr. Quilters' manner less interesting than his matter."},
+        ]
+        output = speech_recognizer(ds["audio"], batch_size=2)
+        self.assertEqual(output, EXPECTED_OUTPUT)
    @slow
    def test_find_longest_common_subsequence(self):
        max_source_positions = 1500