# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import pytest

from transformers import (
    MODEL_FOR_CTC_MAPPING,
    MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
    AutoFeatureExtractor,
    AutoTokenizer,
    Speech2TextForConditionalGeneration,
    Wav2Vec2ForCTC,
)
from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline
from transformers.testing_utils import (
    is_pipeline_test,
    require_datasets,
    require_tf,
    require_torch,
    require_torchaudio,
    slow,
)

from .test_pipelines_common import ANY, PipelineTestCaseMeta


# We can't use this mixin because it assumes TF support.
# from .test_pipelines_common import CustomInputPipelineCommonMixin


@is_pipeline_test
class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
    """Tests for the ``automatic-speech-recognition`` pipeline.

    ``model_mapping``, ``get_test_pipeline`` and ``run_pipeline_test`` are hooks for
    ``PipelineTestCaseMeta`` (presumably it generates tests from the mapping -- confirm in
    ``test_pipelines_common``). The remaining ``test_*`` methods check specific checkpoints
    against fixed expected transcriptions.
    """

    # Union of the speech seq2seq and CTC model mappings. Both sides are converted to lists
    # before concatenation so this works whether ``items()`` returns a list or a view.
    model_mapping = {
        k: v
        for k, v in (list(MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING.items()) if MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING else [])
        + (list(MODEL_FOR_CTC_MAPPING.items()) if MODEL_FOR_CTC_MAPPING else [])
    }

    @staticmethod
    def _librispeech_sample_file():
        """Return the ``"file"`` entry of sample 40 from the dummy LibriSpeech validation split.

        The split is sorted by ``"id"`` before indexing so the selected sample is stable.
        ``datasets`` is imported lazily; every caller is guarded by ``@require_datasets``.
        """
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
        return ds[40]["file"]

    def get_test_pipeline(self, model, tokenizer, feature_extractor):
        """Build an ASR pipeline plus example inputs for the given components.

        Skips the current test when ``tokenizer`` is ``None``.

        Returns:
            A ``(speech_recognizer, examples)`` tuple where ``examples`` is a list of two
            all-zero raw waveforms (34000 and 14000 samples).
        """
        if tokenizer is None:
            # Side effect of no fast tokenizer class for these models, so skipping.
            # The slow tokenizer tests should still run as they're quite small.
            # ``skipTest`` raises ``unittest.SkipTest``, so nothing after it executes.
            self.skipTest("No tokenizer available")

        speech_recognizer = AutomaticSpeechRecognitionPipeline(
            model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
        )

        # test with a raw waveform
        audio = np.zeros((34000,))
        audio2 = np.zeros((14000,))
        return speech_recognizer, [audio, audio2]

    def run_pipeline_test(self, speech_recognizer, examples):
        """Check that the pipeline returns ``{"text": <str>}`` for a silent waveform.

        NOTE(review): ``examples`` is accepted but not used; a fresh all-zero waveform is
        built instead. Kept as-is to preserve the existing behavior.
        """
        audio = np.zeros((34000,))
        outputs = speech_recognizer(audio)
        self.assertEqual(outputs, {"text": ANY(str)})

    @require_torch
    @slow
    def test_pt_defaults(self):
        """The task's default PyTorch pipeline can be instantiated."""
        pipeline("automatic-speech-recognition", framework="pt")

    @require_torch
    def test_small_model_pt(self):
        """Small Speech2Text checkpoint transcribes a synthetic ramp waveform."""
        speech_recognizer = pipeline(
            task="automatic-speech-recognition",
            model="facebook/s2t-small-mustc-en-fr-st",
            tokenizer="facebook/s2t-small-mustc-en-fr-st",
            framework="pt",
        )
        # 0..999 ramp repeated 34 times -> 34000 float32 samples
        waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
        output = speech_recognizer(waveform)
        self.assertEqual(output, {"text": "(Applaudissements)"})

    @require_tf
    def test_small_model_tf(self):
        """Placeholder: the pipeline has no TensorFlow support yet."""
        self.skipTest("Tensorflow not supported yet.")

    @require_torch
    def test_torch_small_no_tokenizer_files(self):
        """A model repository without tokenizer files cannot be loaded into the pipeline."""
        with pytest.raises(OSError):
            pipeline(
                task="automatic-speech-recognition",
                model="patrickvonplaten/tiny-wav2vec2-no-tokenizer",
                framework="pt",
            )

    @require_datasets
    @require_torch
    @slow
    def test_torch_large(self):
        """wav2vec2-base-960h via ``pipeline()``: synthetic waveform, then a LibriSpeech file."""
        speech_recognizer = pipeline(
            task="automatic-speech-recognition",
            model="facebook/wav2vec2-base-960h",
            tokenizer="facebook/wav2vec2-base-960h",
            framework="pt",
        )
        waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
        output = speech_recognizer(waveform)
        self.assertEqual(output, {"text": ""})

        filename = self._librispeech_sample_file()
        output = speech_recognizer(filename)
        self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})

    @require_datasets
    @require_torch
    @slow
    def test_torch_speech_encoder_decoder(self):
        """s2t-wav2vec2-large-en-de produces the expected German text for a LibriSpeech file."""
        speech_recognizer = pipeline(
            task="automatic-speech-recognition",
            model="facebook/s2t-wav2vec2-large-en-de",
            feature_extractor="facebook/s2t-wav2vec2-large-en-de",
            framework="pt",
        )

        filename = self._librispeech_sample_file()
        output = speech_recognizer(filename)
        self.assertEqual(output, {"text": 'Ein Mann sagte zum Universum : " Sir, ich existiert! "'})

    @slow
    @require_torch
    @require_datasets
    def test_simple_wav2vec2(self):
        """Explicitly assembled Wav2Vec2 pipeline accepts a waveform, a file path and raw bytes."""
        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
        tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
        feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

        asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)

        waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
        output = asr(waveform)
        self.assertEqual(output, {"text": ""})

        filename = self._librispeech_sample_file()
        output = asr(filename)
        self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})

        # Same audio again, this time passed as the file's raw bytes.
        with open(filename, "rb") as f:
            data = f.read()
        output = asr(data)
        self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})

    @slow
    @require_torch
    @require_torchaudio
    @require_datasets
    def test_simple_s2t(self):
        """Explicitly assembled Speech2Text pipeline accepts a waveform, a file path and raw bytes."""
        model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-mustc-en-it-st")
        tokenizer = AutoTokenizer.from_pretrained("facebook/s2t-small-mustc-en-it-st")
        feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/s2t-small-mustc-en-it-st")

        asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)

        waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
        output = asr(waveform)
        self.assertEqual(output, {"text": "(Applausi)"})

        filename = self._librispeech_sample_file()
        output = asr(filename)
        self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})

        # Same audio again, this time passed as the file's raw bytes.
        with open(filename, "rb") as f:
            data = f.read()
        output = asr(data)
        self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})

    @slow
    @require_torch
    @require_torchaudio
    @require_datasets
    def test_xls_r_to_en(self):
        """wav2vec2-xls-r-1b-21-to-en produces the expected English text for a LibriSpeech file."""
        speech_recognizer = pipeline(
            task="automatic-speech-recognition",
            model="facebook/wav2vec2-xls-r-1b-21-to-en",
            feature_extractor="facebook/wav2vec2-xls-r-1b-21-to-en",
            framework="pt",
        )

        filename = self._librispeech_sample_file()
        output = speech_recognizer(filename)
        # The expected text contains a left double quotation mark (U+201C) before "Sir".
        self.assertEqual(output, {"text": "A man said to the universe: “Sir, I exist."})

    @slow
    @require_torch
    @require_torchaudio
    @require_datasets
    def test_xls_r_from_en(self):
        """wav2vec2-xls-r-1b-en-to-15 produces the expected German text for a LibriSpeech file."""
        speech_recognizer = pipeline(
            task="automatic-speech-recognition",
            model="facebook/wav2vec2-xls-r-1b-en-to-15",
            feature_extractor="facebook/wav2vec2-xls-r-1b-en-to-15",
            framework="pt",
        )

        filename = self._librispeech_sample_file()
        output = speech_recognizer(filename)
        self.assertEqual(output, {"text": "Ein Mann sagte zu dem Universum, Sir, ich bin da."})