Unverified Commit 443aaaa1 authored by Nicolas Patry, committed by GitHub

Adding ASR pipeline example. (#20226)

* Adding ASR pipeline example.

* De indent.

* Example deindent.

* Fixing example ?

* Putting the example in a more prominent place.

* Fixup.

* Adding the file.

* Adding the doctest to the daily test.

* Fixing comments.

* transcriber name.

* Adding `>>>`.

* Removing assert.
parent e4346278
src/transformers/pipelines/automatic_speech_recognition.py
@@ -16,6 +16,8 @@ from typing import TYPE_CHECKING, Dict, Optional, Union
 import numpy as np
+import requests
+
 from ..utils import is_torch_available, logging
 from .audio_utils import ffmpeg_read
 from .base import ChunkPipeline
@@ -106,6 +108,18 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
     The input can be either a raw waveform or an audio file. In the case of an audio file, ffmpeg should be
     installed to support multiple audio formats.

+    Example:
+
+    ```python
+    >>> from transformers import pipeline
+
+    >>> transcriber = pipeline(model="openai/whisper-base")
+    >>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
+    {'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour fat and sauce.'}
+    ```
+
+    [Using pipelines in a webserver or with a dataset](../pipeline_tutorial)
+
     Arguments:
         model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
             The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
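The doctest above covers the URL input; per the `inputs` annotation later in this diff (`np.ndarray` or `bytes` or `str` or `dict`), the same transcriber also accepts local paths and raw waveforms. A minimal sketch of those call forms, assuming ffmpeg is installed; the file name `sample.flac`, the silent waveform, and the 16 kHz rate are illustrative, not from the commit:

```python
import numpy as np

from transformers import pipeline

transcriber = pipeline(model="openai/whisper-base")

# Local file path ("sample.flac" is a placeholder name); the bytes are
# decoded with ffmpeg, as the docstring above requires.
print(transcriber("sample.flac"))

# Raw waveform passed as a dict with an explicit sampling rate
# (one second of silence here, purely as a placeholder signal).
waveform = np.zeros((16000,), dtype=np.float32)
print(transcriber({"raw": waveform, "sampling_rate": 16000}))
```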
@@ -150,6 +164,7 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
             [PyCTCDecode's
             BeamSearchDecoderCTC](https://github.com/kensho-technologies/pyctcdecode/blob/2fd33dc37c4111417e08d89ccd23d28e9b308d19/pyctcdecode/decoder.py#L180)
             can be passed for language model boosted decoding. See [`Wav2Vec2ProcessorWithLM`] for more information.
+
     """

     def __init__(self, feature_extractor: Union["SequenceFeatureExtractor", str], *args, **kwargs):
@@ -179,8 +194,8 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
         **kwargs,
     ):
         """
-        Classify the sequence(s) given as inputs. See the [`AutomaticSpeechRecognitionPipeline`] documentation for more
-        information.
+        Transcribe the audio sequence(s) given as inputs to text. See the [`AutomaticSpeechRecognitionPipeline`]
+        documentation for more information.

         Args:
             inputs (`np.ndarray` or `bytes` or `str` or `dict`):
@@ -236,8 +251,13 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
     def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None, ignore_warning=False):
         if isinstance(inputs, str):
-            with open(inputs, "rb") as f:
-                inputs = f.read()
+            if inputs.startswith("http://") or inputs.startswith("https://"):
+                # We need to actually check for a real protocol, otherwise it's impossible to use a local file
+                # like http_huggingface_co.png
+                inputs = requests.get(inputs).content
+            else:
+                with open(inputs, "rb") as f:
+                    inputs = f.read()

         if isinstance(inputs, bytes):
             inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
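The protocol check is the core of the behavioral change: a bare `startswith("http")` would also match a local file whose name merely looks like a URL. Isolated from the pipeline, the rule looks like this sketch (`load_audio_bytes` is a hypothetical name, not a library function):

```python
import requests


def load_audio_bytes(source: str) -> bytes:
    """Hypothetical helper mirroring the preprocess change above."""
    # Only treat the string as remote when it names a real protocol, so a
    # local file such as "http_huggingface_co.png" is still read from disk.
    if source.startswith("http://") or source.startswith("https://"):
        return requests.get(source).content
    with open(source, "rb") as f:
        return f.read()
```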
utils/documentation_tests.txt
@@ -194,4 +194,5 @@ src/transformers/models/xlnet/configuration_xlnet.py
 src/transformers/models/yolos/configuration_yolos.py
 src/transformers/models/yolos/modeling_yolos.py
 src/transformers/models/x_clip/modeling_x_clip.py
-src/transformers/models/yoso/configuration_yoso.py
\ No newline at end of file
+src/transformers/models/yoso/configuration_yoso.py
+src/transformers/pipelines/
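This list feeds the daily doctest job mentioned in the commit message (the new entry's path is truncated in this view; the first hunk of the commit touches the ASR pipeline module). One hedged way to exercise the new example locally with only the standard library, assuming network access and ffmpeg, and noting that the project's own doctest runner may apply extra preprocessing:

```python
import doctest

# The ASR pipeline module modified by the first hunk of this commit.
import transformers.pipelines.automatic_speech_recognition as asr_module

# Runs every `>>>` example in the module's docstrings, including the one
# added above; this downloads the whisper-base checkpoint and sample audio.
results = doctest.testmod(asr_module, verbose=True)
print(f"{results.failed} failed out of {results.attempted} attempted")
```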