replace `Speech2TextTokenizer` by `Speech2TextFeatureExtractor` in some docstrings (#16835)

* replace `Speech2TextTokenizer` by `Speech2TextFeatureExtractor` in docstring
* quality

replace `Speech2TextTokenizer` by `Speech2TextFeatureExtractor` in some docstrings (#16835)
* replace `Speech2TextTokenizer` by `Speech2TextFeatureExtractor` in docstring
* quality
1efca4e6 · SaulLu · GitHub · b5c6a63e · 1efca4e6 · 1efca4e6
Unverified commit 1efca4e6, authored Apr 19, 2022 by SaulLu; committed by GitHub on Apr 19, 2022
3 changed files
--- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
@@ -139,8 +139,8 @@ SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
            Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
            by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.*
            via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-            [`Speech2TextTokenizer`] should be used for extracting the fbank features, padding and conversion into a
+            [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features, padding and conversion
-            tensor of type `torch.FloatTensor`. See [`~Speech2TextTokenizer.__call__`]
+            into a tensor of type `torch.FloatTensor`. See [`~Speech2TextFeatureExtractor.__call__`]
        return_dict (`bool`, *optional*):
            If set to `True`, the model will return a [`~utils.Seq2SeqLMOutput`] instead of a plain tuple.
        kwargs: (*optional*) Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:

--- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
@@ -600,8 +600,8 @@ SPEECH_TO_TEXT_INPUTS_DOCSTRING = r"""
            Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
            by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.*
            via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-            [`Speech2TextTokenizer`] should be used for extracting the fbank features, padding and conversion into a
+            [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features, padding and conversion
-            tensor of type `torch.FloatTensor`. See [`~Speech2TextTokenizer.__call__`]
+            into a tensor of type `torch.FloatTensor`. See [`~Speech2TextFeatureExtractor.__call__`]
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
            1]`:
@@ -733,9 +733,9 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
                Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
-                `input_features`, the [`Speech2TextTokenizer`] should be used for extracting the fbank features,
+                `input_features`, the [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features,
                padding and conversion into a tensor of type `torch.FloatTensor`. See
-                [`~Speech2TextTokenizer.__call__`]
+                [`~Speech2TextFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

--- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
@@ -650,8 +650,8 @@ SPEECH_TO_TEXT_INPUTS_DOCSTRING = r"""
            Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
            by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.*
            via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-            [`Speech2TextTokenizer`] should be used for extracting the fbank features, padding and conversion into a
+            [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features, padding and conversion
-            tensor of floats. See [`~Speech2TextTokenizer.__call__`]
+            into a tensor of floats. See [`~Speech2TextFeatureExtractor.__call__`]
        attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
@@ -798,8 +798,8 @@ class TFSpeech2TextEncoder(tf.keras.layers.Layer):
                Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
-                `input_features`, the [`Speech2TextTokenizer`] should be used for extracting the fbank features,
+                `input_features`, the [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features,
-                padding and conversion into a tensor of floats. See [`~Speech2TextTokenizer.__call__`]
+                padding and conversion into a tensor of floats. See [`~Speech2TextFeatureExtractor.__call__`]
            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: