Unverified Commit 600496fa authored by Patrick von Platen's avatar Patrick von Platen Committed by GitHub
Browse files

[Wav2Vec2] Rename model's feature extractor to feature encoder (#14959)

* rename classes

* clean up more namings

* remove bogus file

* Apply suggestions from code review

* Apply suggestions from code review

* replace more names

* more regex replace

* make style

* correct

* correct more

* make style

* finish

* correct more in wav2vec2

* make style

* improve freeze_extractor

* add aliases

* add tf aliases
parent 1bfa3477
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
import logging import logging
import os import os
import sys import sys
import warnings
from dataclasses import dataclass, field from dataclasses import dataclass, field
from random import randint from random import randint
from typing import Optional from typing import Optional
...@@ -76,24 +77,24 @@ class DataTrainingArguments: ...@@ -76,24 +77,24 @@ class DataTrainingArguments:
eval_file: Optional[str] = field( eval_file: Optional[str] = field(
default=None, metadata={"help": "A file containing the validation audio paths and labels."} default=None, metadata={"help": "A file containing the validation audio paths and labels."}
) )
train_split_name: Optional[str] = field( train_split_name: str = field(
default="train", default="train",
metadata={ metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'" "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
}, },
) )
eval_split_name: Optional[str] = field( eval_split_name: str = field(
default="validation", default="validation",
metadata={ metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to " "help": "The name of the training data set split to use (via the datasets library). Defaults to "
"'validation'" "'validation'"
}, },
) )
audio_column_name: Optional[str] = field( audio_column_name: str = field(
default="audio", default="audio",
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"}, metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
) )
label_column_name: Optional[str] = field( label_column_name: str = field(
default="label", metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'"} default="label", metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'"}
) )
max_train_samples: Optional[int] = field( max_train_samples: Optional[int] = field(
...@@ -110,7 +111,7 @@ class DataTrainingArguments: ...@@ -110,7 +111,7 @@ class DataTrainingArguments:
"value if set." "value if set."
}, },
) )
max_length_seconds: Optional[float] = field( max_length_seconds: float = field(
default=20, default=20,
metadata={"help": "Audio clips will be randomly cut to this length during training if the value is set."}, metadata={"help": "Audio clips will be randomly cut to this length during training if the value is set."},
) )
...@@ -136,11 +137,13 @@ class ModelArguments: ...@@ -136,11 +137,13 @@ class ModelArguments:
default="main", default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
) )
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."}) feature_extractor_name: Optional[str] = field(
freeze_feature_extractor: Optional[bool] = field( default=None, metadata={"help": "Name or path of preprocessor config."}
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."} )
freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
) )
attention_mask: Optional[bool] = field( attention_mask: bool = field(
default=True, metadata={"help": "Whether to generate an attention mask in the feature extractor."} default=True, metadata={"help": "Whether to generate an attention mask in the feature extractor."}
) )
use_auth_token: bool = field( use_auth_token: bool = field(
...@@ -150,6 +153,24 @@ class ModelArguments: ...@@ -150,6 +153,24 @@ class ModelArguments:
"with private models)." "with private models)."
}, },
) )
freeze_feature_extractor: Optional[bool] = field(
default=None, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
)
def __post_init__(self):
    """Handle the deprecated ``--freeze_feature_extractor`` flag.

    The flag defaults to ``None``, so a non-``None`` value means the user
    explicitly passed it on the command line. In that case emit a
    deprecation warning; if it conflicts with ``--freeze_feature_encoder``
    (extractor requested frozen while the encoder is not), raise.
    """
    # Only warn when the deprecated flag was actually supplied; a plain
    # truthiness check would also fire on the default value of None.
    if self.freeze_feature_extractor is not None and self.freeze_feature_encoder:
        warnings.warn(
            "The argument `--freeze_feature_extractor` is deprecated and "
            "will be removed in a future version. Use `--freeze_feature_encoder` "
            "instead. Setting `freeze_feature_encoder==True`.",
            FutureWarning,
        )
    if self.freeze_feature_extractor and not self.freeze_feature_encoder:
        raise ValueError(
            "The argument `--freeze_feature_extractor` is deprecated and "
            "should not be used in combination with `--freeze_feature_encoder`. "
            "Only make use of `--freeze_feature_encoder`."
        )
def main(): def main():
...@@ -302,8 +323,8 @@ def main(): ...@@ -302,8 +323,8 @@ def main():
) )
# freeze the convolutional waveform encoder # freeze the convolutional waveform encoder
if model_args.freeze_feature_extractor: if model_args.freeze_feature_encoder:
model.freeze_feature_extractor() model.freeze_feature_encoder()
if training_args.do_train: if training_args.do_train:
if data_args.max_train_samples is not None: if data_args.max_train_samples is not None:
......
...@@ -78,7 +78,7 @@ python run_speech_recognition_ctc.py \ ...@@ -78,7 +78,7 @@ python run_speech_recognition_ctc.py \
--eval_steps="100" \ --eval_steps="100" \
--layerdrop="0.0" \ --layerdrop="0.0" \
--save_total_limit="3" \ --save_total_limit="3" \
--freeze_feature_extractor \ --freeze_feature_encoder \
--gradient_checkpointing \ --gradient_checkpointing \
--chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \ --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
--fp16 \ --fp16 \
...@@ -113,7 +113,7 @@ python -m torch.distributed.launch \ ...@@ -113,7 +113,7 @@ python -m torch.distributed.launch \
--logging_steps="1" \ --logging_steps="1" \
--layerdrop="0.0" \ --layerdrop="0.0" \
--save_total_limit="3" \ --save_total_limit="3" \
--freeze_feature_extractor \ --freeze_feature_encoder \
--gradient_checkpointing \ --gradient_checkpointing \
--chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \ --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
--fp16 \ --fp16 \
...@@ -304,7 +304,7 @@ python run_speech_recognition_seq2seq.py \ ...@@ -304,7 +304,7 @@ python run_speech_recognition_seq2seq.py \
--eval_steps="400" \ --eval_steps="400" \
--logging_steps="10" \ --logging_steps="10" \
--save_total_limit="1" \ --save_total_limit="1" \
--freeze_feature_extractor \ --freeze_feature_encoder \
--gradient_checkpointing \ --gradient_checkpointing \
--fp16 \ --fp16 \
--group_by_length \ --group_by_length \
...@@ -346,7 +346,7 @@ python -m torch.distributed.launch \ ...@@ -346,7 +346,7 @@ python -m torch.distributed.launch \
--eval_steps="400" \ --eval_steps="400" \
--logging_steps="10" \ --logging_steps="10" \
--save_total_limit="1" \ --save_total_limit="1" \
--freeze_feature_extractor \ --freeze_feature_encoder \
--gradient_checkpointing \ --gradient_checkpointing \
--fp16 \ --fp16 \
--group_by_length \ --group_by_length \
......
...@@ -78,29 +78,27 @@ class ModelArguments: ...@@ -78,29 +78,27 @@ class ModelArguments:
default=None, default=None,
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
) )
freeze_feature_extractor: Optional[bool] = field( freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."} default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
) )
attention_dropout: Optional[float] = field( attention_dropout: float = field(
default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."} default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
) )
activation_dropout: Optional[float] = field( activation_dropout: float = field(
default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."} default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
) )
feat_proj_dropout: Optional[float] = field( feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
default=0.0, metadata={"help": "The dropout ratio for the projected features."} hidden_dropout: float = field(
)
hidden_dropout: Optional[float] = field(
default=0.0, default=0.0,
metadata={ metadata={
"help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler." "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
}, },
) )
final_dropout: Optional[float] = field( final_dropout: float = field(
default=0.0, default=0.0,
metadata={"help": "The dropout probability for the final projection layer."}, metadata={"help": "The dropout probability for the final projection layer."},
) )
mask_time_prob: Optional[float] = field( mask_time_prob: float = field(
default=0.05, default=0.05,
metadata={ metadata={
"help": "Probability of each feature vector along the time axis to be chosen as the start of the vector" "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
...@@ -108,22 +106,22 @@ class ModelArguments: ...@@ -108,22 +106,22 @@ class ModelArguments:
"vectors will be masked along the time axis." "vectors will be masked along the time axis."
}, },
) )
mask_time_length: Optional[int] = field( mask_time_length: int = field(
default=10, default=10,
metadata={"help": "Length of vector span to mask along the time axis."}, metadata={"help": "Length of vector span to mask along the time axis."},
) )
mask_feature_prob: Optional[float] = field( mask_feature_prob: float = field(
default=0.0, default=0.0,
metadata={ metadata={
"help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector" "help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis." "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
}, },
) )
mask_feature_length: Optional[int] = field( mask_feature_length: int = field(
default=10, default=10,
metadata={"help": "Length of vector span to mask along the feature axis."}, metadata={"help": "Length of vector span to mask along the feature axis."},
) )
layerdrop: Optional[float] = field(default=0.0, metadata={"help": "The LayerDrop probability."}) layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
ctc_loss_reduction: Optional[str] = field( ctc_loss_reduction: Optional[str] = field(
default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."} default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
) )
...@@ -142,26 +140,26 @@ class DataTrainingArguments: ...@@ -142,26 +140,26 @@ class DataTrainingArguments:
dataset_name: str = field( dataset_name: str = field(
metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
) )
dataset_config_name: Optional[str] = field( dataset_config_name: str = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
) )
train_split_name: Optional[str] = field( train_split_name: str = field(
default="train+validation", default="train+validation",
metadata={ metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'" "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
}, },
) )
eval_split_name: Optional[str] = field( eval_split_name: str = field(
default="test", default="test",
metadata={ metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'" "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
}, },
) )
audio_column_name: Optional[str] = field( audio_column_name: str = field(
default="audio", default="audio",
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"}, metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
) )
text_column_name: Optional[str] = field( text_column_name: str = field(
default="text", default="text",
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"}, metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
) )
...@@ -190,20 +188,20 @@ class DataTrainingArguments: ...@@ -190,20 +188,20 @@ class DataTrainingArguments:
default=None, default=None,
metadata={"help": "A list of characters to remove from the transcripts."}, metadata={"help": "A list of characters to remove from the transcripts."},
) )
eval_metrics: Optional[List[str]] = list_field( eval_metrics: List[str] = list_field(
default=["wer"], default=["wer"],
metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"}, metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
) )
max_duration_in_seconds: Optional[float] = field( max_duration_in_seconds: float = field(
default=20.0, default=20.0,
metadata={ metadata={
"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`" "help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
}, },
) )
min_duration_in_seconds: Optional[float] = field( min_duration_in_seconds: float = field(
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"} default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
) )
preprocessing_only: Optional[bool] = field( preprocessing_only: bool = field(
default=False, default=False,
metadata={ metadata={
"help": "Whether to only do data preprocessing and skip training. " "help": "Whether to only do data preprocessing and skip training. "
...@@ -212,22 +210,22 @@ class DataTrainingArguments: ...@@ -212,22 +210,22 @@ class DataTrainingArguments:
"so that the cached datasets can consequently be loaded in distributed training" "so that the cached datasets can consequently be loaded in distributed training"
}, },
) )
use_auth_token: Optional[bool] = field( use_auth_token: bool = field(
default=False, default=False,
metadata={ metadata={
"help": "If :obj:`True`, will use the token generated when running" "help": "If :obj:`True`, will use the token generated when running"
":obj:`transformers-cli login` as HTTP bearer authorization for remote files." ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
}, },
) )
unk_token: Optional[str] = field( unk_token: str = field(
default="[UNK]", default="[UNK]",
metadata={"help": "The unk token for the tokenizer"}, metadata={"help": "The unk token for the tokenizer"},
) )
pad_token: Optional[str] = field( pad_token: str = field(
default="[PAD]", default="[PAD]",
metadata={"help": "The padding token for the tokenizer"}, metadata={"help": "The padding token for the tokenizer"},
) )
word_delimiter_token: Optional[str] = field( word_delimiter_token: str = field(
default="|", default="|",
metadata={"help": "The word delimiter token for the tokenizer"}, metadata={"help": "The word delimiter token for the tokenizer"},
) )
...@@ -545,8 +543,8 @@ def main(): ...@@ -545,8 +543,8 @@ def main():
) )
# freeze encoder # freeze encoder
if model_args.freeze_feature_extractor: if model_args.freeze_feature_encoder:
model.freeze_feature_extractor() model.freeze_feature_encoder()
# 6. Now we preprocess the datasets including loading the audio, resampling and normalization # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
# Thankfully, `datasets` takes care of automatically loading and resampling the audio, # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
......
...@@ -91,8 +91,8 @@ class ModelArguments: ...@@ -91,8 +91,8 @@ class ModelArguments:
"with private models)." "with private models)."
}, },
) )
freeze_feature_extractor: Optional[bool] = field( freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."} default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
) )
...@@ -102,7 +102,7 @@ class DataTrainingArguments: ...@@ -102,7 +102,7 @@ class DataTrainingArguments:
Arguments pertaining to what data we are going to input our model for training and eval. Arguments pertaining to what data we are going to input our model for training and eval.
""" """
dataset_name: Optional[str] = field( dataset_name: str = field(
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
) )
dataset_config_name: Optional[str] = field( dataset_config_name: Optional[str] = field(
...@@ -133,24 +133,24 @@ class DataTrainingArguments: ...@@ -133,24 +133,24 @@ class DataTrainingArguments:
"value if set." "value if set."
}, },
) )
audio_column_name: Optional[str] = field( audio_column_name: str = field(
default="audio", default="audio",
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"}, metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
) )
text_column_name: Optional[str] = field( text_column_name: str = field(
default="text", default="text",
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"}, metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
) )
max_duration_in_seconds: Optional[float] = field( max_duration_in_seconds: float = field(
default=20.0, default=20.0,
metadata={ metadata={
"help": "Truncate audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`" "help": "Truncate audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
}, },
) )
min_duration_in_seconds: Optional[float] = field( min_duration_in_seconds: float = field(
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"} default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
) )
preprocessing_only: Optional[bool] = field( preprocessing_only: bool = field(
default=False, default=False,
metadata={ metadata={
"help": "Whether to only do data preprocessing and skip training. " "help": "Whether to only do data preprocessing and skip training. "
...@@ -159,19 +159,19 @@ class DataTrainingArguments: ...@@ -159,19 +159,19 @@ class DataTrainingArguments:
"so that the cached datasets can consequently be loaded in distributed training" "so that the cached datasets can consequently be loaded in distributed training"
}, },
) )
train_split_name: Optional[str] = field( train_split_name: str = field(
default="train", default="train",
metadata={ metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'" "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
}, },
) )
eval_split_name: Optional[str] = field( eval_split_name: str = field(
default="test", default="test",
metadata={ metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'" "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
}, },
) )
do_lower_case: Optional[bool] = field( do_lower_case: bool = field(
default=True, default=True,
metadata={"help": "Whether the target text should be lower cased."}, metadata={"help": "Whether the target text should be lower cased."},
) )
...@@ -335,8 +335,8 @@ def main(): ...@@ -335,8 +335,8 @@ def main():
if model.config.decoder_start_token_id is None: if model.config.decoder_start_token_id is None:
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
if model_args.freeze_feature_extractor: if model_args.freeze_feature_encoder:
model.freeze_feature_extractor() model.freeze_feature_encoder()
# 6. Resample speech dataset if necessary # 6. Resample speech dataset if necessary
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
......
...@@ -64,24 +64,24 @@ class HubertConfig(PretrainedConfig): ...@@ -64,24 +64,24 @@ class HubertConfig(PretrainedConfig):
layer_norm_eps (`float`, *optional*, defaults to 1e-12): layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers. The epsilon used by the layer normalization layers.
feat_extract_norm (`str`, *optional*, defaults to `"group"`): feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers. convolutional layers.
feat_proj_dropout (`float`, *optional*, defaults to 0.0): feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor. The dropout probability for output of the feature encoder.
feat_proj_layer_norm (`bool`, *optional*, defaults to `True`): feat_proj_layer_norm (`bool`, *optional*, defaults to `True`):
Whether to apply LayerNorm to the output of the feature extractor. Whether to apply LayerNorm to the output of the feature encoder.
feat_extract_activation (`str, `optional`, defaults to `"gelu"`): feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature The non-linear activation function (function or string) in the 1D convolutional layers of the feature
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers. feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of *conv_kernel* defines the number of convolutional layers and has to match the length of length of *conv_kernel* defines the number of convolutional layers and has to match the length of
*conv_dim*. *conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`): conv_bias (`bool`, *optional*, defaults to `False`):
...@@ -96,7 +96,7 @@ class HubertConfig(PretrainedConfig): ...@@ -96,7 +96,7 @@ class HubertConfig(PretrainedConfig):
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
False` corresponds to applying layer norm after the attention layer. False` corresponds to applying layer norm after the attention layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`): apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779). Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05): mask_time_prob (`float`, *optional*, defaults to 0.05):
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
# limitations under the License. # limitations under the License.
""" PyTorch Hubert model.""" """ PyTorch Hubert model."""
import warnings
from typing import Optional, Tuple, Union from typing import Optional, Tuple, Union
import numpy as np import numpy as np
...@@ -284,8 +285,8 @@ class HubertSamePadLayer(nn.Module): ...@@ -284,8 +285,8 @@ class HubertSamePadLayer(nn.Module):
return hidden_states return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->Hubert # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->Hubert
class HubertFeatureExtractor(nn.Module): class HubertFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform""" """Construct the features from raw audio waveform"""
def __init__(self, config): def __init__(self, config):
...@@ -336,6 +337,17 @@ class HubertFeatureExtractor(nn.Module): ...@@ -336,6 +337,17 @@ class HubertFeatureExtractor(nn.Module):
return hidden_states return hidden_states
class HubertFeatureExtractor(HubertFeatureEncoder):
    """Deprecated alias of `HubertFeatureEncoder`.

    Kept only for backward compatibility; instantiating it emits a
    `FutureWarning` pointing at the replacement class.
    """

    def __init__(self, config):
        super().__init__(config)
        # "deprecated" (not "depreciated") in the user-facing message.
        warnings.warn(
            f"The class `{self.__class__.__name__}` has been deprecated "
            "and will be removed in Transformers v5. "
            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
            FutureWarning,
        )
class HubertFeatureProjection(nn.Module): class HubertFeatureProjection(nn.Module):
def __init__(self, config): def __init__(self, config):
super().__init__() super().__init__()
...@@ -902,7 +914,7 @@ class HubertModel(HubertPreTrainedModel): ...@@ -902,7 +914,7 @@ class HubertModel(HubertPreTrainedModel):
def __init__(self, config: HubertConfig): def __init__(self, config: HubertConfig):
super().__init__(config) super().__init__(config)
self.config = config self.config = config
self.feature_extractor = HubertFeatureExtractor(config) self.feature_extractor = HubertFeatureEncoder(config)
self.feature_projection = HubertFeatureProjection(config) self.feature_projection = HubertFeatureProjection(config)
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
...@@ -1063,8 +1075,20 @@ class HubertForCTC(HubertPreTrainedModel): ...@@ -1063,8 +1075,20 @@ class HubertForCTC(HubertPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameter Calling this function will disable the gradient computation for the feature encoder so that its parameter will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.hubert.feature_extractor._freeze_parameters() self.hubert.feature_extractor._freeze_parameters()
...@@ -1172,8 +1196,20 @@ class HubertForSequenceClassification(HubertPreTrainedModel): ...@@ -1172,8 +1196,20 @@ class HubertForSequenceClassification(HubertPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameters Calling this function will disable the gradient computation for the feature encoder so that its parameters will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.hubert.feature_extractor._freeze_parameters() self.hubert.feature_extractor._freeze_parameters()
......
...@@ -659,7 +659,7 @@ class TFHubertSamePadLayer(tf.keras.layers.Layer): ...@@ -659,7 +659,7 @@ class TFHubertSamePadLayer(tf.keras.layers.Layer):
return hidden_states return hidden_states
class TFHubertFeatureExtractor(tf.keras.layers.Layer): class TFHubertFeatureEncoder(tf.keras.layers.Layer):
def __init__(self, config: HubertConfig, **kwargs: Any) -> None: def __init__(self, config: HubertConfig, **kwargs: Any) -> None:
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -686,6 +686,17 @@ class TFHubertFeatureExtractor(tf.keras.layers.Layer): ...@@ -686,6 +686,17 @@ class TFHubertFeatureExtractor(tf.keras.layers.Layer):
return hidden_states return hidden_states
class TFHubertFeatureExtractor(TFHubertFeatureEncoder):
def __init__(self, config, **kwargs):
super().__init__(config, **kwargs)
warnings.warn(
f"The class `{self.__class__.__name__}` has been deprecated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
class TFHubertFeatureProjection(tf.keras.layers.Layer): class TFHubertFeatureProjection(tf.keras.layers.Layer):
def __init__(self, config: HubertConfig, **kwargs): def __init__(self, config: HubertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -1116,7 +1127,7 @@ class TFHubertMainLayer(tf.keras.layers.Layer): ...@@ -1116,7 +1127,7 @@ class TFHubertMainLayer(tf.keras.layers.Layer):
def __init__(self, config: HubertConfig, **kwargs): def __init__(self, config: HubertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.feature_extractor = TFHubertFeatureExtractor(config, name="feature_extractor") self.feature_extractor = TFHubertFeatureEncoder(config, name="feature_extractor")
self.feature_projection = TFHubertFeatureProjection(config, name="feature_projection") self.feature_projection = TFHubertFeatureProjection(config, name="feature_projection")
if config.do_stable_layer_norm: if config.do_stable_layer_norm:
...@@ -1490,8 +1501,20 @@ class TFHubertForCTC(TFHubertPreTrainedModel): ...@@ -1490,8 +1501,20 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameter Calling this function will disable the gradient computation for the feature encoder so that its parameters will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.hubert.feature_extractor.trainable = False self.hubert.feature_extractor.trainable = False
......
...@@ -65,22 +65,22 @@ class SEWConfig(PretrainedConfig): ...@@ -65,22 +65,22 @@ class SEWConfig(PretrainedConfig):
layer_norm_eps (`float`, *optional*, defaults to 1e-12): layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers. The epsilon used by the layer normalization layers.
feat_extract_norm (`str`, *optional*, defaults to `"group"`): feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers. convolutional layers.
feat_proj_dropout (`float`, *optional*, defaults to 0.0): feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor. The dropout probability for output of the feature encoder.
feat_extract_activation (`str, `optional`, defaults to `"gelu"`): feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature The non-linear activation function (function or string) in the 1D convolutional layers of the feature
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`): conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers. feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`): conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`): conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of *conv_kernel* defines the number of convolutional layers and has to match the the length of length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
*conv_dim*. *conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`): conv_bias (`bool`, *optional*, defaults to `False`):
...@@ -91,7 +91,7 @@ class SEWConfig(PretrainedConfig): ...@@ -91,7 +91,7 @@ class SEWConfig(PretrainedConfig):
num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
Number of groups of 1D convolutional positional embeddings layer. Number of groups of 1D convolutional positional embeddings layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`): apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779). Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05): mask_time_prob (`float`, *optional*, defaults to 0.05):
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
""" PyTorch SEW model.""" """ PyTorch SEW model."""
import math import math
import warnings
from typing import Optional, Tuple, Union from typing import Optional, Tuple, Union
import numpy as np import numpy as np
...@@ -301,8 +302,8 @@ class SEWUpsampling(nn.Module): ...@@ -301,8 +302,8 @@ class SEWUpsampling(nn.Module):
return hidden_states return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->SEW # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->SEW
class SEWFeatureExtractor(nn.Module): class SEWFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform""" """Construct the features from raw audio waveform"""
def __init__(self, config): def __init__(self, config):
...@@ -353,6 +354,17 @@ class SEWFeatureExtractor(nn.Module): ...@@ -353,6 +354,17 @@ class SEWFeatureExtractor(nn.Module):
return hidden_states return hidden_states
class SEWFeatureExtractor(SEWFeatureEncoder):
def __init__(self, config):
super().__init__(config)
warnings.warn(
f"The class `{self.__class__.__name__}` has been deprecated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->SEW # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->SEW
class SEWAttention(nn.Module): class SEWAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper""" """Multi-headed attention from 'Attention Is All You Need' paper"""
...@@ -712,7 +724,7 @@ class SEWPreTrainedModel(PreTrainedModel): ...@@ -712,7 +724,7 @@ class SEWPreTrainedModel(PreTrainedModel):
module.bias.data.zero_() module.bias.data.zero_()
def _set_gradient_checkpointing(self, module, value=False): def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, (SEWEncoder, SEWFeatureExtractor)): if isinstance(module, (SEWEncoder, SEWFeatureEncoder)):
module.gradient_checkpointing = value module.gradient_checkpointing = value
def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]): def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
...@@ -797,7 +809,7 @@ class SEWModel(SEWPreTrainedModel): ...@@ -797,7 +809,7 @@ class SEWModel(SEWPreTrainedModel):
def __init__(self, config: SEWConfig): def __init__(self, config: SEWConfig):
super().__init__(config) super().__init__(config)
self.config = config self.config = config
self.feature_extractor = SEWFeatureExtractor(config) self.feature_extractor = SEWFeatureEncoder(config)
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
self.project_features = config.conv_dim[-1] != config.hidden_size self.project_features = config.conv_dim[-1] != config.hidden_size
...@@ -943,8 +955,20 @@ class SEWForCTC(SEWPreTrainedModel): ...@@ -943,8 +955,20 @@ class SEWForCTC(SEWPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameter Calling this function will disable the gradient computation for the feature encoder so that its parameter will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.sew.feature_extractor._freeze_parameters() self.sew.feature_extractor._freeze_parameters()
...@@ -1052,8 +1076,20 @@ class SEWForSequenceClassification(SEWPreTrainedModel): ...@@ -1052,8 +1076,20 @@ class SEWForSequenceClassification(SEWPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameters Calling this function will disable the gradient computation for the feature encoder so that its parameters will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.sew.feature_extractor._freeze_parameters() self.sew.feature_extractor._freeze_parameters()
......
...@@ -81,24 +81,24 @@ class SEWDConfig(PretrainedConfig): ...@@ -81,24 +81,24 @@ class SEWDConfig(PretrainedConfig):
layer_norm_eps (`float`, *optional*, defaults to 1e-7): layer_norm_eps (`float`, *optional*, defaults to 1e-7):
The epsilon used by the layer normalization layers in the transformer encoder. The epsilon used by the layer normalization layers in the transformer encoder.
feature_layer_norm_eps (`float`, *optional*, defaults to 1e-5): feature_layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization after the feature extractor. The epsilon used by the layer normalization after the feature encoder.
feat_extract_norm (`str`, *optional*, defaults to `"group"`): feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers. convolutional layers.
feat_proj_dropout (`float`, *optional*, defaults to 0.0): feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor. The dropout probability for output of the feature encoder.
feat_extract_activation (`str, `optional`, defaults to `"gelu"`): feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature The non-linear activation function (function or string) in the 1D convolutional layers of the feature
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`): conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers. feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`): conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`): conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of *conv_kernel* defines the number of convolutional layers and has to match the the length of length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
*conv_dim*. *conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`): conv_bias (`bool`, *optional*, defaults to `False`):
...@@ -109,7 +109,7 @@ class SEWDConfig(PretrainedConfig): ...@@ -109,7 +109,7 @@ class SEWDConfig(PretrainedConfig):
num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
Number of groups of 1D convolutional positional embeddings layer. Number of groups of 1D convolutional positional embeddings layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`): apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779). Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05): mask_time_prob (`float`, *optional*, defaults to 0.05):
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
""" PyTorch SEW model.""" """ PyTorch SEW model."""
import math import math
import warnings
from collections.abc import Sequence from collections.abc import Sequence
from typing import Optional, Tuple, Union from typing import Optional, Tuple, Union
...@@ -387,8 +388,8 @@ class SEWDUpsampling(nn.Module): ...@@ -387,8 +388,8 @@ class SEWDUpsampling(nn.Module):
return hidden_states return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->SEWD # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->SEWD
class SEWDFeatureExtractor(nn.Module): class SEWDFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform""" """Construct the features from raw audio waveform"""
def __init__(self, config): def __init__(self, config):
...@@ -439,6 +440,17 @@ class SEWDFeatureExtractor(nn.Module): ...@@ -439,6 +440,17 @@ class SEWDFeatureExtractor(nn.Module):
return hidden_states return hidden_states
class SEWDFeatureExtractor(SEWDFeatureEncoder):
def __init__(self, config):
super().__init__(config)
warnings.warn(
f"The class `{self.__class__.__name__}` has been deprecated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
# Copied from transformers.models.deberta.modeling_deberta.ContextPooler # Copied from transformers.models.deberta.modeling_deberta.ContextPooler
class ContextPooler(nn.Module): class ContextPooler(nn.Module):
def __init__(self, config): def __init__(self, config):
...@@ -1333,7 +1345,7 @@ class SEWDModel(SEWDPreTrainedModel): ...@@ -1333,7 +1345,7 @@ class SEWDModel(SEWDPreTrainedModel):
def __init__(self, config: SEWDConfig): def __init__(self, config: SEWDConfig):
super().__init__(config) super().__init__(config)
self.config = config self.config = config
self.feature_extractor = SEWDFeatureExtractor(config) self.feature_extractor = SEWDFeatureEncoder(config)
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.feature_layer_norm_eps) self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.feature_layer_norm_eps)
self.project_features = config.conv_dim[-1] != config.hidden_size self.project_features = config.conv_dim[-1] != config.hidden_size
...@@ -1479,8 +1491,20 @@ class SEWDForCTC(SEWDPreTrainedModel): ...@@ -1479,8 +1491,20 @@ class SEWDForCTC(SEWDPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameter Calling this function will disable the gradient computation for the feature encoder so that its parameter will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.sew_d.feature_extractor._freeze_parameters() self.sew_d.feature_extractor._freeze_parameters()
...@@ -1588,8 +1612,20 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel): ...@@ -1588,8 +1612,20 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameters Calling this function will disable the gradient computation for the feature encoder so that its parameters will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.sew_d.feature_extractor._freeze_parameters() self.sew_d.feature_extractor._freeze_parameters()
......
...@@ -265,12 +265,12 @@ class SpeechEncoderDecoderModel(PreTrainedModel): ...@@ -265,12 +265,12 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
def set_output_embeddings(self, new_embeddings): def set_output_embeddings(self, new_embeddings):
return self.decoder.set_output_embeddings(new_embeddings) return self.decoder.set_output_embeddings(new_embeddings)
def freeze_feature_extractor(self): def freeze_feature_encoder(self):
""" """
Calling this function will disable the gradient computation for the feature extractor of the speech encoder so Calling this function will disable the gradient computation for the feature encoder of the speech encoder so
that its parameters will not be updated during training. that its parameters will not be updated during training.
""" """
self.encoder.freeze_feature_extractor() self.encoder.freeze_feature_encoder()
@classmethod @classmethod
def from_pretrained(cls, *args, **kwargs): def from_pretrained(cls, *args, **kwargs):
......
...@@ -65,24 +65,24 @@ class UniSpeechConfig(PretrainedConfig): ...@@ -65,24 +65,24 @@ class UniSpeechConfig(PretrainedConfig):
layer_norm_eps (`float`, *optional*, defaults to 1e-12): layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers. The epsilon used by the layer normalization layers.
feat_extract_norm (`str`, *optional*, defaults to `"group"`): feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers. convolutional layers.
feat_proj_dropout (`float`, *optional*, defaults to 0.0): feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor. The dropout probability for output of the feature encoder.
feat_extract_activation (`str, `optional`, defaults to `"gelu"`): feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature The non-linear activation function (function or string) in the 1D convolutional layers of the feature
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for quantized feature extractor states. The dropout probability for quantized feature encoder states.
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers. feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of *conv_kernel* defines the number of convolutional layers and has to match the the length of length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
*conv_dim*. *conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`): conv_bias (`bool`, *optional*, defaults to `False`):
...@@ -97,7 +97,7 @@ class UniSpeechConfig(PretrainedConfig): ...@@ -97,7 +97,7 @@ class UniSpeechConfig(PretrainedConfig):
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
False` corresponds to applying layer norm after the attention layer. False` corresponds to applying layer norm after the attention layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`): apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779). Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05): mask_time_prob (`float`, *optional*, defaults to 0.05):
...@@ -132,7 +132,7 @@ class UniSpeechConfig(PretrainedConfig): ...@@ -132,7 +132,7 @@ class UniSpeechConfig(PretrainedConfig):
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
The temperature *kappa* in the contrastive loss. The temperature *kappa* in the contrastive loss.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the output of the feature extractor that's used by the quantizer. The dropout probability for the output of the feature encoder that's used by the quantizer.
num_negatives (`int`, *optional*, defaults to 100): num_negatives (`int`, *optional*, defaults to 100):
Number of negative samples for the contrastive loss. Number of negative samples for the contrastive loss.
codevector_dim (`int`, *optional*, defaults to 256): codevector_dim (`int`, *optional*, defaults to 256):
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
""" PyTorch UniSpeech model.""" """ PyTorch UniSpeech model."""
import math import math
import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional, Tuple, Union from typing import Optional, Tuple, Union
...@@ -351,8 +352,8 @@ class UniSpeechSamePadLayer(nn.Module): ...@@ -351,8 +352,8 @@ class UniSpeechSamePadLayer(nn.Module):
return hidden_states return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->UniSpeech # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->UniSpeech
class UniSpeechFeatureExtractor(nn.Module): class UniSpeechFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform""" """Construct the features from raw audio waveform"""
def __init__(self, config): def __init__(self, config):
...@@ -406,6 +407,17 @@ class UniSpeechFeatureExtractor(nn.Module): ...@@ -406,6 +407,17 @@ class UniSpeechFeatureExtractor(nn.Module):
return hidden_states return hidden_states
class UniSpeechFeatureExtractor(UniSpeechFeatureEncoder):
def __init__(self, config):
super().__init__(config)
warnings.warn(
f"The class `{self.__class__.__name__}` has been deprecated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->UniSpeech # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->UniSpeech
class UniSpeechFeatureProjection(nn.Module): class UniSpeechFeatureProjection(nn.Module):
def __init__(self, config): def __init__(self, config):
...@@ -980,7 +992,7 @@ class UniSpeechPreTrainedModel(PreTrainedModel): ...@@ -980,7 +992,7 @@ class UniSpeechPreTrainedModel(PreTrainedModel):
return attention_mask return attention_mask
def _set_gradient_checkpointing(self, module, value=False): def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, (UniSpeechEncoder, UniSpeechEncoderStableLayerNorm, UniSpeechFeatureExtractor)): if isinstance(module, (UniSpeechEncoder, UniSpeechEncoderStableLayerNorm, UniSpeechFeatureEncoder)):
module.gradient_checkpointing = value module.gradient_checkpointing = value
...@@ -1049,7 +1061,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel): ...@@ -1049,7 +1061,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
def __init__(self, config: UniSpeechConfig): def __init__(self, config: UniSpeechConfig):
super().__init__(config) super().__init__(config)
self.config = config self.config = config
self.feature_extractor = UniSpeechFeatureExtractor(config) self.feature_extractor = UniSpeechFeatureEncoder(config)
self.feature_projection = UniSpeechFeatureProjection(config) self.feature_projection = UniSpeechFeatureProjection(config)
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
...@@ -1193,8 +1205,20 @@ class UniSpeechForPreTraining(UniSpeechPreTrainedModel): ...@@ -1193,8 +1205,20 @@ class UniSpeechForPreTraining(UniSpeechPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameters Calling this function will disable the gradient computation for the feature encoder so that its parameters will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.unispeech.feature_extractor._freeze_parameters() self.unispeech.feature_extractor._freeze_parameters()
...@@ -1358,8 +1382,20 @@ class UniSpeechForCTC(UniSpeechPreTrainedModel): ...@@ -1358,8 +1382,20 @@ class UniSpeechForCTC(UniSpeechPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameter Calling this function will disable the gradient computation for the feature encoder so that its parameter will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.unispeech.feature_extractor._freeze_parameters() self.unispeech.feature_extractor._freeze_parameters()
...@@ -1467,8 +1503,20 @@ class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel): ...@@ -1467,8 +1503,20 @@ class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameters Calling this function will disable the gradient computation for the feature encoder so that its parameters will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.unispeech.feature_extractor._freeze_parameters() self.unispeech.feature_extractor._freeze_parameters()
......
...@@ -65,24 +65,24 @@ class UniSpeechSatConfig(PretrainedConfig): ...@@ -65,24 +65,24 @@ class UniSpeechSatConfig(PretrainedConfig):
layer_norm_eps (`float`, *optional*, defaults to 1e-12): layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers. The epsilon used by the layer normalization layers.
feat_extract_norm (`str`, *optional*, defaults to `"group"`): feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers. convolutional layers.
feat_proj_dropout (`float`, *optional*, defaults to 0.0): feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor. The dropout probability for output of the feature encoder.
feat_extract_activation (`str, `optional`, defaults to `"gelu"`): feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature The non-linear activation function (function or string) in the 1D convolutional layers of the feature
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for quantized feature extractor states. The dropout probabilitiy for quantized feature encoder states.
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers. feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*. of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of *conv_kernel* defines the number of convolutional layers and has to match the the length of length of *conv_kernel* defines the number of convolutional layers and has to match the the length of
*conv_dim*. *conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`): conv_bias (`bool`, *optional*, defaults to `False`):
...@@ -97,7 +97,7 @@ class UniSpeechSatConfig(PretrainedConfig): ...@@ -97,7 +97,7 @@ class UniSpeechSatConfig(PretrainedConfig):
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
False` corresponds to applying layer norm after the attention layer. False` corresponds to applying layer norm after the attention layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`): apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779). Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05): mask_time_prob (`float`, *optional*, defaults to 0.05):
...@@ -132,7 +132,7 @@ class UniSpeechSatConfig(PretrainedConfig): ...@@ -132,7 +132,7 @@ class UniSpeechSatConfig(PretrainedConfig):
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
The temperature *kappa* in the contrastive loss. The temperature *kappa* in the contrastive loss.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for the output of the feature extractor that's used by the quantizer. The dropout probabilitiy for the output of the feature encoder that's used by the quantizer.
num_negatives (`int`, *optional*, defaults to 100): num_negatives (`int`, *optional*, defaults to 100):
Number of negative samples for the contrastive loss. Number of negative samples for the contrastive loss.
codevector_dim (`int`, *optional*, defaults to 256): codevector_dim (`int`, *optional*, defaults to 256):
......
...@@ -20,12 +20,7 @@ import argparse ...@@ -20,12 +20,7 @@ import argparse
import fairseq import fairseq
import torch import torch
from transformers import ( # UniSpeechSatCTCTokenizer,; UniSpeechSatFeatureExtractor,; UniSpeechSatProcessor, from transformers import UniSpeechSatConfig, UniSpeechSatForCTC, UniSpeechSatForPreTraining, logging
UniSpeechSatConfig,
UniSpeechSatForCTC,
UniSpeechSatForPreTraining,
logging,
)
logging.set_verbosity_info() logging.set_verbosity_info()
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
""" PyTorch UniSpeechSat model.""" """ PyTorch UniSpeechSat model."""
import math import math
import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional, Tuple, Union from typing import Optional, Tuple, Union
...@@ -385,8 +386,8 @@ class UniSpeechSatSamePadLayer(nn.Module): ...@@ -385,8 +386,8 @@ class UniSpeechSatSamePadLayer(nn.Module):
return hidden_states return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->UniSpeechSat # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->UniSpeechSat
class UniSpeechSatFeatureExtractor(nn.Module): class UniSpeechSatFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform""" """Construct the features from raw audio waveform"""
def __init__(self, config): def __init__(self, config):
...@@ -440,6 +441,17 @@ class UniSpeechSatFeatureExtractor(nn.Module): ...@@ -440,6 +441,17 @@ class UniSpeechSatFeatureExtractor(nn.Module):
return hidden_states return hidden_states
class UniSpeechSatFeatureExtractor(UniSpeechSatFeatureEncoder):
def __init__(self, config):
super().__init__(config)
warnings.warn(
f"The class `{self.__class__.__name__}` has been depreciated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->UniSpeechSat # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->UniSpeechSat
class UniSpeechSatFeatureProjection(nn.Module): class UniSpeechSatFeatureProjection(nn.Module):
def __init__(self, config): def __init__(self, config):
...@@ -1014,7 +1026,7 @@ class UniSpeechSatPreTrainedModel(PreTrainedModel): ...@@ -1014,7 +1026,7 @@ class UniSpeechSatPreTrainedModel(PreTrainedModel):
return attention_mask return attention_mask
def _set_gradient_checkpointing(self, module, value=False): def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, (UniSpeechSatEncoder, UniSpeechSatEncoderStableLayerNorm, UniSpeechSatFeatureExtractor)): if isinstance(module, (UniSpeechSatEncoder, UniSpeechSatEncoderStableLayerNorm, UniSpeechSatFeatureEncoder)):
module.gradient_checkpointing = value module.gradient_checkpointing = value
...@@ -1084,7 +1096,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel): ...@@ -1084,7 +1096,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
def __init__(self, config: UniSpeechSatConfig): def __init__(self, config: UniSpeechSatConfig):
super().__init__(config) super().__init__(config)
self.config = config self.config = config
self.feature_extractor = UniSpeechSatFeatureExtractor(config) self.feature_extractor = UniSpeechSatFeatureEncoder(config)
self.feature_projection = UniSpeechSatFeatureProjection(config) self.feature_projection = UniSpeechSatFeatureProjection(config)
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_()) self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
...@@ -1232,10 +1244,22 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel): ...@@ -1232,10 +1244,22 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameters Calling this function will disable the gradient computation for the feature encoder so that its parameters will
will not be updated during training. not be updated during training.
""" """
self.unispeech_sat.feature_extractor._freeze_parameters() warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
self.wav2vec2.feature_extractor._freeze_parameters()
@staticmethod @staticmethod
def compute_contrastive_logits( def compute_contrastive_logits(
...@@ -1274,12 +1298,12 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel): ...@@ -1274,12 +1298,12 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
```python ```python
>>> import torch >>> import torch
>>> from transformers import UniSpeechSatFeatureExtractor, UniSpeechSatForPreTraining >>> from transformers import UniSpeechSatFeatureEncoder, UniSpeechSatForPreTraining
>>> from transformers.models.unispeech_sat.modeling_unispeech_sat import _compute_mask_indices >>> from transformers.models.unispeech_sat.modeling_unispeech_sat import _compute_mask_indices
>>> from datasets import load_dataset >>> from datasets import load_dataset
>>> import soundfile as sf >>> import soundfile as sf
>>> feature_extractor = UniSpeechSatFeatureExtractor.from_pretrained("patrickvonplaten/unispeech_sat-base") >>> feature_extractor = UniSpeechSatFeatureEncoder.from_pretrained("patrickvonplaten/unispeech_sat-base")
>>> model = UniSpeechSatForPreTraining.from_pretrained("patrickvonplaten/unispeech_sat-base") >>> model = UniSpeechSatForPreTraining.from_pretrained("patrickvonplaten/unispeech_sat-base")
...@@ -1383,8 +1407,20 @@ class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel): ...@@ -1383,8 +1407,20 @@ class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameter Calling this function will disable the gradient computation for the feature encoder so that its parameter will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.unispeech_sat.feature_extractor._freeze_parameters() self.unispeech_sat.feature_extractor._freeze_parameters()
...@@ -1492,8 +1528,20 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel): ...@@ -1492,8 +1528,20 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameters Calling this function will disable the gradient computation for the feature encoder so that its parameters will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.unispeech_sat.feature_extractor._freeze_parameters() self.unispeech_sat.feature_extractor._freeze_parameters()
...@@ -1596,8 +1644,20 @@ class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel): ...@@ -1596,8 +1644,20 @@ class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameters Calling this function will disable the gradient computation for the feature encoder so that its parameter will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.unispeech_sat.feature_extractor._freeze_parameters() self.unispeech_sat.feature_extractor._freeze_parameters()
...@@ -1745,8 +1805,20 @@ class UniSpeechSatForXVector(UniSpeechSatPreTrainedModel): ...@@ -1745,8 +1805,20 @@ class UniSpeechSatForXVector(UniSpeechSatPreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameters Calling this function will disable the gradient computation for the feature encoder so that its parameter will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.unispeech_sat.feature_extractor._freeze_parameters() self.unispeech_sat.feature_extractor._freeze_parameters()
......
...@@ -65,24 +65,24 @@ class Wav2Vec2Config(PretrainedConfig): ...@@ -65,24 +65,24 @@ class Wav2Vec2Config(PretrainedConfig):
layer_norm_eps (`float`, *optional*, defaults to 1e-12): layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers. The epsilon used by the layer normalization layers.
feat_extract_norm (`str`, *optional*, defaults to `"group"`): feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers. convolutional layers.
feat_proj_dropout (`float`, *optional*, defaults to 0.0): feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor. The dropout probability for output of the feature encoder.
feat_extract_activation (`str, `optional`, defaults to `"gelu"`): feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature The non-linear activation function (function or string) in the 1D convolutional layers of the feature
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for quantized feature extractor states. The dropout probabilitiy for quantized feature encoder states.
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers. feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of *conv_kernel* defines the number of convolutional layers and has to match the length of length of *conv_kernel* defines the number of convolutional layers and has to match the length of
*conv_dim*. *conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`): conv_bias (`bool`, *optional*, defaults to `False`):
...@@ -97,7 +97,7 @@ class Wav2Vec2Config(PretrainedConfig): ...@@ -97,7 +97,7 @@ class Wav2Vec2Config(PretrainedConfig):
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
False` corresponds to applying layer norm after the attention layer. False` corresponds to applying layer norm after the attention layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`): apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779). Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05): mask_time_prob (`float`, *optional*, defaults to 0.05):
...@@ -132,7 +132,7 @@ class Wav2Vec2Config(PretrainedConfig): ...@@ -132,7 +132,7 @@ class Wav2Vec2Config(PretrainedConfig):
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
The temperature *kappa* in the contrastive loss. The temperature *kappa* in the contrastive loss.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for the output of the feature extractor that's used by the quantizer. The dropout probabilitiy for the output of the feature encoder that's used by the quantizer.
num_negatives (`int`, *optional*, defaults to 100): num_negatives (`int`, *optional*, defaults to 100):
Number of negative samples for the contrastive loss. Number of negative samples for the contrastive loss.
codevector_dim (`int`, *optional*, defaults to 256): codevector_dim (`int`, *optional*, defaults to 256):
......
...@@ -395,7 +395,7 @@ class FlaxConvLayersCollection(nn.Module): ...@@ -395,7 +395,7 @@ class FlaxConvLayersCollection(nn.Module):
return hidden_states return hidden_states
class FlaxWav2Vec2FeatureExtractor(nn.Module): class FlaxWav2Vec2FeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform""" """Construct the features from raw audio waveform"""
config: Wav2Vec2Config config: Wav2Vec2Config
...@@ -849,7 +849,7 @@ class FlaxWav2Vec2Module(nn.Module): ...@@ -849,7 +849,7 @@ class FlaxWav2Vec2Module(nn.Module):
dtype: jnp.dtype = jnp.float32 dtype: jnp.dtype = jnp.float32
def setup(self): def setup(self):
self.feature_extractor = FlaxWav2Vec2FeatureExtractor(self.config, dtype=self.dtype) self.feature_extractor = FlaxWav2Vec2FeatureEncoder(self.config, dtype=self.dtype)
self.feature_projection = FlaxWav2Vec2FeatureProjection(self.config, dtype=self.dtype) self.feature_projection = FlaxWav2Vec2FeatureProjection(self.config, dtype=self.dtype)
self.masked_spec_embed = self.param( self.masked_spec_embed = self.param(
"masked_spec_embed", jax.nn.initializers.uniform(), (self.config.hidden_size,) "masked_spec_embed", jax.nn.initializers.uniform(), (self.config.hidden_size,)
......
...@@ -655,7 +655,7 @@ class TFWav2Vec2SamePadLayer(tf.keras.layers.Layer): ...@@ -655,7 +655,7 @@ class TFWav2Vec2SamePadLayer(tf.keras.layers.Layer):
return hidden_states return hidden_states
class TFWav2Vec2FeatureExtractor(tf.keras.layers.Layer): class TFWav2Vec2FeatureEncoder(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None: def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None:
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -682,6 +682,17 @@ class TFWav2Vec2FeatureExtractor(tf.keras.layers.Layer): ...@@ -682,6 +682,17 @@ class TFWav2Vec2FeatureExtractor(tf.keras.layers.Layer):
return hidden_states return hidden_states
class TFWav2Vec2FeatureExtractor(TFWav2Vec2FeatureEncoder):
def __init__(self, config, **kwargs):
super().__init__(config, **kwargs)
warnings.warn(
f"The class `{self.__class__.__name__}` has been depreciated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
class TFWav2Vec2FeatureProjection(tf.keras.layers.Layer): class TFWav2Vec2FeatureProjection(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs): def __init__(self, config: Wav2Vec2Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -1107,7 +1118,7 @@ class TFWav2Vec2MainLayer(tf.keras.layers.Layer): ...@@ -1107,7 +1118,7 @@ class TFWav2Vec2MainLayer(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs): def __init__(self, config: Wav2Vec2Config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.feature_extractor = TFWav2Vec2FeatureExtractor(config, name="feature_extractor") self.feature_extractor = TFWav2Vec2FeatureEncoder(config, name="feature_extractor")
self.feature_projection = TFWav2Vec2FeatureProjection(config, name="feature_projection") self.feature_projection = TFWav2Vec2FeatureProjection(config, name="feature_projection")
if config.do_stable_layer_norm: if config.do_stable_layer_norm:
...@@ -1481,8 +1492,20 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel): ...@@ -1481,8 +1492,20 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
def freeze_feature_extractor(self): def freeze_feature_extractor(self):
""" """
Calling this function will disable the gradient computation for the feature extractor so that its parameter Calling this function will disable the gradient computation for the feature encoder so that its parameters will
will not be updated during training. not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
""" """
self.wav2vec2.feature_extractor.trainable = False self.wav2vec2.feature_extractor.trainable = False
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment