Unverified Commit 600496fa authored by Patrick von Platen's avatar Patrick von Platen Committed by GitHub
Browse files

[Wav2Vec2] Rename model's feature extractor to feature encoder (#14959)

* rename classes

* clean up more namings

* remove bogus file

* Apply suggestions from code review

* Apply suggestions from code review

* replace more names

* more regex replace

* make style

* correct

* correct more

* make style

* finish

* correct more in wav2vec2

* make style

* improve freeze_extractor

* add aliases

* add tf aliases
parent 1bfa3477
......@@ -17,6 +17,7 @@
import logging
import os
import sys
import warnings
from dataclasses import dataclass, field
from random import randint
from typing import Optional
......@@ -76,24 +77,24 @@ class DataTrainingArguments:
eval_file: Optional[str] = field(
default=None, metadata={"help": "A file containing the validation audio paths and labels."}
)
train_split_name: Optional[str] = field(
train_split_name: str = field(
default="train",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
eval_split_name: Optional[str] = field(
eval_split_name: str = field(
default="validation",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to "
"'validation'"
},
)
audio_column_name: Optional[str] = field(
audio_column_name: str = field(
default="audio",
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
)
label_column_name: Optional[str] = field(
label_column_name: str = field(
default="label", metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'"}
)
max_train_samples: Optional[int] = field(
......@@ -110,7 +111,7 @@ class DataTrainingArguments:
"value if set."
},
)
max_length_seconds: Optional[float] = field(
max_length_seconds: float = field(
default=20,
metadata={"help": "Audio clips will be randomly cut to this length during training if the value is set."},
)
......@@ -136,11 +137,13 @@ class ModelArguments:
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
freeze_feature_extractor: Optional[bool] = field(
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
feature_extractor_name: Optional[str] = field(
default=None, metadata={"help": "Name or path of preprocessor config."}
)
freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
)
attention_mask: Optional[bool] = field(
attention_mask: bool = field(
default=True, metadata={"help": "Whether to generate an attention mask in the feature extractor."}
)
use_auth_token: bool = field(
......@@ -150,6 +153,24 @@ class ModelArguments:
"with private models)."
},
)
freeze_feature_extractor: Optional[bool] = field(
default=None, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
)
def __post_init__(self):
    """
    Validate the deprecated `--freeze_feature_extractor` flag against `--freeze_feature_encoder`.

    `freeze_feature_extractor` defaults to `None`, which means the deprecated flag was not supplied; in that case
    nothing is checked and no warning is emitted.

    Raises:
        ValueError: If `freeze_feature_extractor` is truthy while `freeze_feature_encoder` is falsy, i.e. the
            deprecated flag contradicts the new one.
    """
    # Only react when the deprecated flag was actually passed. Without this guard the default
    # (`None`) is falsy and the deprecation warning would fire on every run.
    if self.freeze_feature_extractor is None:
        return

    if not self.freeze_feature_extractor and self.freeze_feature_encoder:
        warnings.warn(
            "The argument `--freeze_feature_extractor` is deprecated and "
            "will be removed in a future version. Use `--freeze_feature_encoder` "
            "instead. Setting `freeze_feature_encoder==True`.",
            FutureWarning,
        )

    if self.freeze_feature_extractor and not self.freeze_feature_encoder:
        raise ValueError(
            "The argument `--freeze_feature_extractor` is deprecated and "
            "should not be used in combination with `--freeze_feature_encoder`. "
            "Only make use of `--freeze_feature_encoder`."
        )
def main():
......@@ -302,8 +323,8 @@ def main():
)
# freeze the convolutional waveform encoder
if model_args.freeze_feature_extractor:
model.freeze_feature_extractor()
if model_args.freeze_feature_encoder:
model.freeze_feature_encoder()
if training_args.do_train:
if data_args.max_train_samples is not None:
......
......@@ -78,7 +78,7 @@ python run_speech_recognition_ctc.py \
--eval_steps="100" \
--layerdrop="0.0" \
--save_total_limit="3" \
--freeze_feature_extractor \
--freeze_feature_encoder \
--gradient_checkpointing \
--chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
--fp16 \
......@@ -113,7 +113,7 @@ python -m torch.distributed.launch \
--logging_steps="1" \
--layerdrop="0.0" \
--save_total_limit="3" \
--freeze_feature_extractor \
--freeze_feature_encoder \
--gradient_checkpointing \
--chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
--fp16 \
......@@ -304,7 +304,7 @@ python run_speech_recognition_seq2seq.py \
--eval_steps="400" \
--logging_steps="10" \
--save_total_limit="1" \
--freeze_feature_extractor \
--freeze_feature_encoder \
--gradient_checkpointing \
--fp16 \
--group_by_length \
......@@ -346,7 +346,7 @@ python -m torch.distributed.launch \
--eval_steps="400" \
--logging_steps="10" \
--save_total_limit="1" \
--freeze_feature_extractor \
--freeze_feature_encoder \
--gradient_checkpointing \
--fp16 \
--group_by_length \
......
......@@ -78,29 +78,27 @@ class ModelArguments:
default=None,
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
freeze_feature_extractor: Optional[bool] = field(
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
)
attention_dropout: Optional[float] = field(
attention_dropout: float = field(
default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
)
activation_dropout: Optional[float] = field(
activation_dropout: float = field(
default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
)
feat_proj_dropout: Optional[float] = field(
default=0.0, metadata={"help": "The dropout ratio for the projected features."}
)
hidden_dropout: Optional[float] = field(
feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
hidden_dropout: float = field(
default=0.0,
metadata={
"help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
},
)
final_dropout: Optional[float] = field(
final_dropout: float = field(
default=0.0,
metadata={"help": "The dropout probability for the final projection layer."},
)
mask_time_prob: Optional[float] = field(
mask_time_prob: float = field(
default=0.05,
metadata={
"help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
......@@ -108,22 +106,22 @@ class ModelArguments:
"vectors will be masked along the time axis."
},
)
mask_time_length: Optional[int] = field(
mask_time_length: int = field(
default=10,
metadata={"help": "Length of vector span to mask along the time axis."},
)
mask_feature_prob: Optional[float] = field(
mask_feature_prob: float = field(
default=0.0,
metadata={
"help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
},
)
mask_feature_length: Optional[int] = field(
mask_feature_length: int = field(
default=10,
metadata={"help": "Length of vector span to mask along the feature axis."},
)
layerdrop: Optional[float] = field(default=0.0, metadata={"help": "The LayerDrop probability."})
layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
ctc_loss_reduction: Optional[str] = field(
default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
)
......@@ -142,26 +140,26 @@ class DataTrainingArguments:
dataset_name: str = field(
metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
dataset_config_name: Optional[str] = field(
dataset_config_name: str = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
train_split_name: Optional[str] = field(
train_split_name: str = field(
default="train+validation",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
eval_split_name: Optional[str] = field(
eval_split_name: str = field(
default="test",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
audio_column_name: Optional[str] = field(
audio_column_name: str = field(
default="audio",
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
)
text_column_name: Optional[str] = field(
text_column_name: str = field(
default="text",
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
)
......@@ -190,20 +188,20 @@ class DataTrainingArguments:
default=None,
metadata={"help": "A list of characters to remove from the transcripts."},
)
eval_metrics: Optional[List[str]] = list_field(
eval_metrics: List[str] = list_field(
default=["wer"],
metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
)
max_duration_in_seconds: Optional[float] = field(
max_duration_in_seconds: float = field(
default=20.0,
metadata={
"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
},
)
min_duration_in_seconds: Optional[float] = field(
min_duration_in_seconds: float = field(
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
)
preprocessing_only: Optional[bool] = field(
preprocessing_only: bool = field(
default=False,
metadata={
"help": "Whether to only do data preprocessing and skip training. "
......@@ -212,22 +210,22 @@ class DataTrainingArguments:
"so that the cached datasets can consequently be loaded in distributed training"
},
)
use_auth_token: Optional[bool] = field(
use_auth_token: bool = field(
default=False,
metadata={
"help": "If :obj:`True`, will use the token generated when running"
":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
},
)
unk_token: Optional[str] = field(
unk_token: str = field(
default="[UNK]",
metadata={"help": "The unk token for the tokenizer"},
)
pad_token: Optional[str] = field(
pad_token: str = field(
default="[PAD]",
metadata={"help": "The padding token for the tokenizer"},
)
word_delimiter_token: Optional[str] = field(
word_delimiter_token: str = field(
default="|",
metadata={"help": "The word delimiter token for the tokenizer"},
)
......@@ -545,8 +543,8 @@ def main():
)
# freeze encoder
if model_args.freeze_feature_extractor:
model.freeze_feature_extractor()
if model_args.freeze_feature_encoder:
model.freeze_feature_encoder()
# 6. Now we preprocess the datasets including loading the audio, resampling and normalization
# Thankfully, `datasets` takes care of automatically loading and resampling the audio,
......
......@@ -91,8 +91,8 @@ class ModelArguments:
"with private models)."
},
)
freeze_feature_extractor: Optional[bool] = field(
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
)
......@@ -102,7 +102,7 @@ class DataTrainingArguments:
Arguments pertaining to what data we are going to input our model for training and eval.
"""
dataset_name: Optional[str] = field(
dataset_name: str = field(
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
)
dataset_config_name: Optional[str] = field(
......@@ -133,24 +133,24 @@ class DataTrainingArguments:
"value if set."
},
)
audio_column_name: Optional[str] = field(
audio_column_name: str = field(
default="audio",
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
)
text_column_name: Optional[str] = field(
text_column_name: str = field(
default="text",
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
)
max_duration_in_seconds: Optional[float] = field(
max_duration_in_seconds: float = field(
default=20.0,
metadata={
"help": "Truncate audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
},
)
min_duration_in_seconds: Optional[float] = field(
min_duration_in_seconds: float = field(
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
)
preprocessing_only: Optional[bool] = field(
preprocessing_only: bool = field(
default=False,
metadata={
"help": "Whether to only do data preprocessing and skip training. "
......@@ -159,19 +159,19 @@ class DataTrainingArguments:
"so that the cached datasets can consequently be loaded in distributed training"
},
)
train_split_name: Optional[str] = field(
train_split_name: str = field(
default="train",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
eval_split_name: Optional[str] = field(
eval_split_name: str = field(
default="test",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
do_lower_case: Optional[bool] = field(
do_lower_case: bool = field(
default=True,
metadata={"help": "Whether the target text should be lower cased."},
)
......@@ -335,8 +335,8 @@ def main():
if model.config.decoder_start_token_id is None:
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
if model_args.freeze_feature_extractor:
model.freeze_feature_extractor()
if model_args.freeze_feature_encoder:
model.freeze_feature_encoder()
# 6. Resample speech dataset if necessary
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
......
......@@ -64,24 +64,24 @@ class HubertConfig(PretrainedConfig):
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers.
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor.
The dropout probability for output of the feature encoder.
feat_proj_layer_norm (`bool`, *optional*, defaults to `True`):
Whether to apply LayerNorm to the output of the feature extractor.
Whether to apply LayerNorm to the output of the feature encoder.
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
encoder. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
*conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`):
......@@ -96,7 +96,7 @@ class HubertConfig(PretrainedConfig):
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
False` corresponds to applying layer norm after the attention layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05):
......
......@@ -14,6 +14,7 @@
# limitations under the License.
""" PyTorch Hubert model."""
import warnings
from typing import Optional, Tuple, Union
import numpy as np
......@@ -284,8 +285,8 @@ class HubertSamePadLayer(nn.Module):
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->Hubert
class HubertFeatureExtractor(nn.Module):
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->Hubert
class HubertFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform"""
def __init__(self, config):
......@@ -336,6 +337,17 @@ class HubertFeatureExtractor(nn.Module):
return hidden_states
class HubertFeatureExtractor(HubertFeatureEncoder):
    """
    Deprecated alias of `HubertFeatureEncoder`.

    Construction is forwarded unchanged to `HubertFeatureEncoder`; the only addition is a `FutureWarning` telling
    callers to switch to the new class name.
    """

    def __init__(self, config):
        super().__init__(config)
        # The names are spelled out instead of being derived from `self.__class__`: with the dynamic lookup a
        # subclass of this alias would be reported as the deprecated class and users would be told to use the
        # alias itself.
        warnings.warn(
            "The class `HubertFeatureExtractor` has been deprecated "
            "and will be removed in Transformers v5. "
            "Use `HubertFeatureEncoder` instead.",
            FutureWarning,
        )
class HubertFeatureProjection(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -902,7 +914,7 @@ class HubertModel(HubertPreTrainedModel):
def __init__(self, config: HubertConfig):
super().__init__(config)
self.config = config
self.feature_extractor = HubertFeatureExtractor(config)
self.feature_extractor = HubertFeatureEncoder(config)
self.feature_projection = HubertFeatureProjection(config)
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
......@@ -1063,8 +1075,20 @@ class HubertForCTC(HubertPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Deprecated alias of `freeze_feature_encoder`.

    Calling this function will disable the gradient computation for the feature encoder so that its parameters
    will not be updated during training. It emits a `FutureWarning` and then delegates to
    `freeze_feature_encoder`.
    """
    # Adjacent string literals are concatenated verbatim, so the separating space must be part of the first one.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Disable gradient computation for the feature encoder so that its parameters are not updated during
    training.
    """
    # The encoder module is stored on the base model under the attribute name `feature_extractor`.
    feature_encoder = self.hubert.feature_extractor
    feature_encoder._freeze_parameters()
......@@ -1172,8 +1196,20 @@ class HubertForSequenceClassification(HubertPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Deprecated alias of `freeze_feature_encoder`.

    Calling this function will disable the gradient computation for the feature encoder so that its parameters
    will not be updated during training. It emits a `FutureWarning` and then delegates to
    `freeze_feature_encoder`.
    """
    # Adjacent string literals are concatenated verbatim, so the separating space must be part of the first one.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Disable gradient computation for the feature encoder so that its parameters are not updated during
    training.
    """
    # The encoder module is stored on the base model under the attribute name `feature_extractor`.
    feature_encoder = self.hubert.feature_extractor
    feature_encoder._freeze_parameters()
......
......@@ -659,7 +659,7 @@ class TFHubertSamePadLayer(tf.keras.layers.Layer):
return hidden_states
class TFHubertFeatureExtractor(tf.keras.layers.Layer):
class TFHubertFeatureEncoder(tf.keras.layers.Layer):
def __init__(self, config: HubertConfig, **kwargs: Any) -> None:
super().__init__(**kwargs)
......@@ -686,6 +686,17 @@ class TFHubertFeatureExtractor(tf.keras.layers.Layer):
return hidden_states
class TFHubertFeatureExtractor(TFHubertFeatureEncoder):
    """
    Deprecated alias of `TFHubertFeatureEncoder`.

    Construction (including any extra keyword arguments) is forwarded unchanged to `TFHubertFeatureEncoder`; the
    only addition is a `FutureWarning` telling callers to switch to the new class name.
    """

    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        # The names are spelled out instead of being derived from `self.__class__`: with the dynamic lookup a
        # subclass of this alias would be reported as the deprecated class and users would be told to use the
        # alias itself.
        warnings.warn(
            "The class `TFHubertFeatureExtractor` has been deprecated "
            "and will be removed in Transformers v5. "
            "Use `TFHubertFeatureEncoder` instead.",
            FutureWarning,
        )
class TFHubertFeatureProjection(tf.keras.layers.Layer):
def __init__(self, config: HubertConfig, **kwargs):
super().__init__(**kwargs)
......@@ -1116,7 +1127,7 @@ class TFHubertMainLayer(tf.keras.layers.Layer):
def __init__(self, config: HubertConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.feature_extractor = TFHubertFeatureExtractor(config, name="feature_extractor")
self.feature_extractor = TFHubertFeatureEncoder(config, name="feature_extractor")
self.feature_projection = TFHubertFeatureProjection(config, name="feature_projection")
if config.do_stable_layer_norm:
......@@ -1490,8 +1501,20 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Deprecated alias of `freeze_feature_encoder`.

    Calling this function will disable the gradient computation for the feature encoder so that its parameters
    will not be updated during training. It emits a `FutureWarning` and then delegates to
    `freeze_feature_encoder`.
    """
    # Adjacent string literals are concatenated verbatim, so the separating space must be part of the first one.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Disable gradient computation for the feature encoder so that its parameters are not updated during
    training.
    """
    # The encoder layer is stored on the main layer under the attribute name `feature_extractor`;
    # it is frozen by clearing its `trainable` flag.
    feature_encoder = self.hubert.feature_extractor
    feature_encoder.trainable = False
......
......@@ -65,22 +65,22 @@ class SEWConfig(PretrainedConfig):
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers.
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor.
The dropout probability for output of the feature encoder.
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
encoder. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
*conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`):
......@@ -91,7 +91,7 @@ class SEWConfig(PretrainedConfig):
num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
Number of groups of 1D convolutional positional embeddings layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05):
......
......@@ -15,6 +15,7 @@
""" PyTorch SEW model."""
import math
import warnings
from typing import Optional, Tuple, Union
import numpy as np
......@@ -301,8 +302,8 @@ class SEWUpsampling(nn.Module):
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->SEW
class SEWFeatureExtractor(nn.Module):
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->SEW
class SEWFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform"""
def __init__(self, config):
......@@ -353,6 +354,17 @@ class SEWFeatureExtractor(nn.Module):
return hidden_states
class SEWFeatureExtractor(SEWFeatureEncoder):
    """
    Deprecated alias of `SEWFeatureEncoder`.

    Construction is forwarded unchanged to `SEWFeatureEncoder`; the only addition is a `FutureWarning` telling
    callers to switch to the new class name.
    """

    def __init__(self, config):
        super().__init__(config)
        # The names are spelled out instead of being derived from `self.__class__`: with the dynamic lookup a
        # subclass of this alias would be reported as the deprecated class and users would be told to use the
        # alias itself.
        warnings.warn(
            "The class `SEWFeatureExtractor` has been deprecated "
            "and will be removed in Transformers v5. "
            "Use `SEWFeatureEncoder` instead.",
            FutureWarning,
        )
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->SEW
class SEWAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
......@@ -712,7 +724,7 @@ class SEWPreTrainedModel(PreTrainedModel):
module.bias.data.zero_()
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, (SEWEncoder, SEWFeatureExtractor)):
if isinstance(module, (SEWEncoder, SEWFeatureEncoder)):
module.gradient_checkpointing = value
def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
......@@ -797,7 +809,7 @@ class SEWModel(SEWPreTrainedModel):
def __init__(self, config: SEWConfig):
super().__init__(config)
self.config = config
self.feature_extractor = SEWFeatureExtractor(config)
self.feature_extractor = SEWFeatureEncoder(config)
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
self.project_features = config.conv_dim[-1] != config.hidden_size
......@@ -943,8 +955,20 @@ class SEWForCTC(SEWPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Deprecated alias of `freeze_feature_encoder`.

    Calling this function will disable the gradient computation for the feature encoder so that its parameters
    will not be updated during training. It emits a `FutureWarning` and then delegates to
    `freeze_feature_encoder`.
    """
    # Adjacent string literals are concatenated verbatim, so the separating space must be part of the first one.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Disable gradient computation for the feature encoder so that its parameters are not updated during
    training.
    """
    # The encoder module is stored on the base model under the attribute name `feature_extractor`.
    feature_encoder = self.sew.feature_extractor
    feature_encoder._freeze_parameters()
......@@ -1052,8 +1076,20 @@ class SEWForSequenceClassification(SEWPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Deprecated alias of `freeze_feature_encoder`.

    Calling this function will disable the gradient computation for the feature encoder so that its parameters
    will not be updated during training. It emits a `FutureWarning` and then delegates to
    `freeze_feature_encoder`.
    """
    # Adjacent string literals are concatenated verbatim, so the separating space must be part of the first one.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Disable gradient computation for the feature encoder so that its parameters are not updated during
    training.
    """
    # The encoder module is stored on the base model under the attribute name `feature_extractor`.
    feature_encoder = self.sew.feature_extractor
    feature_encoder._freeze_parameters()
......
......@@ -81,24 +81,24 @@ class SEWDConfig(PretrainedConfig):
layer_norm_eps (`float`, *optional*, defaults to 1e-7):
The epsilon used by the layer normalization layers in the transformer encoder.
feature_layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization after the feature extractor.
The epsilon used by the layer normalization after the feature encoder.
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers.
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor.
The dropout probability for output of the feature encoder.
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
encoder. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
*conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`):
......@@ -109,7 +109,7 @@ class SEWDConfig(PretrainedConfig):
num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
Number of groups of 1D convolutional positional embeddings layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05):
......
......@@ -15,6 +15,7 @@
""" PyTorch SEW-D model."""
import math
import warnings
from collections.abc import Sequence
from typing import Optional, Tuple, Union
......@@ -387,8 +388,8 @@ class SEWDUpsampling(nn.Module):
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->SEWD
class SEWDFeatureExtractor(nn.Module):
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->SEWD
class SEWDFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform"""
def __init__(self, config):
......@@ -439,6 +440,17 @@ class SEWDFeatureExtractor(nn.Module):
return hidden_states
class SEWDFeatureExtractor(SEWDFeatureEncoder):
    """Deprecated alias of `SEWDFeatureEncoder`; instantiating it emits a `FutureWarning`."""

    def __init__(self, config):
        super().__init__(config)
        # The class names in the message are resolved at runtime from the instance's
        # class and that class's first base.
        warnings.warn(
            f"The class `{self.__class__.__name__}` has been deprecated "
            "and will be removed in Transformers v5. "
            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
            FutureWarning,
        )
# Copied from transformers.models.deberta.modeling_deberta.ContextPooler
class ContextPooler(nn.Module):
def __init__(self, config):
......@@ -1333,7 +1345,7 @@ class SEWDModel(SEWDPreTrainedModel):
def __init__(self, config: SEWDConfig):
super().__init__(config)
self.config = config
self.feature_extractor = SEWDFeatureExtractor(config)
self.feature_extractor = SEWDFeatureEncoder(config)
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.feature_layer_norm_eps)
self.project_features = config.conv_dim[-1] != config.hidden_size
......@@ -1479,8 +1491,20 @@ class SEWDForCTC(SEWDPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameters will
    not be updated during training.

    Deprecated: emits a `FutureWarning` and delegates to `freeze_feature_encoder`.
    """
    # The trailing space after "v5." keeps the two concatenated sentences apart.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Freeze the feature encoder by calling `_freeze_parameters()` on `self.sew_d.feature_extractor`, so that its
    parameters will not be updated during training.
    """
    feature_encoder = self.sew_d.feature_extractor
    feature_encoder._freeze_parameters()
......@@ -1588,8 +1612,20 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameters will
    not be updated during training.

    Deprecated: emits a `FutureWarning` and delegates to `freeze_feature_encoder`.
    """
    # The trailing space after "v5." keeps the two concatenated sentences apart.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Freeze the feature encoder by calling `_freeze_parameters()` on `self.sew_d.feature_extractor`, so that its
    parameters will not be updated during training.
    """
    feature_encoder = self.sew_d.feature_extractor
    feature_encoder._freeze_parameters()
......
......@@ -265,12 +265,12 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
def set_output_embeddings(self, new_embeddings):
    """Forward `new_embeddings` to the decoder's `set_output_embeddings` and return its result."""
    decoder = self.decoder
    return decoder.set_output_embeddings(new_embeddings)
def freeze_feature_encoder(self):
    """
    Calling this function will disable the gradient computation for the feature encoder of the speech encoder so
    that its parameters will not be updated during training.
    """
    self.encoder.freeze_feature_encoder()

def freeze_feature_extractor(self):
    """
    Deprecated alias of `freeze_feature_encoder`, kept so that existing callers of the old name keep working.

    Emits a `FutureWarning` and delegates to `freeze_feature_encoder`.
    """
    # Imported locally so the alias does not depend on a module-level `warnings` import.
    import warnings

    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
@classmethod
def from_pretrained(cls, *args, **kwargs):
......
......@@ -65,24 +65,24 @@ class UniSpeechConfig(PretrainedConfig):
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers.
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor.
The dropout probability for output of the feature encoder.
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
encoder. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for quantized feature extractor states.
The dropout probability for quantized feature encoder states.
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
*conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`):
......@@ -97,7 +97,7 @@ class UniSpeechConfig(PretrainedConfig):
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
False` corresponds to applying layer norm after the attention layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05):
......@@ -132,7 +132,7 @@ class UniSpeechConfig(PretrainedConfig):
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
The temperature *kappa* in the contrastive loss.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for the output of the feature extractor that's used by the quantizer.
The dropout probability for the output of the feature encoder that's used by the quantizer.
num_negatives (`int`, *optional*, defaults to 100):
Number of negative samples for the contrastive loss.
codevector_dim (`int`, *optional*, defaults to 256):
......
......@@ -15,6 +15,7 @@
""" PyTorch UniSpeech model."""
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union
......@@ -351,8 +352,8 @@ class UniSpeechSamePadLayer(nn.Module):
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->UniSpeech
class UniSpeechFeatureExtractor(nn.Module):
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->UniSpeech
class UniSpeechFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform"""
def __init__(self, config):
......@@ -406,6 +407,17 @@ class UniSpeechFeatureExtractor(nn.Module):
return hidden_states
class UniSpeechFeatureExtractor(UniSpeechFeatureEncoder):
    """Deprecated alias of `UniSpeechFeatureEncoder`; instantiating it emits a `FutureWarning`."""

    def __init__(self, config):
        super().__init__(config)
        # The class names in the message are resolved at runtime from the instance's
        # class and that class's first base.
        warnings.warn(
            f"The class `{self.__class__.__name__}` has been deprecated "
            "and will be removed in Transformers v5. "
            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
            FutureWarning,
        )
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->UniSpeech
class UniSpeechFeatureProjection(nn.Module):
def __init__(self, config):
......@@ -980,7 +992,7 @@ class UniSpeechPreTrainedModel(PreTrainedModel):
return attention_mask
def _set_gradient_checkpointing(self, module, value=False):
    """Set `gradient_checkpointing` to `value` on encoder and feature-encoder modules; leave other modules alone."""
    checkpointable_types = (UniSpeechEncoder, UniSpeechEncoderStableLayerNorm, UniSpeechFeatureEncoder)
    if isinstance(module, checkpointable_types):
        module.gradient_checkpointing = value
......@@ -1049,7 +1061,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
def __init__(self, config: UniSpeechConfig):
super().__init__(config)
self.config = config
self.feature_extractor = UniSpeechFeatureExtractor(config)
self.feature_extractor = UniSpeechFeatureEncoder(config)
self.feature_projection = UniSpeechFeatureProjection(config)
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
......@@ -1193,8 +1205,20 @@ class UniSpeechForPreTraining(UniSpeechPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameters will
    not be updated during training.

    Deprecated: emits a `FutureWarning` and delegates to `freeze_feature_encoder`.
    """
    # The trailing space after "v5." keeps the two concatenated sentences apart.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Freeze the feature encoder by calling `_freeze_parameters()` on `self.unispeech.feature_extractor`, so that its
    parameters will not be updated during training.
    """
    feature_encoder = self.unispeech.feature_extractor
    feature_encoder._freeze_parameters()
......@@ -1358,8 +1382,20 @@ class UniSpeechForCTC(UniSpeechPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameters will
    not be updated during training.

    Deprecated: emits a `FutureWarning` and delegates to `freeze_feature_encoder`.
    """
    # The trailing space after "v5." keeps the two concatenated sentences apart.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Freeze the feature encoder by calling `_freeze_parameters()` on `self.unispeech.feature_extractor`, so that its
    parameters will not be updated during training.
    """
    feature_encoder = self.unispeech.feature_extractor
    feature_encoder._freeze_parameters()
......@@ -1467,8 +1503,20 @@ class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameters will
    not be updated during training.

    Deprecated: emits a `FutureWarning` and delegates to `freeze_feature_encoder`.
    """
    # The trailing space after "v5." keeps the two concatenated sentences apart.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Freeze the feature encoder by calling `_freeze_parameters()` on `self.unispeech.feature_extractor`, so that its
    parameters will not be updated during training.
    """
    feature_encoder = self.unispeech.feature_extractor
    feature_encoder._freeze_parameters()
......
......@@ -65,24 +65,24 @@ class UniSpeechSatConfig(PretrainedConfig):
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers.
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor.
The dropout probability for output of the feature encoder.
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
encoder. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for quantized feature extractor states.
The dropout probability for quantized feature encoder states.
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
*conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`):
......@@ -97,7 +97,7 @@ class UniSpeechSatConfig(PretrainedConfig):
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
False` corresponds to applying layer norm after the attention layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05):
......@@ -132,7 +132,7 @@ class UniSpeechSatConfig(PretrainedConfig):
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
The temperature *kappa* in the contrastive loss.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for the output of the feature extractor that's used by the quantizer.
The dropout probability for the output of the feature encoder that's used by the quantizer.
num_negatives (`int`, *optional*, defaults to 100):
Number of negative samples for the contrastive loss.
codevector_dim (`int`, *optional*, defaults to 256):
......
......@@ -20,12 +20,7 @@ import argparse
import fairseq
import torch
from transformers import ( # UniSpeechSatCTCTokenizer,; UniSpeechSatFeatureExtractor,; UniSpeechSatProcessor,
UniSpeechSatConfig,
UniSpeechSatForCTC,
UniSpeechSatForPreTraining,
logging,
)
from transformers import UniSpeechSatConfig, UniSpeechSatForCTC, UniSpeechSatForPreTraining, logging
logging.set_verbosity_info()
......
......@@ -15,6 +15,7 @@
""" PyTorch UniSpeechSat model."""
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union
......@@ -385,8 +386,8 @@ class UniSpeechSatSamePadLayer(nn.Module):
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->UniSpeechSat
class UniSpeechSatFeatureExtractor(nn.Module):
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->UniSpeechSat
class UniSpeechSatFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform"""
def __init__(self, config):
......@@ -440,6 +441,17 @@ class UniSpeechSatFeatureExtractor(nn.Module):
return hidden_states
class UniSpeechSatFeatureExtractor(UniSpeechSatFeatureEncoder):
    """Deprecated alias of `UniSpeechSatFeatureEncoder`; instantiating it emits a `FutureWarning`."""

    def __init__(self, config):
        super().__init__(config)
        # The class names in the message are resolved at runtime from the instance's
        # class and that class's first base.
        warnings.warn(
            f"The class `{self.__class__.__name__}` has been deprecated "
            "and will be removed in Transformers v5. "
            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
            FutureWarning,
        )
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->UniSpeechSat
class UniSpeechSatFeatureProjection(nn.Module):
def __init__(self, config):
......@@ -1014,7 +1026,7 @@ class UniSpeechSatPreTrainedModel(PreTrainedModel):
return attention_mask
def _set_gradient_checkpointing(self, module, value=False):
    """Set `gradient_checkpointing` to `value` on encoder and feature-encoder modules; leave other modules alone."""
    checkpointable_types = (UniSpeechSatEncoder, UniSpeechSatEncoderStableLayerNorm, UniSpeechSatFeatureEncoder)
    if isinstance(module, checkpointable_types):
        module.gradient_checkpointing = value
......@@ -1084,7 +1096,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
def __init__(self, config: UniSpeechSatConfig):
super().__init__(config)
self.config = config
self.feature_extractor = UniSpeechSatFeatureExtractor(config)
self.feature_extractor = UniSpeechSatFeatureEncoder(config)
self.feature_projection = UniSpeechSatFeatureProjection(config)
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
......@@ -1232,10 +1244,22 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameters will
    not be updated during training.

    Deprecated: emits a `FutureWarning` and delegates to `freeze_feature_encoder`.
    """
    # The trailing space after "v5." keeps the two concatenated sentences apart.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameters will
    not be updated during training.
    """
    # This model stores its base model under `unispeech_sat` (the attribute the
    # pre-rename implementation used), not `wav2vec2`.
    self.unispeech_sat.feature_extractor._freeze_parameters()
@staticmethod
def compute_contrastive_logits(
......@@ -1274,12 +1298,12 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
```python
>>> import torch
>>> from transformers import UniSpeechSatFeatureExtractor, UniSpeechSatForPreTraining
>>> from transformers import Wav2Vec2FeatureExtractor, UniSpeechSatForPreTraining
>>> from transformers.models.unispeech_sat.modeling_unispeech_sat import _compute_mask_indices
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> feature_extractor = UniSpeechSatFeatureExtractor.from_pretrained("patrickvonplaten/unispeech_sat-base")
>>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("patrickvonplaten/unispeech_sat-base")
>>> model = UniSpeechSatForPreTraining.from_pretrained("patrickvonplaten/unispeech_sat-base")
......@@ -1383,8 +1407,20 @@ class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameters will
    not be updated during training.

    Deprecated: emits a `FutureWarning` and delegates to `freeze_feature_encoder`.
    """
    # The trailing space after "v5." keeps the two concatenated sentences apart.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Freeze the feature encoder by calling `_freeze_parameters()` on `self.unispeech_sat.feature_extractor`, so that
    its parameters will not be updated during training.
    """
    feature_encoder = self.unispeech_sat.feature_extractor
    feature_encoder._freeze_parameters()
......@@ -1492,8 +1528,20 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameters will
    not be updated during training.

    Deprecated: emits a `FutureWarning` and delegates to `freeze_feature_encoder`.
    """
    # The trailing space after "v5." keeps the two concatenated sentences apart.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Freeze the feature encoder by calling `_freeze_parameters()` on `self.unispeech_sat.feature_extractor`, so that
    its parameters will not be updated during training.
    """
    feature_encoder = self.unispeech_sat.feature_extractor
    feature_encoder._freeze_parameters()
......@@ -1596,8 +1644,20 @@ class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameters will
    not be updated during training.

    Deprecated: emits a `FutureWarning` and delegates to `freeze_feature_encoder`.
    """
    # The trailing space after "v5." keeps the two concatenated sentences apart.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Freeze the feature encoder by calling `_freeze_parameters()` on `self.unispeech_sat.feature_extractor`, so that
    its parameters will not be updated during training.
    """
    feature_encoder = self.unispeech_sat.feature_extractor
    feature_encoder._freeze_parameters()
......@@ -1745,8 +1805,20 @@ class UniSpeechSatForXVector(UniSpeechSatPreTrainedModel):
def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameters will
    not be updated during training.

    Deprecated: emits a `FutureWarning` and delegates to `freeze_feature_encoder`.
    """
    # The trailing space after "v5." keeps the two concatenated sentences apart.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Freeze the feature encoder by calling `_freeze_parameters()` on `self.unispeech_sat.feature_extractor`, so that
    its parameters will not be updated during training.
    """
    feature_encoder = self.unispeech_sat.feature_extractor
    feature_encoder._freeze_parameters()
......
......@@ -65,24 +65,24 @@ class Wav2Vec2Config(PretrainedConfig):
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
convolutional layers.
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for output of the feature extractor.
The dropout probability for output of the feature encoder.
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
encoder. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for quantized feature extractor states.
The dropout probability for quantized feature encoder states.
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
*conv_dim*.
conv_bias (`bool`, *optional*, defaults to `False`):
......@@ -97,7 +97,7 @@ class Wav2Vec2Config(PretrainedConfig):
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
False` corresponds to applying layer norm after the attention layer.
apply_spec_augment (`bool`, *optional*, defaults to `True`):
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition](https://arxiv.org/abs/1904.08779).
mask_time_prob (`float`, *optional*, defaults to 0.05):
......@@ -132,7 +132,7 @@ class Wav2Vec2Config(PretrainedConfig):
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
The temperature *kappa* in the contrastive loss.
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for the output of the feature extractor that's used by the quantizer.
The dropout probability for the output of the feature encoder that's used by the quantizer.
num_negatives (`int`, *optional*, defaults to 100):
Number of negative samples for the contrastive loss.
codevector_dim (`int`, *optional*, defaults to 256):
......
......@@ -395,7 +395,7 @@ class FlaxConvLayersCollection(nn.Module):
return hidden_states
class FlaxWav2Vec2FeatureExtractor(nn.Module):
class FlaxWav2Vec2FeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform"""
config: Wav2Vec2Config
......@@ -849,7 +849,7 @@ class FlaxWav2Vec2Module(nn.Module):
dtype: jnp.dtype = jnp.float32
def setup(self):
self.feature_extractor = FlaxWav2Vec2FeatureExtractor(self.config, dtype=self.dtype)
self.feature_extractor = FlaxWav2Vec2FeatureEncoder(self.config, dtype=self.dtype)
self.feature_projection = FlaxWav2Vec2FeatureProjection(self.config, dtype=self.dtype)
self.masked_spec_embed = self.param(
"masked_spec_embed", jax.nn.initializers.uniform(), (self.config.hidden_size,)
......
......@@ -655,7 +655,7 @@ class TFWav2Vec2SamePadLayer(tf.keras.layers.Layer):
return hidden_states
class TFWav2Vec2FeatureExtractor(tf.keras.layers.Layer):
class TFWav2Vec2FeatureEncoder(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None:
super().__init__(**kwargs)
......@@ -682,6 +682,17 @@ class TFWav2Vec2FeatureExtractor(tf.keras.layers.Layer):
return hidden_states
class TFWav2Vec2FeatureExtractor(TFWav2Vec2FeatureEncoder):
    """Deprecated alias of `TFWav2Vec2FeatureEncoder`; instantiating it emits a `FutureWarning`."""

    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        # The class names in the message are resolved at runtime from the instance's
        # class and that class's first base.
        warnings.warn(
            f"The class `{self.__class__.__name__}` has been deprecated "
            "and will be removed in Transformers v5. "
            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
            FutureWarning,
        )
class TFWav2Vec2FeatureProjection(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs):
super().__init__(**kwargs)
......@@ -1107,7 +1118,7 @@ class TFWav2Vec2MainLayer(tf.keras.layers.Layer):
def __init__(self, config: Wav2Vec2Config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.feature_extractor = TFWav2Vec2FeatureExtractor(config, name="feature_extractor")
self.feature_extractor = TFWav2Vec2FeatureEncoder(config, name="feature_extractor")
self.feature_projection = TFWav2Vec2FeatureProjection(config, name="feature_projection")
if config.do_stable_layer_norm:
......@@ -1481,8 +1492,20 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameters will
    not be updated during training.

    Deprecated: emits a `FutureWarning` and delegates to `freeze_feature_encoder`.
    """
    # The trailing space after "v5." keeps the two concatenated sentences apart.
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()
def freeze_feature_encoder(self):
    """
    Freeze the feature encoder by setting `trainable = False` on `self.wav2vec2.feature_extractor`, so that its
    parameters will not be updated during training.
    """
    feature_encoder = self.wav2vec2.feature_extractor
    feature_encoder.trainable = False
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment