"tests/models/gpt2/test_modeling_gpt2.py" did not exist on "62f5ae68ec2ebee34f2a0bbbf195343165e4ce3a"
Unverified Commit 600496fa authored by Patrick von Platen, committed by GitHub

[Wav2Vec2] Rename model's feature extractor to feature encoder (#14959)

* rename classes

* clean up more namings

* remove bogus file

* Apply suggestions from code review

* Apply suggestions from code review

* replace more names

* more regex replace

* make style

* correct

* correct more

* make style

* finish

* correct more in wav2vec2

* make style

* improve freeze_extractor

* add aliases

* add tf aliases
parent 1bfa3477
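The rename targets the internal `nn.Module` that convolves raw waveforms into latent features; the user-facing `Wav2Vec2FeatureExtractor` preprocessing class (a `SequenceFeatureExtractor`) keeps its name. A minimal sketch of the resulting public API change, with an illustrative checkpoint name:

```python
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

model.freeze_feature_extractor()  # old name: still works, now emits a FutureWarning
model.freeze_feature_encoder()    # new, equivalent name introduced by this commit
```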
@@ -431,7 +431,7 @@ class Wav2Vec2SamePadLayer(nn.Module):
         return hidden_states


-class Wav2Vec2FeatureExtractor(nn.Module):
+class Wav2Vec2FeatureEncoder(nn.Module):
     """Construct the features from raw audio waveform"""

     def __init__(self, config):
@@ -484,6 +484,17 @@ class Wav2Vec2FeatureExtractor(nn.Module):
         return hidden_states


+class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder):
+    def __init__(self, config):
+        super().__init__(config)
+        warnings.warn(
+            f"The class `{self.__class__.__name__}` has been deprecated "
+            "and will be removed in Transformers v5. "
+            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
+            FutureWarning,
+        )
+
+
 class Wav2Vec2FeatureProjection(nn.Module):
     def __init__(self, config):
         super().__init__()
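The backward-compatibility alias above subclasses the new encoder and only adds a warning, so existing imports keep working. A sketch of what triggering it looks like (import path as in the transformers source tree):

```python
import warnings

from transformers import Wav2Vec2Config
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2FeatureExtractor

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    module = Wav2Vec2FeatureExtractor(Wav2Vec2Config())  # deprecated alias

# the alias behaves exactly like Wav2Vec2FeatureEncoder, plus one FutureWarning
assert any(issubclass(w.category, FutureWarning) for w in caught)
```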
@@ -1125,7 +1136,7 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel):
         return attention_mask

     def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (Wav2Vec2Encoder, Wav2Vec2EncoderStableLayerNorm, Wav2Vec2FeatureExtractor)):
+        if isinstance(module, (Wav2Vec2Encoder, Wav2Vec2EncoderStableLayerNorm, Wav2Vec2FeatureEncoder)):
             module.gradient_checkpointing = value
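The `isinstance` tuple has to name the new class; otherwise `gradient_checkpointing_enable()`, which applies `_set_gradient_checkpointing` over all submodules, would silently skip the convolutional feature encoder. Usage is unchanged (illustrative checkpoint name):

```python
from transformers import Wav2Vec2Model

model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
model.gradient_checkpointing_enable()  # now reaches Wav2Vec2FeatureEncoder again
```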
@@ -1194,7 +1205,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
     def __init__(self, config: Wav2Vec2Config):
         super().__init__(config)
         self.config = config
-        self.feature_extractor = Wav2Vec2FeatureExtractor(config)
+        self.feature_extractor = Wav2Vec2FeatureEncoder(config)
         self.feature_projection = Wav2Vec2FeatureProjection(config)

         # model only needs masking vector if mask prob is > 0.0
@@ -1213,8 +1224,20 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
     def freeze_feature_extractor(self):
         """
-        Calling this function will disable the gradient computation for the feature extractor so that its parameters
-        will not be updated during training.
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
         """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
+        """
         self.feature_extractor._freeze_parameters()
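Both methods funnel into `_freeze_parameters`, which sets `requires_grad=False` on every parameter of the conv stack; note that the *attribute* is still called `feature_extractor`, only its class was renamed. A quick sanity check, with an illustrative checkpoint name:

```python
from transformers import Wav2Vec2Model

model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
model.freeze_feature_encoder()

# the attribute keeps its old name; its parameters no longer require gradients
assert not any(p.requires_grad for p in model.feature_extractor.parameters())
```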
@@ -1349,8 +1372,20 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
     def freeze_feature_extractor(self):
         """
-        Calling this function will disable the gradient computation for the feature extractor so that its parameters
-        will not be updated during training.
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
         """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
+        """
         self.wav2vec2.feature_extractor._freeze_parameters()
@@ -1637,8 +1672,20 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
     def freeze_feature_extractor(self):
         """
-        Calling this function will disable the gradient computation for the feature extractor so that its parameter
-        will not be updated during training.
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
         """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
+        """
         self.wav2vec2.feature_extractor._freeze_parameters()
@@ -1745,8 +1792,20 @@ class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel):
     def freeze_feature_extractor(self):
         """
-        Calling this function will disable the gradient computation for the feature extractor so that its parameters
-        will not be updated during training.
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
         """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
+        """
         self.wav2vec2.feature_extractor._freeze_parameters()
@@ -1848,8 +1907,20 @@ class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel):
     def freeze_feature_extractor(self):
         """
-        Calling this function will disable the gradient computation for the feature extractor so that its parameters
-        will not be updated during training.
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
         """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
+        """
         self.wav2vec2.feature_extractor._freeze_parameters()
@@ -1994,8 +2065,20 @@ class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel):
     def freeze_feature_extractor(self):
         """
-        Calling this function will disable the gradient computation for the feature extractor so that its parameters
-        will not be updated during training.
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
         """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
+        """
         self.wav2vec2.feature_extractor._freeze_parameters()
@@ -64,24 +64,24 @@ class WavLMConfig(PretrainedConfig):
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
         feat_extract_norm (`str`, *optional*, defaults to `"group"`):
-            The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
+            The norm to be applied to 1D convolutional layers in the feature encoder. One of `"group"` for group
             normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
             convolutional layers.
         feat_proj_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probability for output of the feature extractor.
+            The dropout probability for the output of the feature encoder.
         feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the 1D convolutional layers of the feature
             extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
         feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probabilitiy for quantized feature extractor states.
+            The dropout probability for quantized feature encoder states.
         conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
-            feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
+            feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
         conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
-            A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
+            A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
             of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
         conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
-            A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
             length of *conv_kernel* defines the number of convolutional layers and has to match the length of
             *conv_dim*.
         conv_bias (`bool`, *optional*, defaults to `False`):
@@ -96,7 +96,7 @@ class WavLMConfig(PretrainedConfig):
             True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
             False` corresponds to applying layer norm after the attention layer.
         apply_spec_augment (`bool`, *optional*, defaults to `True`):
-            Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
+            Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
             [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
             Recognition](https://arxiv.org/abs/1904.08779).
         mask_time_prob (`float`, *optional*, defaults to 0.05):
@@ -122,7 +122,7 @@ class WavLMConfig(PretrainedConfig):
         contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
             The temperature *kappa* in the contrastive loss.
         feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probabilitiy for the output of the feature extractor that's used by the quantizer.
+            The dropout probability for the output of the feature encoder that's used by the quantizer.
         num_negatives (`int`, *optional*, defaults to 100):
             Number of negative samples for the contrastive loss.
         codevector_dim (`int`, *optional*, defaults to 256):
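The three conv tuples together determine how aggressively the feature encoder downsamples raw audio: each layer maps a sequence length L to floor((L - kernel) / stride) + 1. A standalone sketch of that arithmetic, using the default values quoted above (it mirrors the model's internal `_get_feat_extract_output_lengths` helper):

```python
def feat_extract_output_length(
    input_length: int,
    conv_kernel=(10, 3, 3, 3, 3, 3, 3),
    conv_stride=(5, 2, 2, 2, 2, 2, 2),
) -> int:
    """Number of frames the conv feature encoder emits for a raw waveform."""
    for kernel, stride in zip(conv_kernel, conv_stride):
        input_length = (input_length - kernel) // stride + 1
    return input_length

print(feat_extract_output_length(16000))  # one second at 16 kHz -> 49 frames
```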
@@ -15,6 +15,7 @@
 """ PyTorch WavLM model."""

 import math
+import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
@@ -352,8 +353,8 @@ class WavLMSamePadLayer(nn.Module):
         return hidden_states


-# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->WavLM
-class WavLMFeatureExtractor(nn.Module):
+# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->WavLM
+class WavLMFeatureEncoder(nn.Module):
     """Construct the features from raw audio waveform"""

     def __init__(self, config):
@@ -404,6 +405,17 @@ class WavLMFeatureExtractor(nn.Module):
         return hidden_states


+class WavLMFeatureExtractor(WavLMFeatureEncoder):
+    def __init__(self, config):
+        super().__init__(config)
+        warnings.warn(
+            f"The class `{self.__class__.__name__}` has been deprecated "
+            "and will be removed in Transformers v5. "
+            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
+            FutureWarning,
+        )
+
+
 # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->WavLM
 class WavLMFeatureProjection(nn.Module):
     def __init__(self, config):
@@ -1077,7 +1089,7 @@ class WavLMPreTrainedModel(PreTrainedModel):
         return attention_mask

     def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (WavLMEncoder, WavLMEncoderStableLayerNorm, WavLMFeatureExtractor)):
+        if isinstance(module, (WavLMEncoder, WavLMEncoderStableLayerNorm, WavLMFeatureEncoder)):
             module.gradient_checkpointing = value
@@ -1146,7 +1158,7 @@ class WavLMModel(WavLMPreTrainedModel):
     def __init__(self, config: WavLMConfig):
         super().__init__(config)
         self.config = config
-        self.feature_extractor = WavLMFeatureExtractor(config)
+        self.feature_extractor = WavLMFeatureEncoder(config)
         self.feature_projection = WavLMFeatureProjection(config)

         # model only needs masking vector if mask prob is > 0.0
@@ -1165,8 +1177,20 @@ class WavLMModel(WavLMPreTrainedModel):
     def freeze_feature_extractor(self):
         """
-        Calling this function will disable the gradient computation for the feature extractor so that its parameters
-        will not be updated during training.
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
         """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
+        """
         self.feature_extractor._freeze_parameters()
@@ -1303,8 +1327,20 @@ class WavLMForCTC(WavLMPreTrainedModel):
     def freeze_feature_extractor(self):
         """
-        Calling this function will disable the gradient computation for the feature extractor so that its parameter
-        will not be updated during training.
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
         """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
+        """
         self.wavlm.feature_extractor._freeze_parameters()
@@ -1412,8 +1448,20 @@ class WavLMForSequenceClassification(WavLMPreTrainedModel):
     def freeze_feature_extractor(self):
        """
-        Calling this function will disable the gradient computation for the feature extractor so that its parameters
-        will not be updated during training.
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
         """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
+        """
         self.wavlm.feature_extractor._freeze_parameters()
@@ -1516,8 +1564,20 @@ class WavLMForAudioFrameClassification(WavLMPreTrainedModel):
     def freeze_feature_extractor(self):
         """
-        Calling this function will disable the gradient computation for the feature extractor so that its parameters
-        will not be updated during training.
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
         """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
+        """
         self.wavlm.feature_extractor._freeze_parameters()
@@ -1665,8 +1725,20 @@ class WavLMForXVector(WavLMPreTrainedModel):
     def freeze_feature_extractor(self):
         """
-        Calling this function will disable the gradient computation for the feature extractor so that its parameters
-        will not be updated during training.
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
         """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters
+        will not be updated during training.
+        """
         self.wavlm.feature_extractor._freeze_parameters()
@@ -225,7 +225,7 @@ class HubertModelTester:
         model.train()

         # freeze feature encoder
-        model.freeze_feature_extractor()
+        model.freeze_feature_encoder()

         input_values = input_values[:3]
@@ -203,7 +203,7 @@ class SEWModelTester:
         model.train()

         # freeze feature encoder
-        model.freeze_feature_extractor()
+        model.freeze_feature_encoder()

         input_values = input_values[:3]
@@ -224,7 +224,7 @@ class SEWDModelTester:
         model.train()

         # freeze feature encoder
-        model.freeze_feature_extractor()
+        model.freeze_feature_encoder()

         input_values = input_values[:3]
@@ -184,7 +184,7 @@ class TFHubertModelTester:
         model = TFHubertForCTC(config)

         # freeze feature encoder
-        model.freeze_feature_extractor()
+        model.freeze_feature_encoder()

         input_values = input_values[:3]
@@ -194,7 +194,7 @@ class TFWav2Vec2ModelTester:
         model = TFWav2Vec2ForCTC(config)

         # freeze feature encoder
-        model.freeze_feature_extractor()
+        model.freeze_feature_encoder()

         input_values = input_values[:3]
@@ -226,7 +226,7 @@ class UniSpeechModelTester:
         model.train()

         # freeze feature encoder
-        model.freeze_feature_extractor()
+        model.freeze_feature_encoder()

         input_values = input_values[:3]
@@ -246,7 +246,7 @@ class UniSpeechSatModelTester:
         model.train()

         # freeze feature encoder
-        model.freeze_feature_extractor()
+        model.freeze_feature_encoder()

         input_values = input_values[:3]
@@ -300,7 +300,7 @@ class Wav2Vec2ModelTester:
         model.train()

         # freeze feature encoder
-        model.freeze_feature_extractor()
+        model.freeze_feature_encoder()

         input_values = input_values[:3]
@@ -238,7 +238,7 @@ class WavLMModelTester:
         model.train()

         # freeze feature encoder
-        model.freeze_feature_extractor()
+        model.freeze_feature_encoder()

         input_values = input_values[:3]
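All the testers follow the same recipe: call the (renamed) freeze method, run one training step, and expect the frozen encoder to accumulate no gradients. A self-contained sketch of that check; the tiny config values are illustrative, not the testers' exact settings:

```python
import torch
from transformers import Wav2Vec2Config, Wav2Vec2ForCTC

config = Wav2Vec2Config(
    hidden_size=32, num_hidden_layers=2, num_attention_heads=2,
    intermediate_size=64, vocab_size=32,
)
model = Wav2Vec2ForCTC(config)
model.train()
model.freeze_feature_encoder()

input_values = torch.randn(2, 16000)
labels = torch.randint(1, config.vocab_size, (2, 10))  # avoid the blank index 0
model(input_values, labels=labels).loss.backward()

# the frozen feature encoder received no gradients
assert all(p.grad is None for p in model.wav2vec2.feature_extractor.parameters())
```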