Add feature_grad_mult argument to HuBERTPretrainModel (#2335)

Summary: In Wav2Vec2 and HuBERT model training, the convolutional feature extraction layers use `group_norm` for normalization in `Base` model, while they use `layer_norm` in `Large` and `XLarge` models. For `Base` model, the gradients of feature extraction layers will be unstable in pre-training, thus we need to scale down the gradient by multiplying 0.1. In this PR, we add such argument to `HuBERTPretrainModel` to control the gradient of feature extractor layers. We also put the argument in the factory functions (`hubert_pretrain_base`, `hubert_pretrain_large`, and `hubert_pretrain_xlarge`. The reason is in finetuning, the feature extractor's parameters are fixed, we can multiply the gradient with 0.0 to avoid back propagating gradients. Pull Request resolved: https://github.com/pytorch/audio/pull/2335 Reviewed By: xiaohui-zhang, mthrok Differential Revision: D35646928 Pulled By: nateanl fbshipit-source-id: 6a9563e227aac6e3127b634357946d860f26c994

Add feature_grad_mult argument to HuBERTPretrainModel (#2335)
Summary: In Wav2Vec2 and HuBERT model training, the convolutional feature extraction layers use `group_norm` for normalization in `Base` model, while they use `layer_norm` in `Large` and `XLarge` models. For `Base` model, the gradients of feature extraction layers will be unstable in pre-training, thus we need to scale down the gradient by multiplying 0.1. In this PR, we add such argument to `HuBERTPretrainModel` to control the gradient of feature extractor layers. We also put the argument in the factory functions (`hubert_pretrain_base`, `hubert_pretrain_large`, and `hubert_pretrain_xlarge`. The reason is in finetuning, the feature extractor's parameters are fixed, we can multiply the gradient with 0.0 to avoid back propagating gradients. Pull Request resolved: https://github.com/pytorch/audio/pull/2335 Reviewed By: xiaohui-zhang, mthrok Differential Revision: D35646928 Pulled By: nateanl fbshipit-source-id: 6a9563e227aac6e3127b634357946d860f26c994
647f28e4 · Zhaoheng Ni · Facebook GitHub Bot · c6a376cc · 647f28e4 · 647f28e4
Commit 647f28e4 authored May 18, 2022 by Zhaoheng Ni Committed by Facebook GitHub Bot May 18, 2022
Show whitespace changes
Inline Side-by-side

Showing with 52 additions and 4 deletions

torchaudio/models/wav2vec2/components.py torchaudio/models/wav2vec2/components.py +12 -0

torchaudio/models/wav2vec2/model.py torchaudio/models/wav2vec2/model.py +40 -4

No files found.
--- a/torchaudio/models/wav2vec2/components.py
+++ b/torchaudio/models/wav2vec2/components.py
@@ -1037,3 +1037,15 @@ class LogitGenerator(Module):
            label_u = label[mask_u]
            logit_u = _compute_logits(proj_x_u, label_u, self.label_embeddings)
        return logit_m, logit_u
+
+
+class GradMultiply(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, scale):
+        ctx.scale = scale
+        res = x.new(x)
+        return res
+
+    @staticmethod
+    def backward(ctx, grad):
+        return grad * ctx.scale, None
--- a/torchaudio/models/wav2vec2/model.py
+++ b/torchaudio/models/wav2vec2/model.py
@@ -137,6 +137,11 @@ class HuBERTPretrainModel(Module):

        logit_generator (torch.nn.Module):
            Logit generator that predicts the logits of the masked and unmasked inputs.
+
+        feature_grad_mult (float or None):
+            The factor to scale the convolutional feature extraction layer gradients by.
+            If ``None``, the gradients of feature extraction layers are not affected.
+            The scale factor will not affect the forward pass.
    """

    def __init__(
@@ -144,11 +149,16 @@ class HuBERTPretrainModel(Module):
        wav2vec2: Wav2Vec2Model,
        mask_generator: Module,
        logit_generator: Module,
+        feature_grad_mult: Optional[float],
    ):
        super().__init__()
        self.wav2vec2 = wav2vec2
        self.mask_generator = mask_generator
        self.logit_generator = logit_generator
+        assert (
+            feature_grad_mult is None or 0.0 < feature_grad_mult < 1.0
+        ), f"The value of `feature_grad_mult` must be ``None`` or between (0, 1). Found {feature_grad_mult}"
+        self.feature_grad_mult = feature_grad_mult

    def forward(
        self,
@@ -184,6 +194,8 @@ class HuBERTPretrainModel(Module):
                Shape: `(1,)`.
        """
        x, lengths = self.wav2vec2.feature_extractor(waveforms, audio_lengths)
+        if self.feature_grad_mult is not None and self.feature_grad_mult < 1.0:
+            x = components.GradMultiply.apply(x, self.feature_grad_mult)
        features_pen = x.float().pow(2).mean()
        if lengths is not None:
            padding_mask = components._get_padding_mask(x, lengths)
@@ -712,6 +724,7 @@ def hubert_pretrain_model(
    skip_nomask: bool,
    num_classes: int,
    final_dim: int,
+    feature_grad_mult: Optional[float],
 ) -> HuBERTPretrainModel:
    # Overriding the signature so that the return type is correct on Sphinx
    """hubert_pretrain_model(extractor_mode: str, extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]], extractor_conv_bias: bool, encoder_embed_dim: int, encoder_projection_dropout: float, encoder_pos_conv_kernel: int, encoder_pos_conv_groups: int, encoder_num_layers: int, encoder_num_heads: int, encoder_attention_dropout: float, encoder_ff_interm_features: int, encoder_ff_interm_dropout: float, encoder_dropout: float, encoder_layer_norm_first: bool, encoder_layer_drop: float, mask_prob: float, mask_selection: str, mask_other: float, mask_length: int, no_mask_overlap: bool, mask_min_space: int, mask_channel_prob: float, mask_channel_selection: str, mask_channel_other: float, mask_channel_length: int, no_mask_channel_overlap: bool, mask_channel_min_space: int, skip_masked: bool, skip_nomask: bool, num_classes: int, final_dim: int) -> torchaudio.models.HuBERTPretrainModel
@@ -910,6 +923,12 @@ def hubert_pretrain_model(

            This option corresponds to ``final_dim`` from ``fairseq``.

+        feature_grad_mult (float or None):
+            The factor to scale the convolutional feature extraction layer gradients by.
+            The scale factor will not affect the forward pass.
+
+            This option corresponds to ``feature_grad_mult`` from ``fairseq``.
+
    Returns:
        HuBERTPretrainModel:
            The resulting model.
@@ -958,7 +977,12 @@ def hubert_pretrain_model(
        skip_masked,
        skip_nomask,
    )
-    return HuBERTPretrainModel(wav2vec2=wav2vec2, mask_generator=mask_generator, logit_generator=logit_generator)
+    return HuBERTPretrainModel(
+        wav2vec2=wav2vec2,
+        mask_generator=mask_generator,
+        logit_generator=logit_generator,
+        feature_grad_mult=feature_grad_mult,
+    )


 def hubert_pretrain_base(
@@ -970,10 +994,11 @@ def hubert_pretrain_base(
    mask_prob: float = 0.8,
    mask_channel_prob: float = 0.0,
    mask_channel_length: int = 10,
+    feature_grad_mult: Optional[float] = 0.1,
    num_classes: int = 100,
 ) -> HuBERTPretrainModel:
    # Overriding the signature so that the return type is correct on Sphinx
-    """hubert_pretrain_base(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.05, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10, num_classes: int = 100) -> torchaudio.models.HuBERTPretrainModel
+    """hubert_pretrain_base(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.05, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10, feature_grad_mult: Optional[float] = 0.1, num_classes: int = 100) -> torchaudio.models.HuBERTPretrainModel

    Build HuBERTPretrainModel model with "base" architecture from *HuBERT* [:footcite:`hsu2021hubert`]

@@ -994,6 +1019,8 @@ def hubert_pretrain_base(
            See :py:func:`hubert_pretrain_model`.
        mask_channel_length (int):
            See :py:func:`hubert_pretrain_model`.
+        feature_grad_mult (float or None):
+            See :py:func:`hubert_pretrain_model`.
        num_classes (int, optional):
            See :py:func:`hubert_pretrain_model`.

@@ -1033,6 +1060,7 @@ def hubert_pretrain_base(
        skip_nomask=False,
        num_classes=num_classes,
        final_dim=256,
+        feature_grad_mult=feature_grad_mult,
    )


@@ -1045,9 +1073,10 @@ def hubert_pretrain_large(
    mask_prob: float = 0.8,
    mask_channel_prob: float = 0.0,
    mask_channel_length: int = 10,
+    feature_grad_mult: Optional[float] = None,
 ) -> HuBERTPretrainModel:
    # Overriding the signature so that the return type is correct on Sphinx
-    """hubert_pretrain_large(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10) -> torchaudio.models.HuBERTPretrainModel
+    """hubert_pretrain_large(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10, feature_grad_mult: Optional[float] = None) -> torchaudio.models.HuBERTPretrainModel

    Build HuBERTPretrainModel model for pre-training with "large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]

@@ -1068,6 +1097,8 @@ def hubert_pretrain_large(
            See :py:func:`hubert_pretrain_model`.
        mask_channel_length (int):
            See :py:func:`hubert_pretrain_model`.
+        feature_grad_mult (float or None):
+            See :py:func:`hubert_pretrain_model`.

    Returns:
        HuBERTPretrainModel:
@@ -1105,6 +1136,7 @@ def hubert_pretrain_large(
        skip_nomask=False,
        num_classes=500,
        final_dim=768,
+        feature_grad_mult=feature_grad_mult,
    )


@@ -1117,9 +1149,10 @@ def hubert_pretrain_xlarge(
    mask_prob: float = 0.8,
    mask_channel_prob: float = 0.0,
    mask_channel_length: int = 10,
+    feature_grad_mult: Optional[float] = None,
 ) -> HuBERTPretrainModel:
    # Overriding the signature so that the return type is correct on Sphinx
-    """hubert_pretrain_xlarge(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10) -> torchaudio.models.HuBERTPretrainModel
+    """hubert_pretrain_xlarge(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10, feature_grad_mult: Optional[float] = None) -> torchaudio.models.HuBERTPretrainModel

    Build HuBERTPretrainModel model for pre-training with "extra large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]

@@ -1140,6 +1173,8 @@ def hubert_pretrain_xlarge(
            See :py:func:`hubert_pretrain_model`.
        mask_channel_length (int):
            See :py:func:`hubert_pretrain_model`.
+        feature_grad_mult (float or None):
+            See :py:func:`hubert_pretrain_model`.

    Returns:
        HuBERTPretrainModel:
@@ -1177,4 +1212,5 @@ def hubert_pretrain_xlarge(
        skip_nomask=False,
        num_classes=500,
        final_dim=1024,
+        feature_grad_mult=feature_grad_mult,
    )