Add extra arguments to hubert pretrain factory functions (#2345)

Summary: The values of `mask_prob`, `mask_channel_prob`, and `mask_channel_length` differ between pre-training and fine-tuning settings; compare, for example, the [pre-training](https://github.com/pytorch/fairseq/blob/main/examples/hubert/config/pretrain/hubert_base_librispeech.yaml#L70) and [fine-tuning](https://github.com/pytorch/fairseq/blob/main/examples/hubert/config/finetune/base_10h.yaml#L69-L73) configurations. The motivation for using different values is to avoid overfitting when fine-tuning on a small dataset (example: [fine-tune on 10 minutes of audio](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/config/finetuning/vox_10m.yaml#L57-L59)). This PR adds these three arguments to the HuBERT pre-train factory functions so that they can be tuned for both pre-training and fine-tuning. `mask_length` is set to `10` by default for all cases, hence it is not exposed in the factory functions.
Pull Request resolved: https://github.com/pytorch/audio/pull/2345
Reviewed By: carolineechen, xiaohui-zhang
Differential Revision: D35845117
Pulled By: nateanl
fbshipit-source-id: 0cbb74d09535d189b8258aa8ee0f88779bdb77e7

Add extra arguments to hubert pretrain factory functions (#2345)
Summary: The values of `mask_prob`, `mask_channel_prob`, and `mask_channel_length` differ between pre-training and fine-tuning settings; compare, for example, the [pre-training](https://github.com/pytorch/fairseq/blob/main/examples/hubert/config/pretrain/hubert_base_librispeech.yaml#L70) and [fine-tuning](https://github.com/pytorch/fairseq/blob/main/examples/hubert/config/finetune/base_10h.yaml#L69-L73) configurations. The motivation for using different values is to avoid overfitting when fine-tuning on a small dataset (example: [fine-tune on 10 minutes of audio](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/config/finetuning/vox_10m.yaml#L57-L59)). This PR adds these three arguments to the HuBERT pre-train factory functions so that they can be tuned for both pre-training and fine-tuning. `mask_length` is set to `10` by default for all cases, hence it is not exposed in the factory functions.
Pull Request resolved: https://github.com/pytorch/audio/pull/2345
Reviewed By: carolineechen, xiaohui-zhang
Differential Revision: D35845117
Pulled By: nateanl
fbshipit-source-id: 0cbb74d09535d189b8258aa8ee0f88779bdb77e7
7c249d17 · Zhaoheng Ni · Facebook GitHub Bot · 0986eebf · 7c249d17
Commit 7c249d17 authored Apr 26, 2022 by Zhaoheng Ni Committed by Facebook GitHub Bot Apr 26, 2022
Hide whitespace changes
Inline Side-by-side

Showing with 39 additions and 12 deletions

torchaudio/models/wav2vec2/model.py torchaudio/models/wav2vec2/model.py +39 -12

No files found.
--- a/torchaudio/models/wav2vec2/model.py
+++ b/torchaudio/models/wav2vec2/model.py
@@ -967,10 +967,13 @@ def hubert_pretrain_base(
    encoder_ff_interm_dropout: float = 0.0,
    encoder_dropout: float = 0.1,
    encoder_layer_drop: float = 0.05,
+    mask_prob: float = 0.8,
+    mask_channel_prob: float = 0.0,
+    mask_channel_length: int = 10,
    num_classes: int = 100,
 ) -> HuBERTPretrainModel:
    # Overriding the signature so that the return type is correct on Sphinx
-    """hubert_pretrain_base(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.05, num_classes: int = 100) -> torchaudio.models.HuBERTPretrainModel
+    """hubert_pretrain_base(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.05, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10, num_classes: int = 100) -> torchaudio.models.HuBERTPretrainModel
    Build HuBERTPretrainModel model with "base" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
@@ -985,6 +988,12 @@ def hubert_pretrain_base(
            See :py:func:`hubert_pretrain_model`.
        encoder_layer_drop (float):
            See :py:func:`hubert_pretrain_model`.
+        mask_prob (float):
+            See :py:func:`hubert_pretrain_model`.
+        mask_channel_prob (float):
+            See :py:func:`hubert_pretrain_model`.
+        mask_channel_length (int):
+            See :py:func:`hubert_pretrain_model`.
        num_classes (int, optional):
            See :py:func:`hubert_pretrain_model`.
@@ -1008,16 +1017,16 @@ def hubert_pretrain_base(
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=False,
        encoder_layer_drop=encoder_layer_drop,
-        mask_prob=0.80,
+        mask_prob=mask_prob,
        mask_selection="static",
        mask_other=0.0,
        mask_length=10,
        no_mask_overlap=False,
        mask_min_space=1,
-        mask_channel_prob=0.0,
+        mask_channel_prob=mask_channel_prob,
        mask_channel_selection="static",
        mask_channel_other=0.0,
-        mask_channel_length=10,
+        mask_channel_length=mask_channel_length,
        no_mask_channel_overlap=False,
        mask_channel_min_space=1,
        skip_masked=False,
@@ -1033,9 +1042,12 @@ def hubert_pretrain_large(
    encoder_ff_interm_dropout: float = 0.0,
    encoder_dropout: float = 0.0,
    encoder_layer_drop: float = 0.0,
+    mask_prob: float = 0.8,
+    mask_channel_prob: float = 0.0,
+    mask_channel_length: int = 10,
 ) -> HuBERTPretrainModel:
    # Overriding the signature so that the return type is correct on Sphinx
-    """hubert_pretrain_large(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0) -> torchaudio.models.HuBERTPretrainModel
+    """hubert_pretrain_large(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10) -> torchaudio.models.HuBERTPretrainModel
    Build HuBERTPretrainModel model for pre-training with "large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
@@ -1050,6 +1062,12 @@ def hubert_pretrain_large(
            See :py:func:`hubert_pretrain_model`.
        encoder_layer_drop (float):
            See :py:func:`hubert_pretrain_model`.
+        mask_prob (float):
+            See :py:func:`hubert_pretrain_model`.
+        mask_channel_prob (float):
+            See :py:func:`hubert_pretrain_model`.
+        mask_channel_length (int):
+            See :py:func:`hubert_pretrain_model`.
    Returns:
        HuBERTPretrainModel:
@@ -1071,16 +1089,16 @@ def hubert_pretrain_large(
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=True,
        encoder_layer_drop=encoder_layer_drop,
-        mask_prob=0.80,
+        mask_prob=mask_prob,
        mask_selection="static",
        mask_other=0.0,
        mask_length=10,
        no_mask_overlap=False,
        mask_min_space=1,
-        mask_channel_prob=0.0,
+        mask_channel_prob=mask_channel_prob,
        mask_channel_selection="static",
        mask_channel_other=0.0,
-        mask_channel_length=10,
+        mask_channel_length=mask_channel_length,
        no_mask_channel_overlap=False,
        mask_channel_min_space=1,
        skip_masked=False,
@@ -1096,9 +1114,12 @@ def hubert_pretrain_xlarge(
    encoder_ff_interm_dropout: float = 0.0,
    encoder_dropout: float = 0.0,
    encoder_layer_drop: float = 0.0,
+    mask_prob: float = 0.8,
+    mask_channel_prob: float = 0.0,
+    mask_channel_length: int = 10,
 ) -> HuBERTPretrainModel:
    # Overriding the signature so that the return type is correct on Sphinx
-    """hubert_pretrain_xlarge(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0) -> torchaudio.models.HuBERTPretrainModel
+    """hubert_pretrain_xlarge(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10) -> torchaudio.models.HuBERTPretrainModel
    Build HuBERTPretrainModel model for pre-training with "extra large" architecture from *HuBERT* [:footcite:`hsu2021hubert`]
@@ -1113,6 +1134,12 @@ def hubert_pretrain_xlarge(
            See :py:func:`hubert_pretrain_model`.
        encoder_layer_drop (float):
            See :py:func:`hubert_pretrain_model`.
+        mask_prob (float):
+            See :py:func:`hubert_pretrain_model`.
+        mask_channel_prob (float):
+            See :py:func:`hubert_pretrain_model`.
+        mask_channel_length (int):
+            See :py:func:`hubert_pretrain_model`.
    Returns:
        HuBERTPretrainModel:
@@ -1134,16 +1161,16 @@ def hubert_pretrain_xlarge(
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=True,
        encoder_layer_drop=encoder_layer_drop,
-        mask_prob=0.80,
+        mask_prob=mask_prob,
        mask_selection="static",
        mask_other=0.0,
        mask_length=10,
        no_mask_overlap=False,
        mask_min_space=1,
-        mask_channel_prob=0.0,
+        mask_channel_prob=mask_channel_prob,
        mask_channel_selection="static",
        mask_channel_other=0.0,
-        mask_channel_length=10,
+        mask_channel_length=mask_channel_length,
        no_mask_channel_overlap=False,
        mask_channel_min_space=1,
        skip_masked=False,