Commit c9e4c75d (unverified)
Authored Oct 05, 2021 by moto; committed by GitHub on Oct 05, 2021
Add the rest of HuBERT pretrained models (#1824)
This commit adds:
- HUBERT_LARGE
- HUBERT_XLARGE
- HUBERT_ASR_XLARGE
Parent: 181f0c80
Showing 4 changed files with 152 additions and 3 deletions
docs/source/models.rst                          +9    -0
test/integration_tests/wav2vec2_model_test.py   +6    -0
torchaudio/models/__init__.py                   +6    -0
torchaudio/models/wav2vec2/pretrained.py        +131  -3
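Before the per-file diffs, a minimal usage sketch of one of the new bundles. This assumes the prototype Wav2Vec2PretrainedModelBundle exposes get_model() the way the later torchaudio.pipelines bundles do, and the audio path is hypothetical:

import torch
import torchaudio
from torchaudio.models import HUBERT_ASR_XLARGE

# Assumed API: get_model() downloads the checkpoint and returns a Wav2Vec2Model.
model = HUBERT_ASR_XLARGE.get_model().eval()
waveform, sample_rate = torchaudio.load('speech.wav')  # hypothetical 16 kHz mono file
with torch.inference_mode():
    emissions, _ = model(waveform)  # per-frame logits over the 32 character labels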
docs/source/models.rst
@@ -142,9 +142,18 @@ Pre-trained Models
 .. autodata:: HUBERT_BASE
    :no-value:

+.. autodata:: HUBERT_LARGE
+   :no-value:
+
+.. autodata:: HUBERT_XLARGE
+   :no-value:
+
 .. autodata:: HUBERT_ASR_LARGE
    :no-value:

+.. autodata:: HUBERT_ASR_XLARGE
+   :no-value:
+
 Utility Functions
 -----------------
test/integration_tests/wav2vec2_model_test.py
 import torchaudio
 from torchaudio.models import (
     HUBERT_BASE,
+    HUBERT_LARGE,
+    HUBERT_XLARGE,
     HUBERT_ASR_LARGE,
+    HUBERT_ASR_XLARGE,
 )
 import pytest
@@ -10,6 +13,8 @@ import pytest
     "bundle",
     [
         HUBERT_BASE,
+        HUBERT_LARGE,
+        HUBERT_XLARGE,
     ]
 )
 def test_pretraining_models(bundle):
@@ -21,6 +26,7 @@ def test_pretraining_models(bundle):
     "bundle,expected",
     [
         (HUBERT_ASR_LARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (HUBERT_ASR_XLARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|')
     ]
 )
 def test_finetune_asr_model(
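The test bodies are collapsed in this view. As a sketch only (the function name, shapes, and assertions below are illustrative, not the repository's actual test code), a pretraining-bundle test of this shape typically builds the model from the bundle and verifies a forward pass:

import torch
import pytest
from torchaudio.models import HUBERT_BASE, HUBERT_LARGE, HUBERT_XLARGE

@pytest.mark.parametrize("bundle", [HUBERT_BASE, HUBERT_LARGE, HUBERT_XLARGE])
def test_pretraining_models_sketch(bundle):
    # Illustrative: load the pretrained weights and run one second of
    # random 16 kHz audio through the feature extractor.
    model = bundle.get_model().eval()
    waveform = torch.randn(1, 16000)
    with torch.inference_mode():
        features, _ = model.extract_features(waveform)
    assert len(features) > 0  # one tensor per transformer layer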
torchaudio/models/__init__.py
@@ -20,7 +20,10 @@ from .wav2vec2 import (
 from .wav2vec2.pretrained import (
     Wav2Vec2PretrainedModelBundle,
     HUBERT_BASE,
+    HUBERT_LARGE,
+    HUBERT_XLARGE,
     HUBERT_ASR_LARGE,
+    HUBERT_ASR_XLARGE,
 )

 __all__ = [
@@ -43,7 +46,10 @@ __all__ = [
     'hubert_ft_xlarge',
     'Wav2Vec2PretrainedModelBundle',
     'HUBERT_BASE',
+    'HUBERT_LARGE',
+    'HUBERT_XLARGE',
     'HUBERT_ASR_LARGE',
+    'HUBERT_ASR_XLARGE',
     'Tacotron2',
     'tacotron2',
 ]
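With these re-exports in place, the new bundles are importable straight from the top-level namespace:

from torchaudio.models import HUBERT_LARGE, HUBERT_XLARGE, HUBERT_ASR_XLARGE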
torchaudio/models/wav2vec2/pretrained.py
@@ -122,6 +122,7 @@ def _get_labels():
     'Z',
 )

+
 HUBERT_BASE = Wav2Vec2PretrainedModelBundle(
     'hubert_fairseq_base_ls960.pth',
     {
@@ -154,7 +155,89 @@ HUBERT_BASE = Wav2Vec2PretrainedModelBundle(
 )
 HUBERT_BASE.__doc__ = """HuBERT model with "Base" configuration.

-Trained on 960 hours of *LibriSpeech* [:footcite:`7178964`] dataset. Not fine-tuned.
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
+Not fine-tuned.

 Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].

 [`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
 """
+HUBERT_LARGE = Wav2Vec2PretrainedModelBundle(
+    'hubert_fairseq_large_ll60k.pth',
+    {
+        'extractor_mode': 'layer_norm',
+        'extractor_conv_layer_config': [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        'extractor_conv_bias': False,
+        'encoder_embed_dim': 1024,
+        'encoder_projection_dropout': 0.0,
+        'encoder_pos_conv_kernel': 128,
+        'encoder_pos_conv_groups': 16,
+        'encoder_num_layers': 24,
+        'encoder_num_heads': 16,
+        'encoder_attention_dropout': 0.0,
+        'encoder_ff_interm_features': 4096,
+        'encoder_ff_interm_dropout': 0.0,
+        'encoder_dropout': 0.0,
+        'encoder_layer_norm_first': True,
+        'encoder_layer_drop': 0.0,
+        'aux_num_out': None,
+    },
+    _labels=None,
+)
+HUBERT_LARGE.__doc__ = """HuBERT model with "Large" configuration.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`].
+Not fine-tuned.
+
+Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
+
+[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
+"""
+HUBERT_XLARGE = Wav2Vec2PretrainedModelBundle(
+    'hubert_fairseq_xlarge_ll60k.pth',
+    {
+        'extractor_mode': 'layer_norm',
+        'extractor_conv_layer_config': [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        'extractor_conv_bias': False,
+        'encoder_embed_dim': 1280,
+        'encoder_projection_dropout': 0.0,
+        'encoder_pos_conv_kernel': 128,
+        'encoder_pos_conv_groups': 16,
+        'encoder_num_layers': 48,
+        'encoder_num_heads': 16,
+        'encoder_attention_dropout': 0.0,
+        'encoder_ff_interm_features': 5120,
+        'encoder_ff_interm_dropout': 0.0,
+        'encoder_dropout': 0.0,
+        'encoder_layer_norm_first': True,
+        'encoder_layer_drop': 0.0,
+        'aux_num_out': None,
+    },
+    _labels=None,
+)
+HUBERT_XLARGE.__doc__ = """HuBERT model with "Extra Large" configuration.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`].
+Not fine-tuned.
+
+Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
+
+[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
+"""
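HUBERT_LARGE and HUBERT_XLARGE differ only in encoder size (1024-dim/24-layer vs. 1280-dim/48-layer, per the configs above); neither carries labels, so they are feature extractors rather than ASR models. A hedged feature-extraction sketch (get_model() is an assumed helper, and the random waveform stands in for real 16 kHz audio):

import torch
from torchaudio.models import HUBERT_XLARGE

model = HUBERT_XLARGE.get_model().eval()  # assumed helper; loads 'hubert_fairseq_xlarge_ll60k.pth'
waveform = torch.randn(1, 16000)          # stand-in for real 16 kHz audio
with torch.inference_mode():
    features, _ = model.extract_features(waveform)
# Expected per the config above: 48 layers of 1280-dimensional frames.
print(len(features), features[-1].shape)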
@@ -192,8 +275,53 @@ HUBERT_ASR_LARGE = Wav2Vec2PretrainedModelBundle(
 )
 HUBERT_ASR_LARGE.__doc__ = """HuBERT model with "Large" configuration.

-Pre-trained on 60,000 hours of *Libri-Light* [:footcite:`librilight`] dataset, and
-fine-tuned for ASR on 960 hours of *LibriSpeech* [:footcite:`7178964`] dataset.
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`], and
+fine-tuned for ASR on 960 hours of transcribed audio from
+*LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").

 Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].

 [`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
 """
+HUBERT_ASR_XLARGE = Wav2Vec2PretrainedModelBundle(
+    'hubert_fairseq_xlarge_ll60k_asr_ls960.pth',
+    {
+        'extractor_mode': 'layer_norm',
+        'extractor_conv_layer_config': [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        'extractor_conv_bias': False,
+        'encoder_embed_dim': 1280,
+        'encoder_projection_dropout': 0.0,
+        'encoder_pos_conv_kernel': 128,
+        'encoder_pos_conv_groups': 16,
+        'encoder_num_layers': 48,
+        'encoder_num_heads': 16,
+        'encoder_attention_dropout': 0.0,
+        'encoder_ff_interm_features': 5120,
+        'encoder_ff_interm_dropout': 0.1,
+        'encoder_dropout': 0.0,
+        'encoder_layer_norm_first': True,
+        'encoder_layer_drop': 0.1,
+        'aux_num_out': 32,
+    },
+    _labels=_get_labels(),
+)
+HUBERT_ASR_XLARGE.__doc__ = """HuBERT model with "Extra Large" configuration.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`], and
+fine-tuned for ASR on 960 hours of transcribed audio from
+*LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
+
+Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
+
+[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
+"""
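Because HUBERT_ASR_XLARGE sets aux_num_out=32 and carries _get_labels(), its output can be decoded to text. A CTC-style greedy-decoding sketch; get_labels(), the audio path, and the blank-at-index-0 convention are assumptions here, not torchaudio's actual decoder:

import torch
import torchaudio
from torchaudio.models import HUBERT_ASR_XLARGE

model = HUBERT_ASR_XLARGE.get_model().eval()
labels = HUBERT_ASR_XLARGE.get_labels()      # assumed helper returning the 32 labels
waveform, _ = torchaudio.load('speech.wav')  # hypothetical 16 kHz mono file
with torch.inference_mode():
    emissions, _ = model(waveform)           # (batch, frames, 32)

# Greedy CTC decode: argmax per frame, collapse consecutive repeats,
# then drop the blank (index 0 is assumed to be the blank token here).
indices = emissions[0].argmax(dim=-1).tolist()
collapsed = [i for i, prev in zip(indices, [None] + indices) if i != prev]
print(''.join(labels[i] for i in collapsed if i != 0))
# The integration test above expects e.g. 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'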