Unverified Commit e40c9c3c authored by moto, committed by GitHub

Add pretrained weights from wav2vec2.0 and XLSR papers (#1827)

Add pretrained weights from https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models
- Wav2Vec 2.0 Base / Large / Large (LV-60)
- XLSR-53
parent 48cfbf2b
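
For orientation, a minimal usage sketch of one of the new ASR bundles. This is hedged: it assumes the bundle exposes a `get_model()` accessor alongside the documented `labels` property, that the label set contains the fairseq special tokens, and the input file name is hypothetical.

    import torch
    import torchaudio
    from torchaudio.models import WAV2VEC2_ASR_BASE_960H

    bundle = WAV2VEC2_ASR_BASE_960H
    model = bundle.get_model()   # assumed accessor; fetches the checkpoint on first use
    labels = bundle.labels       # CTC label set exposed via the documented `labels` property

    waveform, sample_rate = torchaudio.load('speech.wav')   # hypothetical 16 kHz mono clip
    with torch.inference_mode():
        emission, _ = model(waveform)   # (batch, frame, num_labels) logits

    # Greedy CTC decoding: best label per frame, merge repeats, drop blank/special tokens.
    special = {'<s>', '<pad>', '</s>', '<unk>'}
    prev, transcript = None, []
    for idx in emission[0].argmax(dim=-1).tolist():
        if idx != prev and labels[idx] not in special:
            transcript.append(labels[idx])
        prev = idx
    print(''.join(transcript))   # e.g. 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'
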
@@ -139,6 +139,45 @@ Pre-trained Models
.. autoproperty:: labels
.. autodata:: WAV2VEC2_BASE
:no-value:
.. autodata:: WAV2VEC2_ASR_BASE_10M
:no-value:
.. autodata:: WAV2VEC2_ASR_BASE_100H
:no-value:
.. autodata:: WAV2VEC2_ASR_BASE_960H
:no-value:
.. autodata:: WAV2VEC2_LARGE
:no-value:
.. autodata:: WAV2VEC2_ASR_LARGE_10M
:no-value:
.. autodata:: WAV2VEC2_ASR_LARGE_100H
:no-value:
.. autodata:: WAV2VEC2_ASR_LARGE_960H
:no-value:
.. autodata:: WAV2VEC2_LARGE_LV60K
:no-value:
.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_10M
:no-value:
.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_100H
:no-value:
.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_960H
:no-value:
.. autodata:: WAV2VEC2_XLSR53
:no-value:
.. autodata:: HUBERT_BASE
:no-value:
...
import torchaudio
from torchaudio.models import (
WAV2VEC2_BASE,
WAV2VEC2_LARGE,
WAV2VEC2_LARGE_LV60K,
WAV2VEC2_ASR_BASE_10M,
WAV2VEC2_ASR_BASE_100H,
WAV2VEC2_ASR_BASE_960H,
WAV2VEC2_ASR_LARGE_10M,
WAV2VEC2_ASR_LARGE_100H,
WAV2VEC2_ASR_LARGE_960H,
WAV2VEC2_ASR_LARGE_LV60K_10M,
WAV2VEC2_ASR_LARGE_LV60K_100H,
WAV2VEC2_ASR_LARGE_LV60K_960H,
WAV2VEC2_XLSR53,
HUBERT_BASE,
HUBERT_LARGE,
HUBERT_XLARGE,
@@ -12,6 +25,10 @@ import pytest
@pytest.mark.parametrize(
"bundle",
[
WAV2VEC2_BASE,
WAV2VEC2_LARGE,
WAV2VEC2_LARGE_LV60K,
WAV2VEC2_XLSR53,
HUBERT_BASE,
HUBERT_LARGE,
HUBERT_XLARGE,
@@ -25,6 +42,15 @@ def test_pretraining_models(bundle):
@pytest.mark.parametrize(
"bundle,expected",
[
(WAV2VEC2_ASR_BASE_10M, 'I|HAD|THAT|CURIYOSSITY|BESID|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_BASE_100H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_BASE_960H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_LARGE_10M, 'I|HAD|THAT|CURIOUSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_LARGE_100H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_LARGE_960H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_LARGE_LV60K_10M, 'I|HAD|THAT|CURIOUSSITY|BESID|ME|AT|THISS|MOMENT|'),
(WAV2VEC2_ASR_LARGE_LV60K_100H, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_LARGE_LV60K_960H, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(HUBERT_ASR_LARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(HUBERT_ASR_XLARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|')
]
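# Note: the expected strings above are raw greedy CTC decodes ('|' marks word boundaries);
# the checkpoints fine-tuned on only 10 minutes of audio are expected to misspell words
# (e.g. 'CURIYOSSITY'), so each bundle carries its own reference transcript.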
...
@@ -19,6 +19,19 @@ from .wav2vec2 import (
)
from .wav2vec2.pretrained import (
Wav2Vec2PretrainedModelBundle,
WAV2VEC2_BASE,
WAV2VEC2_LARGE,
WAV2VEC2_LARGE_LV60K,
WAV2VEC2_ASR_BASE_10M,
WAV2VEC2_ASR_BASE_100H,
WAV2VEC2_ASR_BASE_960H,
WAV2VEC2_ASR_LARGE_10M,
WAV2VEC2_ASR_LARGE_100H,
WAV2VEC2_ASR_LARGE_960H,
WAV2VEC2_ASR_LARGE_LV60K_10M,
WAV2VEC2_ASR_LARGE_LV60K_100H,
WAV2VEC2_ASR_LARGE_LV60K_960H,
WAV2VEC2_XLSR53,
HUBERT_BASE,
HUBERT_LARGE,
HUBERT_XLARGE,
@@ -45,6 +58,19 @@ __all__ = [
'hubert_ft_large',
'hubert_ft_xlarge',
'Wav2Vec2PretrainedModelBundle',
'WAV2VEC2_BASE',
'WAV2VEC2_LARGE',
'WAV2VEC2_LARGE_LV60K',
'WAV2VEC2_ASR_BASE_10M',
'WAV2VEC2_ASR_BASE_100H',
'WAV2VEC2_ASR_BASE_960H',
'WAV2VEC2_ASR_LARGE_10M',
'WAV2VEC2_ASR_LARGE_100H',
'WAV2VEC2_ASR_LARGE_960H',
'WAV2VEC2_ASR_LARGE_LV60K_10M',
'WAV2VEC2_ASR_LARGE_LV60K_100H',
'WAV2VEC2_ASR_LARGE_LV60K_960H',
'WAV2VEC2_XLSR53',
'HUBERT_BASE',
'HUBERT_LARGE',
'HUBERT_XLARGE',
...
@@ -123,6 +123,549 @@ def _get_labels():
)
WAV2VEC2_BASE = Wav2Vec2PretrainedModelBundle(
_path='wav2vec2_fairseq_base_ls960.pth',
_params={
'extractor_mode': 'group_norm',
'extractor_conv_layer_config': [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
'extractor_conv_bias': False,
'encoder_embed_dim': 768,
'encoder_projection_dropout': 0.1,
'encoder_pos_conv_kernel': 128,
'encoder_pos_conv_groups': 16,
'encoder_num_layers': 12,
'encoder_num_heads': 12,
'encoder_attention_dropout': 0.1,
'encoder_ff_interm_features': 3072,
'encoder_ff_interm_dropout': 0.0,
'encoder_dropout': 0.1,
'encoder_layer_norm_first': False,
'encoder_layer_drop': 0.05,
"aux_num_out": None,
},
_labels=None,
)
WAV2VEC2_BASE.__doc__ = """wav2vec 2.0 model with "Base" configuration.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
Not fine-tuned.
Originally published by the authors of *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_BASE_10M = Wav2Vec2PretrainedModelBundle(
_path='wav2vec2_fairseq_base_ls960_asr_ll10m.pth',
_params={
'extractor_mode': 'group_norm',
'extractor_conv_layer_config': [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
'extractor_conv_bias': False,
'encoder_embed_dim': 768,
'encoder_projection_dropout': 0.1,
'encoder_pos_conv_kernel': 128,
'encoder_pos_conv_groups': 16,
'encoder_num_layers': 12,
'encoder_num_heads': 12,
'encoder_attention_dropout': 0.1,
'encoder_ff_interm_features': 3072,
'encoder_ff_interm_dropout': 0.0,
'encoder_dropout': 0.1,
'encoder_layer_norm_first': False,
'encoder_layer_drop': 0.05,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_BASE_10M.__doc__ = """wav2vec 2.0 model with "Base" configuration and an extra linear module.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
[:footcite:`librilight`] ("train-10min" subset).
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_BASE_100H = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_base_ls960_asr_ls100.pth',
{
'extractor_mode': 'group_norm',
'extractor_conv_layer_config': [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
'extractor_conv_bias': False,
'encoder_embed_dim': 768,
'encoder_projection_dropout': 0.1,
'encoder_pos_conv_kernel': 128,
'encoder_pos_conv_groups': 16,
'encoder_num_layers': 12,
'encoder_num_heads': 12,
'encoder_attention_dropout': 0.1,
'encoder_ff_interm_features': 3072,
'encoder_ff_interm_dropout': 0.0,
'encoder_dropout': 0.1,
'encoder_layer_norm_first': False,
'encoder_layer_drop': 0.05,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_BASE_100H.__doc__ = """wav2vec 2.0 model with "Base" configuration and an extra linear module.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 100 hours of transcribed audio from the same dataset ("train-clean-100" subset).
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_BASE_960H = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_base_ls960_asr_ls960.pth',
{
"extractor_mode": "group_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": False,
"encoder_embed_dim": 768,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 12,
"encoder_num_heads": 12,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 3072,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.1,
"encoder_layer_norm_first": False,
"encoder_layer_drop": 0.05,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_BASE_960H.__doc__ = """wav2vec 2.0 model with "Base" configuration and an extra linear module.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on the same audio with the corresponding transcripts.
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_LARGE = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_ls960.pth',
{
"extractor_mode": "group_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": False,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": False,
"encoder_layer_drop": 0.2,
"aux_num_out": None,
},
_labels=None,
)
WAV2VEC2_LARGE.__doc__ = """wav2vec 2.0 model with "Large" configuration.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
Not fine-tuned.
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_LARGE_10M = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_ls960_asr_ll10m.pth',
{
"extractor_mode": "group_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": False,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": False,
"encoder_layer_drop": 0.2,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_LARGE_10M.__doc__ = """wav2vec 2.0 model with "Large" configuration and an extra linear module.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
[:footcite:`librilight`] ("train-10min" subset).
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_LARGE_100H = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_ls960_asr_ls100.pth',
{
"extractor_mode": "group_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": False,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": False,
"encoder_layer_drop": 0.2,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_LARGE_100H.__doc__ = """wav2vec 2.0 model with "Large" configuration and an extra linear module.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 100 hours of transcribed audio from
the same dataset ("train-clean-100" subset).
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_LARGE_960H = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_ls960_asr_ls960.pth',
{
"extractor_mode": "group_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": False,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": False,
"encoder_layer_drop": 0.2,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_LARGE_960H.__doc__ = """wav2vec 2.0 model with "Large" configuration and an extra linear module.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on the same audio with the corresponding transcripts.
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_LARGE_LV60K = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_lv60k.pth',
{
"extractor_mode": "layer_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": True,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": True,
"encoder_layer_drop": 0.0,
"aux_num_out": None,
},
_labels=None,
)
WAV2VEC2_LARGE_LV60K.__doc__ = """wav2vec 2.0 model with "Large LV-60k" configuration.
Pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset [:footcite:`librilight`].
Not fine-tuned.
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_LARGE_LV60K_10M = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_lv60k_asr_ll10m.pth',
{
"extractor_mode": "layer_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": True,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": True,
"encoder_layer_drop": 0.0,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_LARGE_LV60K_10M.__doc__ = """wav2vec 2.0 model with "Large LV-60k" configuration and an extra linear module.
Pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset [:footcite:`librilight`], and
fine-tuned for ASR on 10 minutes of transcribed audio from
the same dataset ("train-10min" subset).
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_LARGE_LV60K_100H = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_lv60k_asr_ls100.pth',
{
"extractor_mode": "layer_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": True,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": True,
"encoder_layer_drop": 0.0,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_LARGE_LV60K_100H.__doc__ = """wav2vec 2.0 model with "Large LV-60k" configuration and an extra linear module.
Pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset [:footcite:`librilight`], and
fine-tuned for ASR on 100 hours of transcribed audio from
*LibriSpeech* dataset [:footcite:`7178964`] ("train-clean-100" subset).
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_LARGE_LV60K_960H = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_lv60k_asr_ls960.pth',
{
"extractor_mode": "layer_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": True,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": True,
"encoder_layer_drop": 0.0,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_LARGE_LV60K_960H.__doc__ = """wav2vec 2.0 model with "Large LV-60k" configuration and an extra linear module.
Pre-trained on 60,000 hours of unlabeled audio from *Libri-Light*
[:footcite:`librilight`] dataset, and
fine-tuned for ASR on 960 hours of transcribed audio from
*LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_XLSR53 = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_xlsr53.pth',
{
"extractor_mode": "layer_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": True,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.0,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.0,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": True,
"encoder_layer_drop": 0.0,
"aux_num_out": None,
},
_labels=None,
)
WAV2VEC2_XLSR53.__doc__ = """wav2vec 2.0 model with "Large" configuration (XLSR-53).
Trained on 56,000 hours of unlabeled audio from multiple datasets (
*Multilingual LibriSpeech* [:footcite:`Pratap_2020`],
*CommonVoice* [:footcite:`ardila2020common`] and
*BABEL* [:footcite:`Gales2014SpeechRA`]).
Not fine-tuned.
Originally published by the authors of
*Unsupervised Cross-lingual Representation Learning for Speech Recognition*
[:footcite:`conneau2020unsupervised`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
HUBERT_BASE = Wav2Vec2PretrainedModelBundle(
'hubert_fairseq_base_ls960.pth',
{
...