Unverified Commit e40c9c3c authored by moto, committed by GitHub

Add pretrained weights from wav2vec2.0 and XLSR papers (#1827)

Add pretrained weights from https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models
- Wav2Vec 2.0 Base / Large / Large (LV-60)
- XLSR-53
parent 48cfbf2b
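
For orientation, a minimal usage sketch of one of the new ASR bundles. This is hedged: it assumes the bundle exposes a `get_model()` accessor alongside the documented `labels` property, that the label set contains the fairseq special tokens, and the input file name is hypothetical.

    import torch
    import torchaudio
    from torchaudio.models import WAV2VEC2_ASR_BASE_960H

    bundle = WAV2VEC2_ASR_BASE_960H
    model = bundle.get_model()   # assumed accessor; fetches the checkpoint on first use
    labels = bundle.labels       # CTC label set exposed via the documented `labels` property

    waveform, sample_rate = torchaudio.load('speech.wav')   # hypothetical 16 kHz mono clip
    with torch.inference_mode():
        emission, _ = model(waveform)   # (batch, frame, num_labels) logits

    # Greedy CTC decoding: best label per frame, merge repeats, drop blank/special tokens.
    special = {'<s>', '<pad>', '</s>', '<unk>'}
    prev, transcript = None, []
    for idx in emission[0].argmax(dim=-1).tolist():
        if idx != prev and labels[idx] not in special:
            transcript.append(labels[idx])
        prev = idx
    print(''.join(transcript))   # e.g. 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'
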
@@ -139,6 +139,45 @@ Pre-trained Models
.. autoproperty:: labels
.. autodata:: WAV2VEC2_BASE
:no-value:
.. autodata:: WAV2VEC2_ASR_BASE_10M
:no-value:
.. autodata:: WAV2VEC2_ASR_BASE_100H
:no-value:
.. autodata:: WAV2VEC2_ASR_BASE_960H
:no-value:
.. autodata:: WAV2VEC2_LARGE
:no-value:
.. autodata:: WAV2VEC2_ASR_LARGE_10M
:no-value:
.. autodata:: WAV2VEC2_ASR_LARGE_100H
:no-value:
.. autodata:: WAV2VEC2_ASR_LARGE_960H
:no-value:
.. autodata:: WAV2VEC2_LARGE_LV60K
:no-value:
.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_10M
:no-value:
.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_100H
:no-value:
.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_960H
:no-value:
.. autodata:: WAV2VEC2_XLSR53
:no-value:
.. autodata:: HUBERT_BASE
:no-value:
...
import torchaudio
from torchaudio.models import (
WAV2VEC2_BASE,
WAV2VEC2_LARGE,
WAV2VEC2_LARGE_LV60K,
WAV2VEC2_ASR_BASE_10M,
WAV2VEC2_ASR_BASE_100H,
WAV2VEC2_ASR_BASE_960H,
WAV2VEC2_ASR_LARGE_10M,
WAV2VEC2_ASR_LARGE_100H,
WAV2VEC2_ASR_LARGE_960H,
WAV2VEC2_ASR_LARGE_LV60K_10M,
WAV2VEC2_ASR_LARGE_LV60K_100H,
WAV2VEC2_ASR_LARGE_LV60K_960H,
WAV2VEC2_XLSR53,
HUBERT_BASE,
HUBERT_LARGE,
HUBERT_XLARGE,
@@ -12,6 +25,10 @@ import pytest
@pytest.mark.parametrize(
"bundle",
[
WAV2VEC2_BASE,
WAV2VEC2_LARGE,
WAV2VEC2_LARGE_LV60K,
WAV2VEC2_XLSR53,
HUBERT_BASE,
HUBERT_LARGE,
HUBERT_XLARGE,
@@ -25,6 +42,15 @@ def test_pretraining_models(bundle):
@pytest.mark.parametrize(
"bundle,expected",
[
(WAV2VEC2_ASR_BASE_10M, 'I|HAD|THAT|CURIYOSSITY|BESID|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_BASE_100H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_BASE_960H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_LARGE_10M, 'I|HAD|THAT|CURIOUSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_LARGE_100H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_LARGE_960H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_LARGE_LV60K_10M, 'I|HAD|THAT|CURIOUSSITY|BESID|ME|AT|THISS|MOMENT|'),
(WAV2VEC2_ASR_LARGE_LV60K_100H, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(WAV2VEC2_ASR_LARGE_LV60K_960H, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(HUBERT_ASR_LARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(HUBERT_ASR_XLARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|')
]
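# Note: the expected strings above are raw greedy CTC decodes ('|' marks word boundaries);
# the checkpoints fine-tuned on only 10 minutes of audio are expected to misspell words
# (e.g. 'CURIYOSSITY'), so each bundle carries its own reference transcript.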
...
@@ -19,6 +19,19 @@ from .wav2vec2 import (
)
from .wav2vec2.pretrained import (
Wav2Vec2PretrainedModelBundle,
WAV2VEC2_BASE,
WAV2VEC2_LARGE,
WAV2VEC2_LARGE_LV60K,
WAV2VEC2_ASR_BASE_10M,
WAV2VEC2_ASR_BASE_100H,
WAV2VEC2_ASR_BASE_960H,
WAV2VEC2_ASR_LARGE_10M,
WAV2VEC2_ASR_LARGE_100H,
WAV2VEC2_ASR_LARGE_960H,
WAV2VEC2_ASR_LARGE_LV60K_10M,
WAV2VEC2_ASR_LARGE_LV60K_100H,
WAV2VEC2_ASR_LARGE_LV60K_960H,
WAV2VEC2_XLSR53,
HUBERT_BASE,
HUBERT_LARGE,
HUBERT_XLARGE,
@@ -45,6 +58,19 @@ __all__ = [
'hubert_ft_large',
'hubert_ft_xlarge',
'Wav2Vec2PretrainedModelBundle',
'WAV2VEC2_BASE',
'WAV2VEC2_LARGE',
'WAV2VEC2_LARGE_LV60K',
'WAV2VEC2_ASR_BASE_10M',
'WAV2VEC2_ASR_BASE_100H',
'WAV2VEC2_ASR_BASE_960H',
'WAV2VEC2_ASR_LARGE_10M',
'WAV2VEC2_ASR_LARGE_100H',
'WAV2VEC2_ASR_LARGE_960H',
'WAV2VEC2_ASR_LARGE_LV60K_10M',
'WAV2VEC2_ASR_LARGE_LV60K_100H',
'WAV2VEC2_ASR_LARGE_LV60K_960H',
'WAV2VEC2_XLSR53',
'HUBERT_BASE',
'HUBERT_LARGE',
'HUBERT_XLARGE',
...
@@ -123,6 +123,549 @@ def _get_labels():
)
WAV2VEC2_BASE = Wav2Vec2PretrainedModelBundle(
_path='wav2vec2_fairseq_base_ls960.pth',
_params={
'extractor_mode': 'group_norm',
'extractor_conv_layer_config': [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
'extractor_conv_bias': False,
'encoder_embed_dim': 768,
'encoder_projection_dropout': 0.1,
'encoder_pos_conv_kernel': 128,
'encoder_pos_conv_groups': 16,
'encoder_num_layers': 12,
'encoder_num_heads': 12,
'encoder_attention_dropout': 0.1,
'encoder_ff_interm_features': 3072,
'encoder_ff_interm_dropout': 0.0,
'encoder_dropout': 0.1,
'encoder_layer_norm_first': False,
'encoder_layer_drop': 0.05,
"aux_num_out": None,
},
_labels=None,
)
WAV2VEC2_BASE.__doc__ = """wav2vec 2.0 model with "Base" configuration.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
Not fine-tuned.
Originally published by the authors of *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_BASE_10M = Wav2Vec2PretrainedModelBundle(
_path='wav2vec2_fairseq_base_ls960_asr_ll10m.pth',
_params={
'extractor_mode': 'group_norm',
'extractor_conv_layer_config': [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
'extractor_conv_bias': False,
'encoder_embed_dim': 768,
'encoder_projection_dropout': 0.1,
'encoder_pos_conv_kernel': 128,
'encoder_pos_conv_groups': 16,
'encoder_num_layers': 12,
'encoder_num_heads': 12,
'encoder_attention_dropout': 0.1,
'encoder_ff_interm_features': 3072,
'encoder_ff_interm_dropout': 0.0,
'encoder_dropout': 0.1,
'encoder_layer_norm_first': False,
'encoder_layer_drop': 0.05,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_BASE_10M.__doc__ = """wav2vec 2.0 model with "Base" configuration and an extra linear module.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
[:footcite:`librilight`] ("train-10min" subset).
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_BASE_100H = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_base_ls960_asr_ls100.pth',
{
'extractor_mode': 'group_norm',
'extractor_conv_layer_config': [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
'extractor_conv_bias': False,
'encoder_embed_dim': 768,
'encoder_projection_dropout': 0.1,
'encoder_pos_conv_kernel': 128,
'encoder_pos_conv_groups': 16,
'encoder_num_layers': 12,
'encoder_num_heads': 12,
'encoder_attention_dropout': 0.1,
'encoder_ff_interm_features': 3072,
'encoder_ff_interm_dropout': 0.0,
'encoder_dropout': 0.1,
'encoder_layer_norm_first': False,
'encoder_layer_drop': 0.05,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_BASE_100H.__doc__ = """wav2vec 2.0 model with "Base" configuration and an extra linear module.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 100 hours of transcribed audio from the same dataset ("train-clean-100" subset).
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_BASE_960H = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_base_ls960_asr_ls960.pth',
{
"extractor_mode": "group_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": False,
"encoder_embed_dim": 768,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 12,
"encoder_num_heads": 12,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 3072,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.1,
"encoder_layer_norm_first": False,
"encoder_layer_drop": 0.05,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_BASE_960H.__doc__ = """wav2vec 2.0 model with "Base" configuration and an extra linear module.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on the same audio with the corresponding transcripts.
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_LARGE = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_ls960.pth',
{
"extractor_mode": "group_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": False,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": False,
"encoder_layer_drop": 0.2,
"aux_num_out": None,
},
_labels=None,
)
WAV2VEC2_LARGE.__doc__ = """wav2vec 2.0 model with "Large" configuration.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
Not fine-tuned.
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_LARGE_10M = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_ls960_asr_ll10m.pth',
{
"extractor_mode": "group_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": False,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": False,
"encoder_layer_drop": 0.2,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_LARGE_10M.__doc__ = """wav2vec 2.0 model with "Large" configuration and an extra linear module.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
[:footcite:`librilight`] ("train-10min" subset).
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_LARGE_100H = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_ls960_asr_ls100.pth',
{
"extractor_mode": "group_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": False,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": False,
"encoder_layer_drop": 0.2,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_LARGE_100H.__doc__ = """wav2vec 2.0 model with "Large" configuration and an extra linear module.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 100 hours of transcribed audio from
the same dataset ("train-clean-100" subset).
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_LARGE_960H = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_ls960_asr_ls960.pth',
{
"extractor_mode": "group_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": False,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": False,
"encoder_layer_drop": 0.2,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_LARGE_960H.__doc__ = """wav2vec 2.0 model with "Large" configuration and an extra linear module.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on the same audio with the corresponding transcripts.
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_LARGE_LV60K = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_lv60k.pth',
{
"extractor_mode": "layer_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": True,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": True,
"encoder_layer_drop": 0.0,
"aux_num_out": None,
},
_labels=None,
)
WAV2VEC2_LARGE_LV60K.__doc__ = """wav2vec 2.0 model with "Large LV-60k" configuration.
Pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset [:footcite:`librilight`].
Not fine-tuned.
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_LARGE_LV60K_10M = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_lv60k_asr_ll10m.pth',
{
"extractor_mode": "layer_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": True,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": True,
"encoder_layer_drop": 0.0,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_LARGE_LV60K_10M.__doc__ = """wav2vec 2.0 model with "Large LV-60k" configuration and an extra linear module.
Pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset [:footcite:`librilight`], and
fine-tuned for ASR on 10 minutes of transcribed audio from
the same dataset ("train-10min" subset).
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_LARGE_LV60K_100H = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_lv60k_asr_ls100.pth',
{
"extractor_mode": "layer_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": True,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": True,
"encoder_layer_drop": 0.0,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_LARGE_LV60K_100H.__doc__ = """wav2vec 2.0 model with "Large LV-60k" configuration and an extra linear module.
Pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset [:footcite:`librilight`], and
fine-tuned for ASR on 100 hours of transcribed audio from
*LibriSpeech* dataset [:footcite:`7178964`] ("train-clean-100" subset).
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_ASR_LARGE_LV60K_960H = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_lv60k_asr_ls960.pth',
{
"extractor_mode": "layer_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": True,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.1,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.1,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": True,
"encoder_layer_drop": 0.0,
"aux_num_out": 32,
},
_labels=_get_labels(),
)
WAV2VEC2_ASR_LARGE_LV60K_960H.__doc__ = """wav2vec 2.0 model with "Large LV-60k" configuration and an extra linear module.
Pre-trained on 60,000 hours of unlabeled audio from *Libri-Light*
[:footcite:`librilight`] dataset, and
fine-tuned for ASR on 960 hours of transcribed audio from
*LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
Originally published by the authors of *wav2vec 2.0*
[:footcite:`baevski2020wav2vec`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
WAV2VEC2_XLSR53 = Wav2Vec2PretrainedModelBundle(
'wav2vec2_fairseq_large_xlsr53.pth',
{
"extractor_mode": "layer_norm",
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": True,
"encoder_embed_dim": 1024,
"encoder_projection_dropout": 0.0,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 24,
"encoder_num_heads": 16,
"encoder_attention_dropout": 0.0,
"encoder_ff_interm_features": 4096,
"encoder_ff_interm_dropout": 0.0,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": True,
"encoder_layer_drop": 0.0,
"aux_num_out": None,
},
_labels=None,
)
WAV2VEC2_XLSR53.__doc__ = """wav2vec 2.0 model with "Large" configuration (XLSR-53).
Trained on 56,000 hours of unlabeled audio from multiple datasets (
*Multilingual LibriSpeech* [:footcite:`Pratap_2020`],
*CommonVoice* [:footcite:`ardila2020common`] and
*BABEL* [:footcite:`Gales2014SpeechRA`]).
Not fine-tuned.
Originally published by the authors of
*Unsupervised Cross-lingual Representation Learning for Speech Recognition*
[:footcite:`conneau2020unsupervised`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#pre-trained-models>`__]
"""
HUBERT_BASE = Wav2Vec2PretrainedModelBundle(
'hubert_fairseq_base_ls960.pth',
{
...