Commit 384e4471 authored by moto

Add the rest of HuBERT pretrained models (#1824)

This commit adds
- HUBERT_LARGE
- HUBERT_XLARGE
- HUBERT_ASR_XLARGE
parent 38c5b10f
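
A minimal usage sketch for the newly added pre-trained bundles (not part of this commit; `get_model()` and the exact import location follow the bundle API as it appears in later torchaudio releases, so they are assumptions here):

```python
import torch
import torchaudio
from torchaudio.models import HUBERT_LARGE

# Assumption: the bundle exposes get_model(), as in the released
# torchaudio.pipelines API; the method may differ at this commit.
model = HUBERT_LARGE.get_model().eval()

# "speech.wav" is a placeholder path; HuBERT expects 16 kHz mono audio.
waveform, sample_rate = torchaudio.load("speech.wav")
if sample_rate != 16000:
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

with torch.inference_mode():
    # The pre-trained (not fine-tuned) bundles expose frame-level features,
    # one tensor per transformer layer (1024-dim for the "Large" config).
    features, lengths = model.extract_features(waveform)
```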
@@ -142,9 +142,18 @@ Pre-trained Models
.. autodata:: HUBERT_BASE
   :no-value:

.. autodata:: HUBERT_LARGE
   :no-value:

.. autodata:: HUBERT_XLARGE
   :no-value:

.. autodata:: HUBERT_ASR_LARGE
   :no-value:

.. autodata:: HUBERT_ASR_XLARGE
   :no-value:

Utility Functions
-----------------
...
import torchaudio
from torchaudio.models import (
    HUBERT_BASE,
    HUBERT_LARGE,
    HUBERT_XLARGE,
    HUBERT_ASR_LARGE,
    HUBERT_ASR_XLARGE,
)
import pytest
@@ -10,6 +13,8 @@ import pytest
    "bundle",
    [
        HUBERT_BASE,
        HUBERT_LARGE,
        HUBERT_XLARGE,
    ]
)
def test_pretraining_models(bundle):
@@ -21,6 +26,7 @@ def test_pretraining_models(bundle):
    "bundle,expected",
    [
        (HUBERT_ASR_LARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
        (HUBERT_ASR_XLARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|')
    ]
)
def test_finetune_asr_model(
...
@@ -20,7 +20,10 @@ from .wav2vec2 import (
from .wav2vec2.pretrained import (
    Wav2Vec2PretrainedModelBundle,
    HUBERT_BASE,
    HUBERT_LARGE,
    HUBERT_XLARGE,
    HUBERT_ASR_LARGE,
    HUBERT_ASR_XLARGE,
)
__all__ = [
@@ -43,7 +46,10 @@ __all__ = [
    'hubert_ft_xlarge',
    'Wav2Vec2PretrainedModelBundle',
    'HUBERT_BASE',
    'HUBERT_LARGE',
    'HUBERT_XLARGE',
    'HUBERT_ASR_LARGE',
    'HUBERT_ASR_XLARGE',
    'Tacotron2',
    'tacotron2',
]
@@ -122,6 +122,7 @@ def _get_labels():
        'Z',
    )
HUBERT_BASE = Wav2Vec2PretrainedModelBundle(
    'hubert_fairseq_base_ls960.pth',
    {
@@ -154,7 +155,89 @@ HUBERT_BASE = Wav2Vec2PretrainedModelBundle(
)
HUBERT_BASE.__doc__ = """HuBERT model with "Base" configuration.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
Not fine-tuned.
Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
"""
HUBERT_LARGE = Wav2Vec2PretrainedModelBundle(
    'hubert_fairseq_large_ll60k.pth',
    {
        'extractor_mode': 'layer_norm',
        'extractor_conv_layer_config': [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        'extractor_conv_bias': False,
        'encoder_embed_dim': 1024,
        'encoder_projection_dropout': 0.0,
        'encoder_pos_conv_kernel': 128,
        'encoder_pos_conv_groups': 16,
        'encoder_num_layers': 24,
        'encoder_num_heads': 16,
        'encoder_attention_dropout': 0.0,
        'encoder_ff_interm_features': 4096,
        'encoder_ff_interm_dropout': 0.0,
        'encoder_dropout': 0.0,
        'encoder_layer_norm_first': True,
        'encoder_layer_drop': 0.0,
        'aux_num_out': None,
    },
    _labels=None,
)
HUBERT_LARGE.__doc__ = """HuBERT model with "Large" configuration.
Pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset [:footcite:`librilight`].
Not fine-tuned.
Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
"""
HUBERT_XLARGE = Wav2Vec2PretrainedModelBundle(
    'hubert_fairseq_xlarge_ll60k.pth',
    {
        'extractor_mode': 'layer_norm',
        'extractor_conv_layer_config': [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        'extractor_conv_bias': False,
        'encoder_embed_dim': 1280,
        'encoder_projection_dropout': 0.0,
        'encoder_pos_conv_kernel': 128,
        'encoder_pos_conv_groups': 16,
        'encoder_num_layers': 48,
        'encoder_num_heads': 16,
        'encoder_attention_dropout': 0.0,
        'encoder_ff_interm_features': 5120,
        'encoder_ff_interm_dropout': 0.0,
        'encoder_dropout': 0.0,
        'encoder_layer_norm_first': True,
        'encoder_layer_drop': 0.0,
        'aux_num_out': None,
    },
    _labels=None,
)
HUBERT_XLARGE.__doc__ = """HuBERT model with "Extra Large" configuration.
Pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset [:footcite:`librilight`].
Not fine-tuned.
Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
@@ -192,8 +275,53 @@ HUBERT_ASR_LARGE = Wav2Vec2PretrainedModelBundle(
)
HUBERT_ASR_LARGE.__doc__ = """HuBERT model with "Large" configuration.
Pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset [:footcite:`librilight`], and
fine-tuned for ASR on 960 hours of transcribed audio from
*LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
"""
HUBERT_ASR_XLARGE = Wav2Vec2PretrainedModelBundle(
    'hubert_fairseq_xlarge_ll60k_asr_ls960.pth',
    {
        'extractor_mode': 'layer_norm',
        'extractor_conv_layer_config': [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        'extractor_conv_bias': False,
        'encoder_embed_dim': 1280,
        'encoder_projection_dropout': 0.0,
        'encoder_pos_conv_kernel': 128,
        'encoder_pos_conv_groups': 16,
        'encoder_num_layers': 48,
        'encoder_num_heads': 16,
        'encoder_attention_dropout': 0.0,
        'encoder_ff_interm_features': 5120,
        'encoder_ff_interm_dropout': 0.1,
        'encoder_dropout': 0.0,
        'encoder_layer_norm_first': True,
        'encoder_layer_drop': 0.1,
        'aux_num_out': 32,
    },
    _labels=_get_labels(),
)
HUBERT_ASR_XLARGE.__doc__ = """HuBERT model with "Extra Large" configuration.
Pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset [:footcite:`librilight`], and
fine-tuned for ASR on 960 hours of transcribed audio from
*LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
...
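
The fine-tuned bundles (HUBERT_ASR_LARGE, HUBERT_ASR_XLARGE) pair the encoder with a character-level output head (`aux_num_out` classes) and the label set from `_get_labels()`. A rough inference-and-decoding sketch is below; it is not part of this commit, and `get_model()`/`get_labels()` as well as the blank-token index are assumptions based on the bundle API of later torchaudio releases:

```python
import torch
import torchaudio
from torchaudio.models import HUBERT_ASR_XLARGE

# Assumptions: get_model()/get_labels() follow the bundle API of later
# torchaudio releases; "speech.wav" is a placeholder path.
model = HUBERT_ASR_XLARGE.get_model().eval()
labels = HUBERT_ASR_XLARGE.get_labels()

waveform, sample_rate = torchaudio.load("speech.wav")
if sample_rate != 16000:
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

with torch.inference_mode():
    # The ASR bundles attach a linear output head, so the model
    # returns per-frame logits over the character vocabulary.
    emissions, _ = model(waveform)

# Greedy (argmax) CTC decoding: collapse repeats, drop the blank token
# (index 0 is an assumption) and any multi-character special tokens.
blank = 0
indices = torch.unique_consecutive(torch.argmax(emissions[0], dim=-1)).tolist()
transcript = "".join(labels[i] for i in indices if i != blank and len(labels[i]) == 1)
print(transcript.replace("|", " "))
```

As in the updated tests, the fine-tuned models emit character sequences with `|` as the word delimiter (e.g. `I|HAVE|THAT|CURIOSITY|...`).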