Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Torchaudio
Commits
384e4471
Commit
384e4471
authored
Oct 05, 2021
by
moto
Browse files
Add the rest of HuBERT pretrained models (#1824)
This commit adds the following pretrained model bundles: HUBERT_LARGE, HUBERT_XLARGE, and HUBERT_ASR_XLARGE.
parent
38c5b10f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
152 additions
and
3 deletions
+152
-3
docs/source/models.rst
docs/source/models.rst
+9
-0
test/integration_tests/wav2vec2_model_test.py
test/integration_tests/wav2vec2_model_test.py
+6
-0
torchaudio/models/__init__.py
torchaudio/models/__init__.py
+6
-0
torchaudio/models/wav2vec2/pretrained.py
torchaudio/models/wav2vec2/pretrained.py
+131
-3
No files found.
docs/source/models.rst
View file @
384e4471
...
@@ -142,9 +142,18 @@ Pre-trained Models
...
@@ -142,9 +142,18 @@ Pre-trained Models
.. autodata:: HUBERT_BASE
.. autodata:: HUBERT_BASE
:no-value:
:no-value:
.. autodata:: HUBERT_LARGE
:no-value:
.. autodata:: HUBERT_XLARGE
:no-value:
.. autodata:: HUBERT_ASR_LARGE
.. autodata:: HUBERT_ASR_LARGE
:no-value:
:no-value:
.. autodata:: HUBERT_ASR_XLARGE
:no-value:
Utility Functions
Utility Functions
-----------------
-----------------
...
...
test/integration_tests/wav2vec2_model_test.py
View file @
384e4471
import
torchaudio
import
torchaudio
from
torchaudio.models
import
(
from
torchaudio.models
import
(
HUBERT_BASE
,
HUBERT_BASE
,
HUBERT_LARGE
,
HUBERT_XLARGE
,
HUBERT_ASR_LARGE
,
HUBERT_ASR_LARGE
,
HUBERT_ASR_XLARGE
,
)
)
import
pytest
import
pytest
...
@@ -10,6 +13,8 @@ import pytest
...
@@ -10,6 +13,8 @@ import pytest
"bundle"
,
"bundle"
,
[
[
HUBERT_BASE
,
HUBERT_BASE
,
HUBERT_LARGE
,
HUBERT_XLARGE
,
]
]
)
)
def
test_pretraining_models
(
bundle
):
def
test_pretraining_models
(
bundle
):
...
@@ -21,6 +26,7 @@ def test_pretraining_models(bundle):
...
@@ -21,6 +26,7 @@ def test_pretraining_models(bundle):
"bundle,expected"
,
"bundle,expected"
,
[
[
(
HUBERT_ASR_LARGE
,
'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'
),
(
HUBERT_ASR_LARGE
,
'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'
),
(
HUBERT_ASR_XLARGE
,
'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'
)
]
]
)
)
def
test_finetune_asr_model
(
def
test_finetune_asr_model
(
...
...
torchaudio/models/__init__.py
View file @
384e4471
...
@@ -20,7 +20,10 @@ from .wav2vec2 import (
...
@@ -20,7 +20,10 @@ from .wav2vec2 import (
from
.wav2vec2.pretrained
import
(
from
.wav2vec2.pretrained
import
(
Wav2Vec2PretrainedModelBundle
,
Wav2Vec2PretrainedModelBundle
,
HUBERT_BASE
,
HUBERT_BASE
,
HUBERT_LARGE
,
HUBERT_XLARGE
,
HUBERT_ASR_LARGE
,
HUBERT_ASR_LARGE
,
HUBERT_ASR_XLARGE
,
)
)
__all__
=
[
__all__
=
[
...
@@ -43,7 +46,10 @@ __all__ = [
...
@@ -43,7 +46,10 @@ __all__ = [
'hubert_ft_xlarge'
,
'hubert_ft_xlarge'
,
'Wav2Vec2PretrainedModelBundle'
,
'Wav2Vec2PretrainedModelBundle'
,
'HUBERT_BASE'
,
'HUBERT_BASE'
,
'HUBERT_LARGE'
,
'HUBERT_XLARGE'
,
'HUBERT_ASR_LARGE'
,
'HUBERT_ASR_LARGE'
,
'HUBERT_ASR_XLARGE'
,
'Tacotron2'
,
'Tacotron2'
,
'tacotron2'
,
'tacotron2'
,
]
]
torchaudio/models/wav2vec2/pretrained.py
View file @
384e4471
...
@@ -122,6 +122,7 @@ def _get_labels():
...
@@ -122,6 +122,7 @@ def _get_labels():
'Z'
,
'Z'
,
)
)
HUBERT_BASE
=
Wav2Vec2PretrainedModelBundle
(
HUBERT_BASE
=
Wav2Vec2PretrainedModelBundle
(
'hubert_fairseq_base_ls960.pth'
,
'hubert_fairseq_base_ls960.pth'
,
{
{
...
@@ -154,7 +155,89 @@ HUBERT_BASE = Wav2Vec2PretrainedModelBundle(
...
@@ -154,7 +155,89 @@ HUBERT_BASE = Wav2Vec2PretrainedModelBundle(
)
)
HUBERT_BASE
.
__doc__
=
"""HuBERT model with "Base" configuration.
HUBERT_BASE
.
__doc__
=
"""HuBERT model with "Base" configuration.
Trained on 960 hours of *LibriSpeech* [:footcite:`7178964`] dataset. Not fine-tuned.
Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
Not fine-tuned.
Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
"""
# Bundle for the "Large" HuBERT checkpoint. No auxiliary output layer is
# configured (``aux_num_out`` is None) and no labels are attached.
HUBERT_LARGE = Wav2Vec2PretrainedModelBundle(
    'hubert_fairseq_large_ll60k.pth',
    {
        # --- feature extractor ---
        'extractor_mode': 'layer_norm',
        # Seven 3-tuples: one (512, 10, 5), four (512, 3, 2), two (512, 2, 2).
        # NOTE(review): presumably (out_channels, kernel_size, stride) -- confirm
        # against the model factory that consumes this config.
        'extractor_conv_layer_config': (
            [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
        ),
        'extractor_conv_bias': False,
        # --- encoder ---
        'encoder_embed_dim': 1024,
        'encoder_projection_dropout': 0.0,
        'encoder_pos_conv_kernel': 128,
        'encoder_pos_conv_groups': 16,
        'encoder_num_layers': 24,
        'encoder_num_heads': 16,
        'encoder_attention_dropout': 0.0,
        'encoder_ff_interm_features': 4096,
        'encoder_ff_interm_dropout': 0.0,
        'encoder_dropout': 0.0,
        'encoder_layer_norm_first': True,
        'encoder_layer_drop': 0.0,
        # --- auxiliary output ---
        'aux_num_out': None,
    },
    _labels=None,
)
HUBERT_LARGE
.
__doc__
=
"""HuBERT model with "Large" configuration.
Pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset [:footcite:`librilight`].
Not fine-tuned.
Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
"""
# Bundle for the "Extra Large" HuBERT checkpoint. No auxiliary output layer is
# configured (``aux_num_out`` is None) and no labels are attached.
HUBERT_XLARGE = Wav2Vec2PretrainedModelBundle(
    'hubert_fairseq_xlarge_ll60k.pth',
    {
        # --- feature extractor ---
        'extractor_mode': 'layer_norm',
        # Seven 3-tuples: one (512, 10, 5), four (512, 3, 2), two (512, 2, 2).
        # NOTE(review): presumably (out_channels, kernel_size, stride) -- confirm
        # against the model factory that consumes this config.
        'extractor_conv_layer_config': (
            [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
        ),
        'extractor_conv_bias': False,
        # --- encoder ---
        'encoder_embed_dim': 1280,
        'encoder_projection_dropout': 0.0,
        'encoder_pos_conv_kernel': 128,
        'encoder_pos_conv_groups': 16,
        'encoder_num_layers': 48,
        'encoder_num_heads': 16,
        'encoder_attention_dropout': 0.0,
        'encoder_ff_interm_features': 5120,
        'encoder_ff_interm_dropout': 0.0,
        'encoder_dropout': 0.0,
        'encoder_layer_norm_first': True,
        'encoder_layer_drop': 0.0,
        # --- auxiliary output ---
        'aux_num_out': None,
    },
    _labels=None,
)
HUBERT_XLARGE
.
__doc__
=
"""HuBERT model with "Extra Large" configuration.
Pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset [:footcite:`librilight`].
Not fine-tuned.
Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
...
@@ -192,8 +275,53 @@ HUBERT_ASR_LARGE = Wav2Vec2PretrainedModelBundle(
...
@@ -192,8 +275,53 @@ HUBERT_ASR_LARGE = Wav2Vec2PretrainedModelBundle(
)
)
HUBERT_ASR_LARGE
.
__doc__
=
"""HuBERT model with "Large" configuration.
HUBERT_ASR_LARGE
.
__doc__
=
"""HuBERT model with "Large" configuration.
Pre-trained on 60,000 hours of *Libri-Light* [:footcite:`librilight`] dataset, and
Pre-trained on 60,000 hours of unlabeled audio from
fine-tuned for ASR on 960 hours of *LibriSpeech* [:footcite:`7178964`] dataset.
*Libri-Light* dataset [:footcite:`librilight`], and
fine-tuned for ASR on 960 hours of transcribed audio from
*LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
"""
# Bundle for the "Extra Large" HuBERT checkpoint with an auxiliary output
# layer of size 32; labels are supplied by ``_get_labels()``.
HUBERT_ASR_XLARGE = Wav2Vec2PretrainedModelBundle(
    'hubert_fairseq_xlarge_ll60k_asr_ls960.pth',
    {
        # --- feature extractor ---
        'extractor_mode': 'layer_norm',
        # Seven 3-tuples: one (512, 10, 5), four (512, 3, 2), two (512, 2, 2).
        # NOTE(review): presumably (out_channels, kernel_size, stride) -- confirm
        # against the model factory that consumes this config.
        'extractor_conv_layer_config': (
            [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
        ),
        'extractor_conv_bias': False,
        # --- encoder ---
        'encoder_embed_dim': 1280,
        'encoder_projection_dropout': 0.0,
        'encoder_pos_conv_kernel': 128,
        'encoder_pos_conv_groups': 16,
        'encoder_num_layers': 48,
        'encoder_num_heads': 16,
        'encoder_attention_dropout': 0.0,
        'encoder_ff_interm_features': 5120,
        # Unlike the non-ASR XLARGE config, these two values are 0.1, not 0.0.
        'encoder_ff_interm_dropout': 0.1,
        'encoder_dropout': 0.0,
        'encoder_layer_norm_first': True,
        'encoder_layer_drop': 0.1,
        # --- auxiliary output ---
        'aux_num_out': 32,
    },
    _labels=_get_labels(),
)
HUBERT_ASR_XLARGE
.
__doc__
=
"""HuBERT model with "Extra Large" configuration.
Pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset [:footcite:`librilight`], and
fine-tuned for ASR on 960 hours of transcribed audio from
*LibriSpeech* dataset [:footcite:`7178964`]
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment