Commit c9e4c75d (unverified)
Authored Oct 05, 2021 by moto; committed by GitHub on Oct 05, 2021
Add the rest of HuBERT pretrained models (#1824)
This commit adds:
- HUBERT_LARGE
- HUBERT_XLARGE
- HUBERT_ASR_XLARGE
Parent: 181f0c80
Showing 4 changed files with 152 additions and 3 deletions
docs/source/models.rst                          +9    -0
test/integration_tests/wav2vec2_model_test.py   +6    -0
torchaudio/models/__init__.py                   +6    -0
torchaudio/models/wav2vec2/pretrained.py        +131  -3
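Before the per-file diffs, a minimal usage sketch of one of the new bundles. This assumes the prototype Wav2Vec2PretrainedModelBundle exposes get_model() the way the later torchaudio.pipelines bundles do, and the audio path is hypothetical:

import torch
import torchaudio
from torchaudio.models import HUBERT_ASR_XLARGE

# Assumed API: get_model() downloads the checkpoint and returns a Wav2Vec2Model.
model = HUBERT_ASR_XLARGE.get_model().eval()
waveform, sample_rate = torchaudio.load('speech.wav')  # hypothetical 16 kHz mono file
with torch.inference_mode():
    emissions, _ = model(waveform)  # per-frame logits over the 32 character labels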
docs/source/models.rst
@@ -142,9 +142,18 @@ Pre-trained Models
 .. autodata:: HUBERT_BASE
    :no-value:

+.. autodata:: HUBERT_LARGE
+   :no-value:
+
+.. autodata:: HUBERT_XLARGE
+   :no-value:
+
 .. autodata:: HUBERT_ASR_LARGE
    :no-value:

+.. autodata:: HUBERT_ASR_XLARGE
+   :no-value:
+
 Utility Functions
 -----------------
test/integration_tests/wav2vec2_model_test.py
 import torchaudio
 from torchaudio.models import (
     HUBERT_BASE,
+    HUBERT_LARGE,
+    HUBERT_XLARGE,
     HUBERT_ASR_LARGE,
+    HUBERT_ASR_XLARGE,
 )
 import pytest
@@ -10,6 +13,8 @@ import pytest
     "bundle",
     [
         HUBERT_BASE,
+        HUBERT_LARGE,
+        HUBERT_XLARGE,
     ]
 )
 def test_pretraining_models(bundle):
@@ -21,6 +26,7 @@ def test_pretraining_models(bundle):
     "bundle,expected",
     [
         (HUBERT_ASR_LARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (HUBERT_ASR_XLARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|')
     ]
 )
 def test_finetune_asr_model(
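The test bodies are collapsed in this view. As a sketch only (the function name, shapes, and assertions below are illustrative, not the repository's actual test code), a pretraining-bundle test of this shape typically builds the model from the bundle and verifies a forward pass:

import torch
import pytest
from torchaudio.models import HUBERT_BASE, HUBERT_LARGE, HUBERT_XLARGE

@pytest.mark.parametrize("bundle", [HUBERT_BASE, HUBERT_LARGE, HUBERT_XLARGE])
def test_pretraining_models_sketch(bundle):
    # Illustrative: load the pretrained weights and run one second of
    # random 16 kHz audio through the feature extractor.
    model = bundle.get_model().eval()
    waveform = torch.randn(1, 16000)
    with torch.inference_mode():
        features, _ = model.extract_features(waveform)
    assert len(features) > 0  # one tensor per transformer layer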
torchaudio/models/__init__.py
@@ -20,7 +20,10 @@ from .wav2vec2 import (
 from .wav2vec2.pretrained import (
     Wav2Vec2PretrainedModelBundle,
     HUBERT_BASE,
+    HUBERT_LARGE,
+    HUBERT_XLARGE,
     HUBERT_ASR_LARGE,
+    HUBERT_ASR_XLARGE,
 )

 __all__ = [
@@ -43,7 +46,10 @@ __all__ = [
     'hubert_ft_xlarge',
     'Wav2Vec2PretrainedModelBundle',
     'HUBERT_BASE',
+    'HUBERT_LARGE',
+    'HUBERT_XLARGE',
     'HUBERT_ASR_LARGE',
+    'HUBERT_ASR_XLARGE',
     'Tacotron2',
     'tacotron2',
 ]
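With these re-exports in place, the new bundles are importable straight from the top-level namespace:

from torchaudio.models import HUBERT_LARGE, HUBERT_XLARGE, HUBERT_ASR_XLARGE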
torchaudio/models/wav2vec2/pretrained.py
@@ -122,6 +122,7 @@ def _get_labels():
     'Z',
 )

+
 HUBERT_BASE = Wav2Vec2PretrainedModelBundle(
     'hubert_fairseq_base_ls960.pth',
     {
@@ -154,7 +155,89 @@ HUBERT_BASE = Wav2Vec2PretrainedModelBundle(
 )
 HUBERT_BASE.__doc__ = """HuBERT model with "Base" configuration.

-Trained on 960 hours of *LibriSpeech* [:footcite:`7178964`] dataset. Not fine-tuned.
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
+Not fine-tuned.

 Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].

 [`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
 """
+HUBERT_LARGE = Wav2Vec2PretrainedModelBundle(
+    'hubert_fairseq_large_ll60k.pth',
+    {
+        'extractor_mode': 'layer_norm',
+        'extractor_conv_layer_config': [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        'extractor_conv_bias': False,
+        'encoder_embed_dim': 1024,
+        'encoder_projection_dropout': 0.0,
+        'encoder_pos_conv_kernel': 128,
+        'encoder_pos_conv_groups': 16,
+        'encoder_num_layers': 24,
+        'encoder_num_heads': 16,
+        'encoder_attention_dropout': 0.0,
+        'encoder_ff_interm_features': 4096,
+        'encoder_ff_interm_dropout': 0.0,
+        'encoder_dropout': 0.0,
+        'encoder_layer_norm_first': True,
+        'encoder_layer_drop': 0.0,
+        'aux_num_out': None,
+    },
+    _labels=None,
+)
+HUBERT_LARGE.__doc__ = """HuBERT model with "Large" configuration.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`].
+Not fine-tuned.
+
+Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
+
+[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
+"""
+HUBERT_XLARGE = Wav2Vec2PretrainedModelBundle(
+    'hubert_fairseq_xlarge_ll60k.pth',
+    {
+        'extractor_mode': 'layer_norm',
+        'extractor_conv_layer_config': [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        'extractor_conv_bias': False,
+        'encoder_embed_dim': 1280,
+        'encoder_projection_dropout': 0.0,
+        'encoder_pos_conv_kernel': 128,
+        'encoder_pos_conv_groups': 16,
+        'encoder_num_layers': 48,
+        'encoder_num_heads': 16,
+        'encoder_attention_dropout': 0.0,
+        'encoder_ff_interm_features': 5120,
+        'encoder_ff_interm_dropout': 0.0,
+        'encoder_dropout': 0.0,
+        'encoder_layer_norm_first': True,
+        'encoder_layer_drop': 0.0,
+        'aux_num_out': None,
+    },
+    _labels=None,
+)
+HUBERT_XLARGE.__doc__ = """HuBERT model with "Extra Large" configuration.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`].
+Not fine-tuned.
+
+Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
+
+[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
+"""
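HUBERT_LARGE and HUBERT_XLARGE differ only in encoder size (1024-dim/24-layer vs. 1280-dim/48-layer, per the configs above); neither carries labels, so they are feature extractors rather than ASR models. A hedged feature-extraction sketch (get_model() is an assumed helper, and the random waveform stands in for real 16 kHz audio):

import torch
from torchaudio.models import HUBERT_XLARGE

model = HUBERT_XLARGE.get_model().eval()  # assumed helper; loads 'hubert_fairseq_xlarge_ll60k.pth'
waveform = torch.randn(1, 16000)          # stand-in for real 16 kHz audio
with torch.inference_mode():
    features, _ = model.extract_features(waveform)
# Expected per the config above: 48 layers of 1280-dimensional frames.
print(len(features), features[-1].shape)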
@@ -192,8 +275,53 @@ HUBERT_ASR_LARGE = Wav2Vec2PretrainedModelBundle(
 )
 HUBERT_ASR_LARGE.__doc__ = """HuBERT model with "Large" configuration.

-Pre-trained on 60,000 hours of *Libri-Light* [:footcite:`librilight`] dataset, and
-fine-tuned for ASR on 960 hours of *LibriSpeech* [:footcite:`7178964`] dataset.
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`], and
+fine-tuned for ASR on 960 hours of transcribed audio from
+*LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").

 Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].

 [`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
 """
+HUBERT_ASR_XLARGE = Wav2Vec2PretrainedModelBundle(
+    'hubert_fairseq_xlarge_ll60k_asr_ls960.pth',
+    {
+        'extractor_mode': 'layer_norm',
+        'extractor_conv_layer_config': [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        'extractor_conv_bias': False,
+        'encoder_embed_dim': 1280,
+        'encoder_projection_dropout': 0.0,
+        'encoder_pos_conv_kernel': 128,
+        'encoder_pos_conv_groups': 16,
+        'encoder_num_layers': 48,
+        'encoder_num_heads': 16,
+        'encoder_attention_dropout': 0.0,
+        'encoder_ff_interm_features': 5120,
+        'encoder_ff_interm_dropout': 0.1,
+        'encoder_dropout': 0.0,
+        'encoder_layer_norm_first': True,
+        'encoder_layer_drop': 0.1,
+        'aux_num_out': 32,
+    },
+    _labels=_get_labels(),
+)
+HUBERT_ASR_XLARGE.__doc__ = """HuBERT model with "Extra Large" configuration.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`], and
+fine-tuned for ASR on 960 hours of transcribed audio from
+*LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
+
+Originally published by the authors of *HuBERT* [:footcite:`hsu2021hubert`].
+
+[`Source <https://github.com/pytorch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]
+"""
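Because HUBERT_ASR_XLARGE sets aux_num_out=32 and carries _get_labels(), its output can be decoded to text. A CTC-style greedy-decoding sketch; get_labels(), the audio path, and the blank-at-index-0 convention are assumptions here, not torchaudio's actual decoder:

import torch
import torchaudio
from torchaudio.models import HUBERT_ASR_XLARGE

model = HUBERT_ASR_XLARGE.get_model().eval()
labels = HUBERT_ASR_XLARGE.get_labels()      # assumed helper returning the 32 labels
waveform, _ = torchaudio.load('speech.wav')  # hypothetical 16 kHz mono file
with torch.inference_mode():
    emissions, _ = model(waveform)           # (batch, frames, 32)

# Greedy CTC decode: argmax per frame, collapse consecutive repeats,
# then drop the blank (index 0 is assumed to be the blank token here).
indices = emissions[0].argmax(dim=-1).tolist()
collapsed = [i for i, prev in zip(indices, [None] + indices) if i != prev]
print(''.join(labels[i] for i in collapsed if i != 0))
# The integration test above expects e.g. 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'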