Unverified Commit 5c8541b7 authored by moto's avatar moto Committed by GitHub
Browse files

Add wav2vec2 ASR Italian pretrained model from voxpopuli (#1954)

parent 108e93af
...@@ -177,6 +177,14 @@ VOXPOPULI_ASR_BASE_10K_FR ...@@ -177,6 +177,14 @@ VOXPOPULI_ASR_BASE_10K_FR
.. autodata:: VOXPOPULI_ASR_BASE_10K_FR .. autodata:: VOXPOPULI_ASR_BASE_10K_FR
:no-value: :no-value:
VOXPOPULI_ASR_BASE_10K_IT
~~~~~~~~~~~~~~~~~~~~~~~~~

.. container:: py attribute

   .. autodata:: VOXPOPULI_ASR_BASE_10K_IT
      :no-value:

HUBERT_ASR_LARGE HUBERT_ASR_LARGE
~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~
......
...@@ -37,6 +37,7 @@ _FILES = { ...@@ -37,6 +37,7 @@ _FILES = {
'de': '20090505-0900-PLENARY-16-de_20090505-21_56_00_8.flac', 'de': '20090505-0900-PLENARY-16-de_20090505-21_56_00_8.flac',
'es': '20130207-0900-PLENARY-7-es_20130207-13_02_05_5.flac', 'es': '20130207-0900-PLENARY-7-es_20130207-13_02_05_5.flac',
'fr': '20121212-0900-PLENARY-5-fr_20121212-11_37_04_10.flac', 'fr': '20121212-0900-PLENARY-5-fr_20121212-11_37_04_10.flac',
'it': '20170516-0900-PLENARY-16-it_20170516-18_56_31_1.flac',
} }
......
...@@ -21,6 +21,7 @@ from torchaudio.pipelines import ( ...@@ -21,6 +21,7 @@ from torchaudio.pipelines import (
VOXPOPULI_ASR_BASE_10K_ES, VOXPOPULI_ASR_BASE_10K_ES,
VOXPOPULI_ASR_BASE_10K_DE, VOXPOPULI_ASR_BASE_10K_DE,
VOXPOPULI_ASR_BASE_10K_FR, VOXPOPULI_ASR_BASE_10K_FR,
VOXPOPULI_ASR_BASE_10K_IT,
) )
import pytest import pytest
...@@ -59,6 +60,7 @@ def test_pretraining_models(bundle): ...@@ -59,6 +60,7 @@ def test_pretraining_models(bundle):
(VOXPOPULI_ASR_BASE_10K_ES, 'es', "la|primera|que|es|imprescindible|pensar|a|pequeña|a|escala|para|implicar|y|complementar|así|la|actuación|global"), # noqa: E501 (VOXPOPULI_ASR_BASE_10K_ES, 'es', "la|primera|que|es|imprescindible|pensar|a|pequeña|a|escala|para|implicar|y|complementar|así|la|actuación|global"), # noqa: E501
(VOXPOPULI_ASR_BASE_10K_DE, 'de', "dabei|spielt|auch|eine|sorgfältige|berichterstattung|eine|wichtige|rolle"), (VOXPOPULI_ASR_BASE_10K_DE, 'de', "dabei|spielt|auch|eine|sorgfältige|berichterstattung|eine|wichtige|rolle"),
(VOXPOPULI_ASR_BASE_10K_FR, 'fr', 'la|commission|va|faire|des|propositions|sur|ce|sujet|comment|mettre|en|place|cette|capacité|fiscale|et|le|conseil|européen|y|reviendra|sour|les|sujets|au|moins|de|mars'), # noqa: E501 (VOXPOPULI_ASR_BASE_10K_FR, 'fr', 'la|commission|va|faire|des|propositions|sur|ce|sujet|comment|mettre|en|place|cette|capacité|fiscale|et|le|conseil|européen|y|reviendra|sour|les|sujets|au|moins|de|mars'), # noqa: E501
(VOXPOPULI_ASR_BASE_10K_IT, 'it', 'credo|che|illatino|non|sia|contemplato|tra|le|traduzioni|e|quindi|mi|attengo|allitaliano') # noqa: E501
] ]
) )
def test_finetune_asr_model( def test_finetune_asr_model(
......
...@@ -17,6 +17,7 @@ from ._wav2vec2.impl import ( ...@@ -17,6 +17,7 @@ from ._wav2vec2.impl import (
VOXPOPULI_ASR_BASE_10K_ES, VOXPOPULI_ASR_BASE_10K_ES,
VOXPOPULI_ASR_BASE_10K_DE, VOXPOPULI_ASR_BASE_10K_DE,
VOXPOPULI_ASR_BASE_10K_FR, VOXPOPULI_ASR_BASE_10K_FR,
VOXPOPULI_ASR_BASE_10K_IT,
HUBERT_BASE, HUBERT_BASE,
HUBERT_LARGE, HUBERT_LARGE,
HUBERT_XLARGE, HUBERT_XLARGE,
...@@ -50,6 +51,7 @@ __all__ = [ ...@@ -50,6 +51,7 @@ __all__ = [
'VOXPOPULI_ASR_BASE_10K_ES', 'VOXPOPULI_ASR_BASE_10K_ES',
'VOXPOPULI_ASR_BASE_10K_DE', 'VOXPOPULI_ASR_BASE_10K_DE',
'VOXPOPULI_ASR_BASE_10K_FR', 'VOXPOPULI_ASR_BASE_10K_FR',
'VOXPOPULI_ASR_BASE_10K_IT',
'HUBERT_BASE', 'HUBERT_BASE',
'HUBERT_LARGE', 'HUBERT_LARGE',
'HUBERT_XLARGE', 'HUBERT_XLARGE',
......
...@@ -1123,3 +1123,50 @@ redistributed with the same license. ...@@ -1123,3 +1123,50 @@ redistributed with the same license.
Please refer to :func:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. Please refer to :func:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
""" # noqa: E501 """ # noqa: E501
# Pipeline bundle for the Italian ("it") VoxPopuli ASR checkpoint.
# Positional arguments: checkpoint file name, then the model configuration.
VOXPOPULI_ASR_BASE_10K_IT = Wav2Vec2ASRBundle(
    'wav2vec2_voxpopuli_base_10k_asr_it.pt',
    {
        # --- feature extractor ---
        "extractor_mode": "group_norm",
        # One (512, 10, 5) layer, four (512, 3, 2) layers, two (512, 2, 2) layers.
        "extractor_conv_layer_config": (
            [(512, 10, 5)]
            + [(512, 3, 2)] * 4
            + [(512, 2, 2)] * 2
        ),
        "extractor_conv_bias": False,
        # --- encoder ---
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.0,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_attention_dropout": 0.0,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.1,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.1,
        # --- auxiliary (ASR) output ---
        # NOTE(review): utils._get_it_labels() lists 36 symbols while this is 37;
        # presumably the extra output is a blank token - confirm against the bundle class.
        "aux_num_out": 37,
    },
    _labels=utils._get_it_labels(),
    _sample_rate=16000,
    _remove_aux_axis=(1, 2, 3),
)

# Attach the user-facing documentation to the bundle instance.
VOXPOPULI_ASR_BASE_10K_IT.__doc__ = """wav2vec 2.0 model with "Base" configuration.
Pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset [:footcite:`voxpopuli`]
("10k" subset, consisting of 23 languages).
Fine-tuned for ASR on 91 hours of transcribed audio from "it" subset.
Originally published by the authors of *VoxPopuli* [:footcite:`voxpopuli`] under CC BY-NC 4.0 and
redistributed with the same license.
[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]
Please refer to :func:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501
...@@ -151,3 +151,44 @@ def _get_fr_labels(): ...@@ -151,3 +151,44 @@ def _get_fr_labels():
"ü", "ü",
"æ", "æ",
) )
def _get_it_labels():
return (
"|",
"e",
"i",
"a",
"o",
"n",
"t",
"r",
"l",
"s",
"c",
"d",
"u",
"p",
"m",
"g",
"v",
"h",
"z",
"f",
"b",
"q",
"à",
"è",
"ù",
"é",
"ò",
"ì",
"k",
"y",
"x",
"w",
"j",
"ó",
"í",
"ï",
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment