Unverified Commit cbf267c3 authored by moto's avatar moto Committed by GitHub
Browse files

Add pretrained French ASR from voxpopuli (#1919)

parent 9355e20e
......@@ -153,6 +153,14 @@ WAV2VEC2_ASR_LARGE_LV60K_960H
.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_960H
:no-value:
VOXPOPULI_ASR_BASE_10K_FR
~~~~~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: VOXPOPULI_ASR_BASE_10K_FR
:no-value:
HUBERT_ASR_LARGE
~~~~~~~~~~~~~~~~
......
@article{voxpopuli,
author = {Changhan Wang and
Morgane Rivi{\`{e}}re and
Ann Lee and
Anne Wu and
Chaitanya Talnikar and
Daniel Haziza and
Mary Williamson and
Juan Miguel Pino and
Emmanuel Dupoux},
title = {VoxPopuli: {A} Large-Scale Multilingual Speech Corpus for Representation
Learning, Semi-Supervised Learning and Interpretation},
journal = {CoRR},
volume = {abs/2101.00390},
year = {2021},
url = {https://arxiv.org/abs/2101.00390},
eprinttype = {arXiv},
eprint = {2101.00390},
timestamp = {Thu, 12 Aug 2021 15:37:06 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2101-00390.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{specaugment,
title={SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition},
url={http://dx.doi.org/10.21437/Interspeech.2019-2680},
......
......@@ -34,6 +34,7 @@ def ctc_decoder():
_FILES = {
'en': 'Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.flac',
'fr': '20121212-0900-PLENARY-5-fr_20121212-11_37_04_10.flac',
}
......
......@@ -18,6 +18,7 @@ from torchaudio.pipelines import (
HUBERT_XLARGE,
HUBERT_ASR_LARGE,
HUBERT_ASR_XLARGE,
VOXPOPULI_ASR_BASE_10K_FR,
)
import pytest
......@@ -53,6 +54,7 @@ def test_pretraining_models(bundle):
(WAV2VEC2_ASR_LARGE_LV60K_960H, 'en', 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(HUBERT_ASR_LARGE, 'en', 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(HUBERT_ASR_XLARGE, 'en', 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(VOXPOPULI_ASR_BASE_10K_FR, 'fr', 'la|commission|va|faire|des|propositions|sur|ce|sujet|comment|mettre|en|place|cette|capacité|fiscale|et|le|conseil|européen|y|reviendra|sour|les|sujets|au|moins|de|mars'), # noqa: E501
]
)
def test_finetune_asr_model(
......
......@@ -14,6 +14,7 @@ from ._wav2vec2.impl import (
WAV2VEC2_ASR_LARGE_LV60K_100H,
WAV2VEC2_ASR_LARGE_LV60K_960H,
WAV2VEC2_XLSR53,
VOXPOPULI_ASR_BASE_10K_FR,
HUBERT_BASE,
HUBERT_LARGE,
HUBERT_XLARGE,
......@@ -44,6 +45,7 @@ __all__ = [
'WAV2VEC2_ASR_LARGE_LV60K_100H',
'WAV2VEC2_ASR_LARGE_LV60K_960H',
'WAV2VEC2_XLSR53',
'VOXPOPULI_ASR_BASE_10K_FR',
'HUBERT_BASE',
'HUBERT_LARGE',
'HUBERT_XLARGE',
......
......@@ -969,3 +969,49 @@ redistributed with the same license.
Please refer to :func:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
""" # noqa: E501
# French ASR bundle: checkpoint file name, model configuration, output labels
# and the expected sample rate are all fixed here at import time.
VOXPOPULI_ASR_BASE_10K_FR = Wav2Vec2ASRBundle(
    "wav2vec2_voxpopuli_base_10k_asr_fr.pt",
    {
        "extractor_mode": "group_norm",
        # Seven entries: one (512, 10, 5), four (512, 3, 2), two (512, 2, 2).
        # NOTE(review): presumably (out_channels, kernel_size, stride) per
        # conv layer -- confirm against the model builder.
        "extractor_conv_layer_config": (
            [(512, 10, 5)]
            + [(512, 3, 2)] * 4
            + [(512, 2, 2)] * 2
        ),
        "extractor_conv_bias": False,
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.0,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_attention_dropout": 0.0,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.1,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.1,
        # NOTE(review): utils._get_fr_labels() appears to return 42 labels
        # while this is 43 -- presumably one extra output class (e.g. a blank
        # token) is accounted for elsewhere; confirm.
        "aux_num_out": 43,
    },
    _labels=utils._get_fr_labels(),
    _sample_rate=16000,
)
VOXPOPULI_ASR_BASE_10K_FR.__doc__ = """wav2vec 2.0 model with "Base" configuration.
Pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset [:footcite:`voxpopuli`]
("10k" subset, consisting of 23 languages).
Fine-tuned for ASR on 211 hours of transcribed audio from "fr" subset.
Originally published by the authors of *VoxPopuli* [:footcite:`voxpopuli`] under CC BY-NC 4.0 and
redistributed with the same license.
[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]
Please refer to :func:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501
......@@ -29,3 +29,50 @@ def _get_en_labels():
'Q',
'Z',
)
def _get_fr_labels():
return (
"|",
"e",
"s",
"n",
"i",
"t",
"r",
"a",
"o",
"u",
"l",
"d",
"c",
"p",
"m",
"é",
"v",
"q",
"f",
"g",
"b",
"h",
"x",
"à",
"j",
"è",
"y",
"ê",
"z",
"ô",
"k",
"ç",
"œ",
"û",
"ù",
"î",
"â",
"w",
"ï",
"ë",
"ü",
"æ",
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment