Add wav2vec2 ASR German pretrained model from voxpopuli (#1953)

* Add wav2vec2 ASR German pretrained model from voxpopuli

Add wav2vec2 ASR German pretrained model from voxpopuli (#1953)
* Add wav2vec2 ASR German pretrained model from voxpopuli
Commit e15431b7 · moto · GitHub · 184466a9
Unverified commit e15431b7, authored Nov 01, 2021 by moto; committed by GitHub on Nov 01, 2021
6 changed files
--- a/docs/source/pipelines.rst
+++ b/docs/source/pipelines.rst
@@ -153,6 +153,14 @@ WAV2VEC2_ASR_LARGE_LV60K_960H
   .. autodata:: WAV2VEC2_ASR_LARGE_LV60K_960H
      :no-value:

+VOXPOPULI_ASR_BASE_10K_DE
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. container:: py attribute
+
+   .. autodata:: VOXPOPULI_ASR_BASE_10K_DE
+      :no-value:
+
 VOXPOPULI_ASR_BASE_10K_ES
 ~~~~~~~~~~~~~~~~~~~~~~~~~


--- a/test/integration_tests/conftest.py
+++ b/test/integration_tests/conftest.py
@@ -34,6 +34,7 @@ def ctc_decoder():

 _FILES = {
    'en': 'Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.flac',
+    'de': '20090505-0900-PLENARY-16-de_20090505-21_56_00_8.flac',
    'es': '20130207-0900-PLENARY-7-es_20130207-13_02_05_5.flac',
    'fr': '20121212-0900-PLENARY-5-fr_20121212-11_37_04_10.flac',
 }

--- a/test/integration_tests/wav2vec2_pipeline_test.py
+++ b/test/integration_tests/wav2vec2_pipeline_test.py
@@ -19,6 +19,7 @@ from torchaudio.pipelines import (
    HUBERT_ASR_LARGE,
    HUBERT_ASR_XLARGE,
    VOXPOPULI_ASR_BASE_10K_ES,
+    VOXPOPULI_ASR_BASE_10K_DE,
    VOXPOPULI_ASR_BASE_10K_FR,
 )
 import pytest
@@ -56,6 +57,7 @@ def test_pretraining_models(bundle):
        (HUBERT_ASR_LARGE, 'en', 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
        (HUBERT_ASR_XLARGE, 'en', 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
        (VOXPOPULI_ASR_BASE_10K_ES, 'es', "la|primera|que|es|imprescindible|pensar|a|pequeña|a|escala|para|implicar|y|complementar|así|la|actuación|global"),  # noqa: E501
+        (VOXPOPULI_ASR_BASE_10K_DE, 'de', "dabei|spielt|auch|eine|sorgfältige|berichterstattung|eine|wichtige|rolle"),
        (VOXPOPULI_ASR_BASE_10K_FR, 'fr', 'la|commission|va|faire|des|propositions|sur|ce|sujet|comment|mettre|en|place|cette|capacité|fiscale|et|le|conseil|européen|y|reviendra|sour|les|sujets|au|moins|de|mars'),  # noqa: E501
    ]
 )

--- a/torchaudio/pipelines/__init__.py
+++ b/torchaudio/pipelines/__init__.py
@@ -15,6 +15,7 @@ from ._wav2vec2.impl import (
    WAV2VEC2_ASR_LARGE_LV60K_960H,
    WAV2VEC2_XLSR53,
    VOXPOPULI_ASR_BASE_10K_ES,
+    VOXPOPULI_ASR_BASE_10K_DE,
    VOXPOPULI_ASR_BASE_10K_FR,
    HUBERT_BASE,
    HUBERT_LARGE,
@@ -47,6 +48,7 @@ __all__ = [
    'WAV2VEC2_ASR_LARGE_LV60K_960H',
    'WAV2VEC2_XLSR53',
    'VOXPOPULI_ASR_BASE_10K_ES',
+    'VOXPOPULI_ASR_BASE_10K_DE',
    'VOXPOPULI_ASR_BASE_10K_FR',
    'HUBERT_BASE',
    'HUBERT_LARGE',

--- a/torchaudio/pipelines/_wav2vec2/impl.py
+++ b/torchaudio/pipelines/_wav2vec2/impl.py
@@ -986,6 +986,53 @@ Please refer to :func:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
 """  # noqa: E501


+VOXPOPULI_ASR_BASE_10K_DE = Wav2Vec2ASRBundle(
+    'wav2vec2_voxpopuli_base_10k_asr_de.pt',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 768,
+        "encoder_projection_dropout": 0.0,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 12,
+        "encoder_num_heads": 12,
+        "encoder_attention_dropout": 0.0,
+        "encoder_ff_interm_features": 3072,
+        "encoder_ff_interm_dropout": 0.1,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.1,
+        "aux_num_out": 32,
+    },
+    _labels=utils._get_de_labels(),
+    _sample_rate=16000,
+    _remove_aux_axis=(1, 2, 3, 35),
+)
+VOXPOPULI_ASR_BASE_10K_DE.__doc__ = """wav2vec 2.0 model with "Base" configuration.
+
+Pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset [:footcite:`voxpopuli`]
+("10k" subset, consisting of 23 languages).
+Fine-tuned for ASR on 282 hours of transcribed audio from "de" subset.
+
+Originally published by the authors of *VoxPopuli* [:footcite:`voxpopuli`] under CC BY-NC 4.0 and
+redistributed with the same license.
+[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
+`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]
+
+Please refer to :func:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+"""  # noqa: E501
+
+
 VOXPOPULI_ASR_BASE_10K_ES = Wav2Vec2ASRBundle(
    'wav2vec2_voxpopuli_base_10k_asr_es.pt',
    {

--- a/torchaudio/pipelines/_wav2vec2/utils.py
+++ b/torchaudio/pipelines/_wav2vec2/utils.py
@@ -31,6 +31,42 @@ def _get_en_labels():
    )


+def _get_de_labels():
+    return (
+        "|",
+        "e",
+        "n",
+        "i",
+        "r",
+        "s",
+        "t",
+        "a",
+        "d",
+        "h",
+        "u",
+        "l",
+        "g",
+        "c",
+        "m",
+        "o",
+        "b",
+        "w",
+        "f",
+        "k",
+        "z",
+        "p",
+        "v",
+        "ü",
+        "ä",
+        "ö",
+        "j",
+        "ß",
+        "y",
+        "x",
+        "q",
+    )
+
+
 def _get_es_labels():
    return (
        "|",