"""Tests for the torchaudio ``COMMONVOICE`` dataset, run against a mocked on-disk dataset."""
import csv
import os
from pathlib import Path
from typing import Dict, List, Tuple

from torch import Tensor
from torchaudio.datasets import COMMONVOICE
from torchaudio_unittest.common_utils import (
    get_whitenoise,
    normalize_wav,
    save_wav,
    TempDirMixin,
    TorchaudioTestCase,
)

# Extension the dataset class is configured with at import time; kept so the
# test classes can put it back after overriding it with ".wav".
_ORIGINAL_EXT_AUDIO = COMMONVOICE._ext_audio
# Sample rate passed to the clip generator and expected back from the dataset.
_SAMPLE_RATE = 48000
# Column names written as the first row of the mocked ``train.tsv``.
_HEADERS = ["client_ids", "path", "sentence", "up_votes", "down_votes", "age", "gender", "accent"]
# Rows of the mocked English ``train.tsv``; each row lists its values in the
# column order of ``_HEADERS``.
# The first row leaves ``age``, ``gender`` and ``accent`` empty, as in the original data.
_EN_TRAIN_CSV_CONTENTS = [
    [
        "9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
        "common_voice_en_18885784.wav",
        "He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.",
        "2",
        "0",
        "",
        "",
        "",
    ],
    [
        "c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
        "common_voice_en_556542.wav",
        "Once more into the breach",
        "2",
        "0",
        "thirties",
        "male",
        "us",
    ],
    [
        "f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
        "common_voice_en_18607573.wav",
        "Caddy, show Miss Clare and Miss Summerson their rooms.",
        "2",
        "0",
        "twenties",
        "male",
        "canada",
    ],
]

# Rows of the mocked French ``train.tsv``; each row lists its values in the
# column order of ``_HEADERS``.
# Unlike the English rows, the ``path`` values here carry no file extension.
# The long identifiers are split over two adjacent string literals only to keep
# the lines short; Python joins them into a single value.
_FR_TRAIN_CSV_CONTENTS = [
    [
        "a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef"
        "18343441c601cae0597a4b0d3144",
        "89e67e7682b36786a0b4b4022c4d42090c86edd96c78c12d30088e62522b8fe466ea4912e6a1055dfb91b296a0743e0a2bbe"
        "16cebac98ee5349e3e8262cb9329",
        "Or sur ce point nous n’avons aucune réponse de votre part.",
        "2",
        "0",
        "twenties",
        "male",
        "france",
    ],
    [
        "a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef18"
        "343441c601cae0597a4b0d3144",
        "87d71819a26179e93acfee149d0b21b7bf5e926e367d80b2b3792d45f46e04853a514945783ff764c1fc237b4eb0ee2b0a7a7"
        "cbd395acbdfcfa9d76a6e199bbd",
        "Monsieur de La Verpillière, laissez parler le ministre",
        "2",
        "0",
        "twenties",
        "male",
        "france",
    ],
]


def get_mock_dataset(root_dir, train_csv_contents, ext_audio) -> List[Tuple[Tensor, int, Dict[str, str]]]:
    """Write a mocked dataset under ``root_dir`` and return the samples it should yield.

    Creates ``<root_dir>/train.tsv`` (a ``_HEADERS`` row followed by one
    tab-separated row per entry) and saves one generated white-noise clip per
    entry under ``<root_dir>/clips``.

    Args:
        root_dir: Directory in which ``train.tsv`` and the ``clips`` folder are created.
        train_csv_contents: Rows to write; each row is a list with one string per
            ``_HEADERS`` column. The rows are not modified.
        ext_audio: Audio file extension. It is appended to a row's ``path`` value
            when that value does not already end with it.

    Returns:
        A list with one ``(waveform, sample_rate, metadata)`` tuple per row, in
        row order. ``waveform`` is the ``normalize_wav`` result for the saved
        clip, ``sample_rate`` is ``_SAMPLE_RATE`` and ``metadata`` maps each
        ``_HEADERS`` name to the value written to the TSV file.
    """
    mocked_data = []
    # Note: extension is changed to wav for the sake of test
    # Note: the first content is missing values for `age`, `gender` and `accent` as in the original data.
    # Tsv file name difference does not mean different subset, testing as a whole dataset here
    tsv_filename = os.path.join(root_dir, "train.tsv")
    audio_base_path = os.path.join(root_dir, "clips")
    os.makedirs(audio_base_path, exist_ok=True)
    with open(tsv_filename, "w", newline="") as tsv:
        writer = csv.writer(tsv, delimiter="\t")
        writer.writerow(_HEADERS)
        for i, content in enumerate(train_csv_contents):
            # Work on a copy: the rows passed in are module-level constants, and
            # rewriting them in place would double-encode the sentence on a
            # second call with the same rows.
            row = list(content)
            # The sentence is stored as the text form of its UTF-8 bytes ("b'...'").
            row[2] = str(row[2].encode("utf-8"))
            writer.writerow(row)

            if not row[1].endswith(ext_audio):
                audio_path = os.path.join(audio_base_path, row[1] + ext_audio)
            else:
                audio_path = os.path.join(audio_base_path, row[1])

            # The row index is used as the seed of the generated clip.
            data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype="float32")
            save_wav(audio_path, data, _SAMPLE_RATE)
            # Append data entry
            mocked_data.append((normalize_wav(data), _SAMPLE_RATE, dict(zip(_HEADERS, row))))
    return mocked_data


def get_mock_dataset_en(root_dir, ext_audio) -> List[Tuple[Tensor, int, Dict[str, str]]]:
    """Prepare the English mocked dataset.

    Delegates to ``get_mock_dataset`` with the ``_EN_TRAIN_CSV_CONTENTS`` rows.

    Args:
        root_dir: Directory in which the mocked dataset is written.
        ext_audio: Audio file extension forwarded to ``get_mock_dataset``.

    Returns:
        The list of expected ``(waveform, sample_rate, metadata)`` samples
        produced by ``get_mock_dataset``.
    """
    return get_mock_dataset(root_dir, _EN_TRAIN_CSV_CONTENTS, ext_audio)


def get_mock_dataset_fr(root_dir, ext_audio) -> List[Tuple[Tensor, int, Dict[str, str]]]:
    """Prepare the French mocked dataset.

    Delegates to ``get_mock_dataset`` with the ``_FR_TRAIN_CSV_CONTENTS`` rows.

    Args:
        root_dir: Directory in which the mocked dataset is written.
        ext_audio: Audio file extension forwarded to ``get_mock_dataset``.

    Returns:
        The list of expected ``(waveform, sample_rate, metadata)`` samples
        produced by ``get_mock_dataset``.
    """
    return get_mock_dataset(root_dir, _FR_TRAIN_CSV_CONTENTS, ext_audio)


class BaseTestCommonVoice(TempDirMixin):
    """Shared fixture and checks for the ``COMMONVOICE`` dataset tests.

    The class-level setup obtains a temporary root directory and switches
    ``COMMONVOICE._ext_audio`` to ``".wav"``; the teardown restores the
    original extension. Concrete test classes are expected to fill ``data``
    with the expected samples before calling ``_test_commonvoice``.
    """

    # Root directory of the mocked dataset; assigned in ``setUpClass``.
    root_dir = None
    # Expected ``(waveform, sample_rate, metadata)`` samples, indexed in dataset
    # order; ``None`` here and expected to be set by subclasses.
    data = None

    @classmethod
    def setUpClass(cls):
        """Obtain the temp root directory and make the dataset look for ``.wav`` files."""
        super().setUpClass()
        cls.root_dir = cls.get_base_temp_dir()
        COMMONVOICE._ext_audio = ".wav"

    @classmethod
    def tearDownClass(cls):
        """Run the parent teardown and restore the original audio extension."""
        try:
            super().tearDownClass()
        finally:
            # Restore even when the parent teardown raises, so the class-level
            # override cannot leak into tests that run afterwards.
            COMMONVOICE._ext_audio = _ORIGINAL_EXT_AUDIO

    def _test_commonvoice(self, dataset):
        """Check that ``dataset`` yields exactly the samples stored in ``self.data``.

        Each yielded ``(waveform, sample_rate, dictionary)`` item is compared
        with the entry at the same index of ``self.data``; the number of
        yielded items must equal ``len(self.data)``.
        """
        n_ite = 0
        for i, (waveform, sample_rate, dictionary) in enumerate(dataset):
            expected_data, _, expected_dictionary = self.data[i]
            # Waveforms are compared with a tolerance rather than exactly.
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == _SAMPLE_RATE
            assert dictionary == expected_dictionary
            n_ite += 1
        # Guards against a dataset that yields fewer items than expected.
        assert n_ite == len(self.data)


class TestCommonVoiceEN(BaseTestCommonVoice, TorchaudioTestCase):
    """Runs the ``COMMONVOICE`` checks against the English mocked dataset."""

    # NOTE(review): presumably selects the audio backend used by
    # ``TorchaudioTestCase`` — confirm against that class.
    backend = "default"
    # Root directory of the mocked dataset; assigned by the base ``setUpClass``.
    root_dir = None

    @classmethod
    def setUpClass(cls):
        """Run the shared setup, then write the English mock data and keep the expected samples."""
        super().setUpClass()
        cls.data = get_mock_dataset_en(cls.root_dir, COMMONVOICE._ext_audio)

    def test_commonvoice_str(self):
        """The dataset can be built from a ``str`` root and yields the expected samples."""
        dataset = COMMONVOICE(self.root_dir)
        self._test_commonvoice(dataset)

    def test_commonvoice_path(self):
        """The dataset can be built from a ``pathlib.Path`` root and yields the expected samples."""
        dataset = COMMONVOICE(Path(self.root_dir))
        self._test_commonvoice(dataset)


class TestCommonVoiceFR(BaseTestCommonVoice, TorchaudioTestCase):
    """Runs the ``COMMONVOICE`` checks against the French mocked dataset."""

    # NOTE(review): presumably selects the audio backend used by
    # ``TorchaudioTestCase`` — confirm against that class.
    backend = "default"
    # Root directory of the mocked dataset; assigned by the base ``setUpClass``.
    root_dir = None

    @classmethod
    def setUpClass(cls):
        """Run the shared setup, then write the French mock data and keep the expected samples."""
        super().setUpClass()
        cls.data = get_mock_dataset_fr(cls.root_dir, COMMONVOICE._ext_audio)

    def test_commonvoice_str(self):
        """The dataset can be built from a ``str`` root and yields the expected samples."""
        dataset = COMMONVOICE(self.root_dir)
        self._test_commonvoice(dataset)