commonvoice_test.py 6.37 KB
Newer Older
1
import csv
import os
from pathlib import Path
from typing import Dict, List, Tuple

from torch import Tensor
from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)

from torchaudio.datasets import COMMONVOICE
16

Aziz's avatar
Aziz committed
17
18
19
# Remember the dataset's audio extension so the test classes, which override
# `COMMONVOICE._ext_audio`, can restore it when they finish.
_ORIGINAL_EXT_AUDIO = COMMONVOICE._ext_audio
# Sample rate used for every generated clip and expected back from the dataset.
_SAMPLE_RATE = 48000
# Column names written as the first row of the mocked TSV files.
_HEADERS = [
    "client_ids",
    "path",
    "sentence",
    "up_votes",
    "down_votes",
    "age",
    "gender",
    "accent",
]
20

Aziz's avatar
Aziz committed
21
22
23

def get_mock_dataset_en(root_dir) -> List[Tuple[Tensor, int, Dict[str, str]]]:
    """Create a mocked English CommonVoice dataset under ``root_dir``.

    Writes ``train.tsv`` (header row plus one row per sample) and one
    white-noise audio file per row under ``clips/``.

    Args:
        root_dir: Directory in which the TSV file and the ``clips``
            sub-directory are created.

    Returns:
        One ``(waveform, sample_rate, metadata)`` tuple per row, in the order
        the rows are written. ``waveform`` is the generated noise passed
        through ``normalize_wav`` and ``metadata`` maps each header name to
        the corresponding cell of the row.
    """
    mocked_data = []
    # Note: extension is changed to wav for the sake of test
    # Note: the first content is missing values for `age`, `gender` and `accent` as in the original data.
    _en_train_csv_contents = [
        ["9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
         "common_voice_en_18885784.wav",
         "He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.", "2", "0", "", "",
         ""],
        ["c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
         "common_voice_en_556542.wav", "Once more into the breach", "2", "0", "thirties", "male", "us"],
        ["f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
         "common_voice_en_18607573.wav",
         "Caddy, show Miss Clare and Miss Summerson their rooms.", "2", "0", "twenties", "male", "canada"],
    ]
    # Tsv file name difference does not mean different subset, testing as a whole dataset here
    tsv_filename = os.path.join(root_dir, "train.tsv")
    audio_base_path = os.path.join(root_dir, "clips")
    os.makedirs(audio_base_path, exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with open(tsv_filename, "w", newline='') as tsv:
        writer = csv.writer(tsv, delimiter='\t')
        writer.writerow(_HEADERS)
        for i, content in enumerate(_en_train_csv_contents):
            writer.writerow(content)
            # Generate and store audio; the row index seeds the noise so each
            # clip is distinct and reproducible.
            audio_path = os.path.join(audio_base_path, content[1])
            data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype='float32')
            save_wav(audio_path, data, _SAMPLE_RATE)
            # Append data entry
            mocked_data.append((normalize_wav(data), _SAMPLE_RATE, dict(zip(_HEADERS, content))))
    return mocked_data


def get_mock_dataset_fr(root_dir) -> List[Tuple[Tensor, int, Dict[str, str]]]:
    """Create a mocked French CommonVoice dataset under ``root_dir``.

    Writes ``train.tsv`` (header row plus one row per sample) and one
    white-noise audio file per row under ``clips/``. Audio file names are the
    row's ``path`` cell with the dataset's original audio extension appended.

    Args:
        root_dir: Directory in which the TSV file and the ``clips``
            sub-directory are created.

    Returns:
        One ``(waveform, sample_rate, metadata)`` tuple per row, in the order
        the rows are written. ``waveform`` is the generated noise passed
        through ``normalize_wav`` and ``metadata`` maps each header name to
        the corresponding cell of the row as written to the TSV file.
    """
    mocked_data = []
    _fr_train_csv_contents = [
        [
            "a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef"
            "18343441c601cae0597a4b0d3144",
            "89e67e7682b36786a0b4b4022c4d42090c86edd96c78c12d30088e62522b8fe466ea4912e6a1055dfb91b296a0743e0a2bbe"
            "16cebac98ee5349e3e8262cb9329",
            "Or sur ce point nous n’avons aucune réponse de votre part.", "2", "0", "twenties", "male", "france"],
        [
            "a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef18"
            "343441c601cae0597a4b0d3144",
            "87d71819a26179e93acfee149d0b21b7bf5e926e367d80b2b3792d45f46e04853a514945783ff764c1fc237b4eb0ee2b0a7a7"
            "cbd395acbdfcfa9d76a6e199bbd",
            "Monsieur de La Verpillière, laissez parler le ministre", "2", "0", "twenties", "male", "france"],
    ]
    # Tsv file name difference does not mean different subset, testing as a whole dataset here
    tsv_filename = os.path.join(root_dir, "train.tsv")
    audio_base_path = os.path.join(root_dir, "clips")
    os.makedirs(audio_base_path, exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with open(tsv_filename, "w", newline='') as tsv:
        writer = csv.writer(tsv, delimiter='\t')
        writer.writerow(_HEADERS)
        for i, content in enumerate(_fr_train_csv_contents):
            # The sentence is stored as the textual repr of its UTF-8 bytes
            # (e.g. "b'...\\xc3\\xa8...'"), which keeps the TSV pure ASCII.
            # repr() yields exactly what str() does for bytes, but without
            # raising BytesWarning when Python runs with -b / -bb.
            # NOTE(review): presumably this sidesteps locale-dependent
            # decoding when the dataset reads the file back - confirm before
            # switching to the raw UTF-8 sentence.
            content[2] = repr(content[2].encode("utf-8"))
            writer.writerow(content)
            # Generate and store audio; the row index seeds the noise so each
            # clip is distinct and reproducible.
            audio_path = os.path.join(audio_base_path, content[1] + _ORIGINAL_EXT_AUDIO)
            data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype='float32')
            save_wav(audio_path, data, _SAMPLE_RATE)

            # Append data entry (uses the already-converted sentence so it
            # matches what was written to the TSV file).
            mocked_data.append((normalize_wav(data), _SAMPLE_RATE, dict(zip(_HEADERS, content))))
    return mocked_data


class TestCommonVoiceEN(TempDirMixin, TorchaudioTestCase):
    """Check that COMMONVOICE reproduces the mocked English dataset."""

    backend = 'default'
    root_dir = None

    @classmethod
    def setUpClass(cls):
        # Chain to the base classes so any class-level setup they define
        # still runs.
        super().setUpClass()
        cls.root_dir = cls.get_base_temp_dir()
        cls.data = get_mock_dataset_en(cls.root_dir)
        # The mocked English clips are stored with a ".wav" file name.
        COMMONVOICE._ext_audio = ".wav"

    @classmethod
    def tearDownClass(cls):
        try:
            # Undo the class-level override so other tests see the original value.
            COMMONVOICE._ext_audio = _ORIGINAL_EXT_AUDIO
        finally:
            # Chain to the base classes so any cleanup they define still runs.
            super().tearDownClass()

    def _test_commonvoice(self, dataset):
        """Compare every item yielded by ``dataset`` with the mocked data."""
        n_ite = 0
        for i, (waveform, sample_rate, dictionary) in enumerate(dataset):
            expected_data, _, expected_dictionary = self.data[i]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == _SAMPLE_RATE
            assert dictionary == expected_dictionary
            n_ite += 1
        # Guards against the dataset yielding fewer items than were mocked.
        assert n_ite == len(self.data)

    def test_commonvoice_str(self):
        """The dataset root may be given as a ``str``."""
        dataset = COMMONVOICE(self.root_dir)
        self._test_commonvoice(dataset)

    def test_commonvoice_path(self):
        """The dataset root may be given as a ``pathlib.Path``."""
        dataset = COMMONVOICE(Path(self.root_dir))
        self._test_commonvoice(dataset)
Aziz's avatar
Aziz committed
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153


class TestCommonVoiceFR(TempDirMixin, TorchaudioTestCase):
    """Check that COMMONVOICE reproduces the mocked French dataset."""

    backend = 'default'
    root_dir = None

    @classmethod
    def setUpClass(cls):
        # Chain to the base classes so any class-level setup they define
        # still runs.
        super().setUpClass()
        cls.root_dir = cls.get_base_temp_dir()
        cls.data = get_mock_dataset_fr(cls.root_dir)
        COMMONVOICE._ext_audio = ".mp3"

    @classmethod
    def tearDownClass(cls):
        try:
            # Undo the class-level override so other tests see the original value.
            COMMONVOICE._ext_audio = _ORIGINAL_EXT_AUDIO
        finally:
            # Chain to the base classes so any cleanup they define still runs.
            super().tearDownClass()

    def _test_commonvoice(self, dataset):
        """Compare every item yielded by ``dataset`` with the mocked data."""
        n_ite = 0
        for i, (waveform, sample_rate, dictionary) in enumerate(dataset):
            expected_data, _, expected_dictionary = self.data[i]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == _SAMPLE_RATE
            assert dictionary == expected_dictionary
            n_ite += 1
        # Guards against the dataset yielding fewer items than were mocked.
        assert n_ite == len(self.data)

    def test_commonvoice_str(self):
        """The dataset root may be given as a ``str``."""
        dataset = COMMONVOICE(self.root_dir)
        self._test_commonvoice(dataset)