librispeech_test.py

import os
from pathlib import Path

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)

from torchaudio.datasets import librispeech

# Used to generate a unique transcript for each dummy audio file
_NUMBERS = [
    'ZERO',
    'ONE',
    'TWO',
    'THREE',
    'FOUR',
    'FIVE',
    'SIX',
    'SEVEN',
    'EIGHT',
    'NINE'
]


def get_mock_dataset(root_dir):
    """
    root_dir: directory to the mocked dataset
    """
    mocked_data = []
    dataset_dir = os.path.join(
        root_dir, librispeech.FOLDER_IN_ARCHIVE, librispeech.URL
    )
    os.makedirs(dataset_dir, exist_ok=True)
    sample_rate = 16000  # 16kHz
    seed = 0

    for speaker_id in range(5):
        speaker_path = os.path.join(dataset_dir, str(speaker_id))
        os.makedirs(speaker_path, exist_ok=True)

        for chapter_id in range(3):
            chapter_path = os.path.join(speaker_path, str(chapter_id))
            os.makedirs(chapter_path, exist_ok=True)
            trans_content = []

            for utterance_id in range(10):
                filename = f'{speaker_id}-{chapter_id}-{utterance_id:04d}.wav'
                path = os.path.join(chapter_path, filename)

                transcript = ' '.join(
                    [_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]]
                )
                trans_content.append(
                    f'{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}'
                )

                data = get_whitenoise(
                    sample_rate=sample_rate,
                    duration=0.01,
                    n_channels=1,
                    dtype='float32',
                    seed=seed
                )
                save_wav(path, data, sample_rate)
                sample = (
                    normalize_wav(data),
                    sample_rate,
                    transcript,
                    speaker_id,
                    chapter_id,
                    utterance_id
                )
                mocked_data.append(sample)

                seed += 1

            trans_filename = f'{speaker_id}-{chapter_id}.trans.txt'
            trans_path = os.path.join(chapter_path, trans_filename)
            with open(trans_path, 'w') as f:
                f.write('\n'.join(trans_content))
    return mocked_data


class TestLibriSpeech(TempDirMixin, TorchaudioTestCase):
    backend = 'default'

    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.samples = get_mock_dataset(cls.root_dir)

    @classmethod
    def tearDownClass(cls):
        # In case of test failure
        librispeech.LIBRISPEECH._ext_audio = '.flac'

    def _test_librispeech(self, dataset):
        num_samples = 0
        for i, (
                data, sample_rate, transcript, speaker_id, chapter_id, utterance_id
        ) in enumerate(dataset):
            self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.samples[i][1]
            assert transcript == self.samples[i][2]
            assert speaker_id == self.samples[i][3]
            assert chapter_id == self.samples[i][4]
            assert utterance_id == self.samples[i][5]
            num_samples += 1

        assert num_samples == len(self.samples)
        librispeech.LIBRISPEECH._ext_audio = '.flac'

    def test_librispeech_str(self):
        librispeech.LIBRISPEECH._ext_audio = '.wav'
        dataset = librispeech.LIBRISPEECH(self.root_dir)
        self._test_librispeech(dataset)

    def test_librispeech_path(self):
        librispeech.LIBRISPEECH._ext_audio = '.wav'
        dataset = librispeech.LIBRISPEECH(Path(self.root_dir))
        self._test_librispeech(dataset)