Commit 66a67d2e authored by Caroline Chen, committed by Facebook GitHub Bot

Add fluent speech commands (#2480)

Summary: Pull Request resolved: https://github.com/pytorch/audio/pull/2480

Reviewed By: nateanl

Differential Revision: D37249571

Pulled By: carolineechen

fbshipit-source-id: caefeec4253c91f2579655a0c1735edaeed51be9
parent 10195316
@@ -136,6 +136,13 @@ QUESST14
  :members:
  :special-members: __getitem__

FluentSpeechCommands
~~~~~~~~~~~~~~~~~~~~

.. autoclass:: FluentSpeechCommands
  :members:
  :special-members: __getitem__

References
~~~~~~~~~~
...
@@ -359,3 +359,11 @@
    title="YesNo",
    url="http://www.openslr.org/1/"
}
@inproceedings{fluent,
author = {Loren Lugosch and Mirco Ravanelli and Patrick Ignoto and Vikrant Singh Tomar and Yoshua Bengio},
editor = {Gernot Kubin and Zdravko Kacic},
title = {Speech Model Pre-Training for End-to-End Spoken Language Understanding},
booktitle = {Proc. of Interspeech},
pages = {814--818},
year = {2019},
}
import csv
import os
import random
import string
from pathlib import Path

from torchaudio.datasets import fluentcommands
from torchaudio_unittest.common_utils import (
    get_whitenoise,
    save_wav,
    TempDirMixin,
    TorchaudioTestCase,
)

HEADER = ["", "path", "speakerId", "transcription", "action", "object", "location"]
SLOTS = ["action", "object", "location"]
ACTIONS = ["activate", "deactivate"]
OBJECTS = ["lights", "volume"]
LOCATIONS = ["none", "kitchen", "bedroom"]
NUM_SPEAKERS = 5
SAMPLES_PER_SPEAKER = 10
SAMPLE_RATE = 16000


def _gen_rand_str(n: int, seed: int):
    random.seed(seed)
    return "".join(random.choices(string.ascii_letters + string.digits, k=n))


def _gen_csv(dataset_dir: str, subset: str, init_seed: int):
    data = []
    data.append(HEADER)

    idx = 0
    seed = init_seed
    for _ in range(NUM_SPEAKERS):
        speaker_id = _gen_rand_str(5, seed=seed)
        speaker_dir = os.path.join(dataset_dir, "wavs", "speakers", speaker_id)
        os.makedirs(speaker_dir, exist_ok=True)

        for _ in range(SAMPLES_PER_SPEAKER):
            seed += 1
            filename = _gen_rand_str(10, seed=seed)
            path = f"wavs/speakers/{speaker_id}/{filename}.wav"

            random.seed(seed)
            transcription = ""
            act = random.choice(ACTIONS)
            obj = random.choice(OBJECTS)
            loc = random.choice(LOCATIONS)

            data.append([idx, path, speaker_id, transcription, act, obj, loc])
            idx += 1

    csv_path = os.path.join(dataset_dir, "data", f"{subset}_data.csv")
    with open(csv_path, "w") as csv_file:
        file_writer = csv.writer(csv_file)
        file_writer.writerows(data)

    return data


def _save_samples(dataset_dir: str, subset: str, seed: int):
    # generate csv file
    data = _gen_csv(dataset_dir, subset, seed)

    # iterate through csv file, save wavs to corresponding files
    header = data[0]
    data = data[1:]  # remove header
    path_idx = header.index("path")

    samples = []
    for row in data:
        wav = get_whitenoise(
            sample_rate=SAMPLE_RATE,
            duration=0.01,
            n_channels=1,
            seed=seed,
        )
        filename = row[path_idx]
        wav_file = os.path.join(dataset_dir, filename)
        save_wav(wav_file, wav, SAMPLE_RATE)

        path = Path(wav_file).stem
        speaker_id, transcription, act, obj, loc = row[2:]
        sample = wav, SAMPLE_RATE, path, speaker_id, transcription, act, obj, loc
        samples.append(sample)

        seed += 1

    return samples


def get_mock_dataset(dataset_dir: str):
    data_folder = os.path.join(dataset_dir, "data")
    wav_folder = os.path.join(dataset_dir, "wavs", "speakers")
    os.makedirs(data_folder, exist_ok=True)
    os.makedirs(wav_folder, exist_ok=True)

    mocked_train_samples = _save_samples(dataset_dir, "train", 1)
    mocked_valid_samples = _save_samples(dataset_dir, "valid", 111)
    mocked_test_samples = _save_samples(dataset_dir, "test", 1111)

    return mocked_train_samples, mocked_valid_samples, mocked_test_samples


class TestFluentSpeechCommands(TempDirMixin, TorchaudioTestCase):
    root_dir = None
    backend = "default"

    mocked_train_samples = []
    mocked_valid_samples = []
    mocked_test_samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        dataset_dir = os.path.join(cls.root_dir, "fluent_speech_commands_dataset")
        (
            cls.mocked_train_samples,
            cls.mocked_valid_samples,
            cls.mocked_test_samples,
        ) = get_mock_dataset(dataset_dir)

    def _testFluentCommands(self, dataset, samples):
        num_samples = 0
        for i, data in enumerate(dataset):
            self.assertEqual(data, samples[i])
            num_samples += 1

        assert num_samples == len(samples)

    def testFluentCommandsTrain(self):
        dataset = fluentcommands.FluentSpeechCommands(self.root_dir, subset="train")
        self._testFluentCommands(dataset, self.mocked_train_samples)

    def testFluentCommandsValid(self):
        dataset = fluentcommands.FluentSpeechCommands(self.root_dir, subset="valid")
        self._testFluentCommands(dataset, self.mocked_valid_samples)

    def testFluentCommandsTest(self):
        dataset = fluentcommands.FluentSpeechCommands(self.root_dir, subset="test")
        self._testFluentCommands(dataset, self.mocked_test_samples)
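For reference, the mock layout above can also be exercised outside the unittest harness with a standalone driver like the sketch below. It assumes the test module (and therefore torchaudio_unittest.common_utils) is importable, which is only guaranteed inside torchaudio's test tree; the temporary-directory handling is likewise an assumption, not part of this diff.

# Hypothetical standalone driver; assumes get_mock_dataset from the test module above is in scope.
import os
import tempfile

from torchaudio.datasets import fluentcommands

with tempfile.TemporaryDirectory() as root:
    dataset_dir = os.path.join(root, "fluent_speech_commands_dataset")
    # Writes {train,valid,test}_data.csv plus short white-noise wavs under dataset_dir.
    train_samples, _, _ = get_mock_dataset(dataset_dir)

    dataset = fluentcommands.FluentSpeechCommands(root, subset="train")
    waveform, sample_rate, path, speaker_id, transcription, action, obj, location = dataset[0]
    print(waveform.shape, sample_rate, speaker_id, action, obj, location)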
@@ -2,6 +2,7 @@ from .cmuarctic import CMUARCTIC
from .cmudict import CMUDict
from .commonvoice import COMMONVOICE
from .dr_vctk import DR_VCTK
from .fluentcommands import FluentSpeechCommands
from .gtzan import GTZAN
from .librilight_limited import LibriLightLimited
from .librimix import LibriMix
@@ -31,4 +32,5 @@ __all__ = [
    "LIBRITTS",
    "TEDLIUM",
    "QUESST14",
    "FluentSpeechCommands",
]
import csv
import os
from pathlib import Path
from typing import Union

import torchaudio
from torch.utils.data import Dataset


class FluentSpeechCommands(Dataset):
    """Create *Fluent Speech Commands* [:footcite:`fluent`] Dataset

    Args:
        root (str or Path): Path to the directory where the dataset is found.
        subset (str, optional): Subset of the dataset to use. Options: [``"train"``, ``"valid"``, ``"test"``].
            (Default: ``"train"``)
    """

    def __init__(self, root: Union[str, Path], subset: str = "train"):
        assert subset in ["train", "valid", "test"], "`subset` must be one of ['train', 'valid', 'test']"

        root = os.fspath(root)
        self._path = os.path.join(root, "fluent_speech_commands_dataset")
        subset_path = os.path.join(self._path, "data", f"{subset}_data.csv")

        with open(subset_path) as subset_csv:
            subset_reader = csv.reader(subset_csv)
            data = list(subset_reader)

        self.header = data[0]
        self.data = data[1:]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, n: int):
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            (Tensor, int, str, str, str, str, str, str):
            ``(waveform, sample_rate, path, speaker_id, transcription, action, object, location)``
        """
        sample = self.data[n]

        wav_path = os.path.join(self._path, sample[self.header.index("path")])
        wav, sample_rate = torchaudio.load(wav_path)
        path = Path(wav_path).stem

        speaker_id, transcription, action, obj, location = sample[2:]

        return wav, sample_rate, path, speaker_id, transcription, action, obj, location
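For context on how the new dataset is meant to be consumed, here is a minimal usage sketch. It assumes the Fluent Speech Commands archive has already been downloaded and extracted manually so that <root>/fluent_speech_commands_dataset/data/train_data.csv exists; the class does not download anything itself, and the root path shown is a placeholder.

# Minimal usage sketch; "path/to/root" is a placeholder for a manually prepared root.
from torchaudio.datasets import FluentSpeechCommands

dataset = FluentSpeechCommands("path/to/root", subset="train")

# Each item is the 8-tuple documented in __getitem__ above.
waveform, sample_rate, path, speaker_id, transcription, action, obj, location = dataset[0]
print(waveform.shape, sample_rate, action, obj, location)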