Commit 66a67d2e authored by Caroline Chen, committed by Facebook GitHub Bot

Add fluent speech commands (#2480)

Summary: Pull Request resolved: https://github.com/pytorch/audio/pull/2480

Reviewed By: nateanl

Differential Revision: D37249571

Pulled By: carolineechen

fbshipit-source-id: caefeec4253c91f2579655a0c1735edaeed51be9
parent 10195316
@@ -136,6 +136,13 @@ QUESST14
  :members:
  :special-members: __getitem__

FluentSpeechCommands
~~~~~~~~~~~~~~~~~~~~

.. autoclass:: FluentSpeechCommands
  :members:
  :special-members: __getitem__

References
~~~~~~~~~~
...
@@ -359,3 +359,11 @@
    title="YesNo",
    url="http://www.openslr.org/1/"
}
@inproceedings{fluent,
author = {Loren Lugosch and Mirco Ravanelli and Patrick Ignoto and Vikrant Singh Tomar and Yoshua Bengio},
editor = {Gernot Kubin and Zdravko Kacic},
title = {Speech Model Pre-Training for End-to-End Spoken Language Understanding},
booktitle = {Proc. of Interspeech},
pages = {814--818},
year = {2019},
}
import csv
import os
import random
import string
from pathlib import Path

from torchaudio.datasets import fluentcommands
from torchaudio_unittest.common_utils import (
    get_whitenoise,
    save_wav,
    TempDirMixin,
    TorchaudioTestCase,
)

HEADER = ["", "path", "speakerId", "transcription", "action", "object", "location"]
SLOTS = ["action", "object", "location"]
ACTIONS = ["activate", "deactivate"]
OBJECTS = ["lights", "volume"]
LOCATIONS = ["none", "kitchen", "bedroom"]
NUM_SPEAKERS = 5
SAMPLES_PER_SPEAKER = 10
SAMPLE_RATE = 16000


def _gen_rand_str(n: int, seed: int):
    random.seed(seed)
    return "".join(random.choices(string.ascii_letters + string.digits, k=n))


def _gen_csv(dataset_dir: str, subset: str, init_seed: int):
    data = []
    data.append(HEADER)

    idx = 0
    seed = init_seed
    for _ in range(NUM_SPEAKERS):
        speaker_id = _gen_rand_str(5, seed=seed)
        speaker_dir = os.path.join(dataset_dir, "wavs", "speakers", speaker_id)
        os.makedirs(speaker_dir, exist_ok=True)

        for _ in range(SAMPLES_PER_SPEAKER):
            seed += 1
            filename = _gen_rand_str(10, seed=seed)
            path = f"wavs/speakers/{speaker_id}/{filename}.wav"

            random.seed(seed)
            transcription = ""
            act = random.choice(ACTIONS)
            obj = random.choice(OBJECTS)
            loc = random.choice(LOCATIONS)

            data.append([idx, path, speaker_id, transcription, act, obj, loc])
            idx += 1

    csv_path = os.path.join(dataset_dir, "data", f"{subset}_data.csv")
    with open(csv_path, "w") as csv_file:
        file_writer = csv.writer(csv_file)
        file_writer.writerows(data)

    return data


def _save_samples(dataset_dir: str, subset: str, seed: int):
    # generate csv file
    data = _gen_csv(dataset_dir, subset, seed)

    # iterate through csv file, save wavs to corresponding files
    header = data[0]
    data = data[1:]  # remove header
    path_idx = header.index("path")

    samples = []
    for row in data:
        wav = get_whitenoise(
            sample_rate=SAMPLE_RATE,
            duration=0.01,
            n_channels=1,
            seed=seed,
        )
        filename = row[path_idx]
        wav_file = os.path.join(dataset_dir, filename)
        save_wav(wav_file, wav, SAMPLE_RATE)

        path = Path(wav_file).stem
        speaker_id, transcription, act, obj, loc = row[2:]
        sample = wav, SAMPLE_RATE, path, speaker_id, transcription, act, obj, loc
        samples.append(sample)

        seed += 1

    return samples


def get_mock_dataset(dataset_dir: str):
    data_folder = os.path.join(dataset_dir, "data")
    wav_folder = os.path.join(dataset_dir, "wavs", "speakers")
    os.makedirs(data_folder, exist_ok=True)
    os.makedirs(wav_folder, exist_ok=True)

    mocked_train_samples = _save_samples(dataset_dir, "train", 1)
    mocked_valid_samples = _save_samples(dataset_dir, "valid", 111)
    mocked_test_samples = _save_samples(dataset_dir, "test", 1111)

    return mocked_train_samples, mocked_valid_samples, mocked_test_samples


class TestFluentSpeechCommands(TempDirMixin, TorchaudioTestCase):
    root_dir = None
    backend = "default"

    mocked_train_samples = []
    mocked_valid_samples = []
    mocked_test_samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        dataset_dir = os.path.join(cls.root_dir, "fluent_speech_commands_dataset")
        (
            cls.mocked_train_samples,
            cls.mocked_valid_samples,
            cls.mocked_test_samples,
        ) = get_mock_dataset(dataset_dir)

    def _testFluentCommands(self, dataset, samples):
        num_samples = 0
        for i, data in enumerate(dataset):
            self.assertEqual(data, samples[i])
            num_samples += 1

        assert num_samples == len(samples)

    def testFluentCommandsTrain(self):
        dataset = fluentcommands.FluentSpeechCommands(self.root_dir, subset="train")
        self._testFluentCommands(dataset, self.mocked_train_samples)

    def testFluentCommandsValid(self):
        dataset = fluentcommands.FluentSpeechCommands(self.root_dir, subset="valid")
        self._testFluentCommands(dataset, self.mocked_valid_samples)

    def testFluentCommandsTest(self):
        dataset = fluentcommands.FluentSpeechCommands(self.root_dir, subset="test")
        self._testFluentCommands(dataset, self.mocked_test_samples)
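For reference, the mock layout above can also be exercised outside the unittest harness with a standalone driver like the sketch below. It assumes the test module (and therefore torchaudio_unittest.common_utils) is importable, which is only guaranteed inside torchaudio's test tree; the temporary-directory handling is likewise an assumption, not part of this diff.

# Hypothetical standalone driver; assumes get_mock_dataset from the test module above is in scope.
import os
import tempfile

from torchaudio.datasets import fluentcommands

with tempfile.TemporaryDirectory() as root:
    dataset_dir = os.path.join(root, "fluent_speech_commands_dataset")
    # Writes {train,valid,test}_data.csv plus short white-noise wavs under dataset_dir.
    train_samples, _, _ = get_mock_dataset(dataset_dir)

    dataset = fluentcommands.FluentSpeechCommands(root, subset="train")
    waveform, sample_rate, path, speaker_id, transcription, action, obj, location = dataset[0]
    print(waveform.shape, sample_rate, speaker_id, action, obj, location)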
@@ -2,6 +2,7 @@ from .cmuarctic import CMUARCTIC
from .cmudict import CMUDict
from .commonvoice import COMMONVOICE
from .dr_vctk import DR_VCTK
from .fluentcommands import FluentSpeechCommands
from .gtzan import GTZAN
from .librilight_limited import LibriLightLimited
from .librimix import LibriMix
@@ -31,4 +32,5 @@ __all__ = [
    "LIBRITTS",
    "TEDLIUM",
    "QUESST14",
    "FluentSpeechCommands",
]
import csv
import os
from pathlib import Path
from typing import Union

import torchaudio
from torch.utils.data import Dataset


class FluentSpeechCommands(Dataset):
    """Create *Fluent Speech Commands* [:footcite:`fluent`] Dataset

    Args:
        root (str or Path): Path to the directory where the dataset is found.
        subset (str, optional): Subset of the dataset to use. Options: [``"train"``, ``"valid"``, ``"test"``].
            (Default: ``"train"``)
    """

    def __init__(self, root: Union[str, Path], subset: str = "train"):
        assert subset in ["train", "valid", "test"], "`subset` must be one of ['train', 'valid', 'test']"

        root = os.fspath(root)
        self._path = os.path.join(root, "fluent_speech_commands_dataset")
        subset_path = os.path.join(self._path, "data", f"{subset}_data.csv")

        with open(subset_path) as subset_csv:
            subset_reader = csv.reader(subset_csv)
            data = list(subset_reader)

        self.header = data[0]
        self.data = data[1:]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, n: int):
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            (Tensor, int, str, str, str, str, str, str):
            ``(waveform, sample_rate, path, speaker_id, transcription, action, object, location)``
        """
        sample = self.data[n]

        wav_path = os.path.join(self._path, sample[self.header.index("path")])
        wav, sample_rate = torchaudio.load(wav_path)
        path = Path(wav_path).stem

        speaker_id, transcription, action, obj, location = sample[2:]

        return wav, sample_rate, path, speaker_id, transcription, action, obj, location
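For context on how the new dataset is meant to be consumed, here is a minimal usage sketch. It assumes the Fluent Speech Commands archive has already been downloaded and extracted manually so that <root>/fluent_speech_commands_dataset/data/train_data.csv exists; the class does not download anything itself, and the root path shown is a placeholder.

# Minimal usage sketch; "path/to/root" is a placeholder for a manually prepared root.
from torchaudio.datasets import FluentSpeechCommands

dataset = FluentSpeechCommands("path/to/root", subset="train")

# Each item is the 8-tuple documented in __getitem__ above.
waveform, sample_rate, path, speaker_id, transcription, action, obj, location = dataset[0]
print(waveform.shape, sample_rate, action, obj, location)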