Commit 45c7d05a authored by hwangjeff's avatar hwangjeff Committed by Facebook GitHub Bot

Introduce MUSAN dataset (#2888)

Summary:
Introduces the MUSAN dataset (https://www.openslr.org/17/), which contains music, speech, and noise recordings.
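A minimal usage sketch of the class added here (not part of the diff): the root path below is a placeholder, and it assumes MUSAN has already been downloaded from the OpenSLR link above and extracted so that its music, noise, and speech subdirectories sit directly under that root; the class itself does not download anything.

from torchaudio.prototype.datasets import Musan

# Placeholder path; point this at the extracted MUSAN directory.
dataset = Musan("/data/musan", subset="noise")
waveform, sample_rate, filename = dataset[0]  # (torch.Tensor, int, str)
print(filename, sample_rate, waveform.shape)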

Pull Request resolved: https://github.com/pytorch/audio/pull/2888

Reviewed By: xiaohui-zhang

Differential Revision: D41762164

Pulled By: hwangjeff

fbshipit-source-id: 14d5baaa4d40f065dd5d99bf7f2e0a73aa6c31a9
parent e97f3a32
@@ -93,6 +93,7 @@ model implementations and application components.
   :hidden:

   prototype
   prototype.datasets
   prototype.functional
   prototype.models
   prototype.pipelines
.. py:module:: torchaudio.prototype.datasets

torchaudio.prototype.datasets
=============================

.. currentmodule:: torchaudio.prototype.datasets

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: autosummary/dataset_class.rst

   Musan
@@ -17,6 +17,7 @@ imported explicitly, e.g.

    import torchaudio.prototype.models

.. toctree::

   prototype.datasets
   prototype.functional
   prototype.models
   prototype.pipelines
@@ -472,3 +472,10 @@ abstract = {End-to-end spoken language translation (SLT) has recently gained pop
  pages={3586--3589},
  doi={10.21437/Interspeech.2015-711}
}

@misc{musan2015,
  author = {David Snyder and Guoguo Chen and Daniel Povey},
  title = {{MUSAN}: {A} {M}usic, {S}peech, and {N}oise {C}orpus},
  year = {2015},
  eprint = {1510.08484},
  note = {arXiv:1510.08484v1}
}
import os
from collections import defaultdict

from parameterized import parameterized
from torchaudio.prototype.datasets import Musan
from torchaudio_unittest.common_utils import get_whitenoise, save_wav, TempDirMixin, TorchaudioTestCase

_SUBSET_TO_SUBDIRS = {
    "music": ["fma", "fma-western-art", "hd-classical", "jamendo", "rfm"],
    "noise": ["free-sound", "sound-bible"],
    "speech": ["librivox", "us-gov"],
}
_SAMPLE_RATE = 16_000

def _get_mock_dataset(dataset_dir):
    """
    Creates the following directory structure:
        music
            fma
            fma-western-art
            hd-classical
            jamendo
            rfm
        noise
            free-sound
            sound-bible
        speech
            librivox
            us-gov

    Then, within each leaf subdirectory, adds a WAV file containing white noise sampled at 16 kHz.
    """
    mocked_samples = {}
    seed = 0
    os.makedirs(dataset_dir, exist_ok=True)
    for subset, subdirs in _SUBSET_TO_SUBDIRS.items():
        subset_samples = defaultdict(dict)
        for subdir in subdirs:
            subdir_path = os.path.join(dataset_dir, subset, subdir)
            os.makedirs(subdir_path, exist_ok=True)
            file_name = f"{subset}_{subdir}.wav"
            file_path = os.path.join(subdir_path, file_name)
            # Use a distinct seed per file so every mocked waveform is unique.
            data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=10.00, n_channels=1, dtype="float32", seed=seed)
            save_wav(file_path, data, _SAMPLE_RATE)
            subset_samples[file_name] = (data, file_path)
            seed += 1
        mocked_samples[subset] = subset_samples
    return mocked_samples

class MusanTest(TempDirMixin, TorchaudioTestCase):
    @classmethod
    def setUpClass(cls):
        # Mock files live under <temp>/musan/<subset>/<subdir>/, so the "musan"
        # directory itself is what the dataset expects as ``root``.
        cls.dataset_dir = os.path.join(cls.get_base_temp_dir(), "musan")
        cls.samples = _get_mock_dataset(cls.dataset_dir)

    @parameterized.expand([("music",), ("noise",), ("speech",)])
    def test_musan(self, subset):
        dataset = Musan(self.dataset_dir, subset)
        for data, sample_rate, file_name in dataset:
            self.assertTrue(file_name in self.samples[subset])
            self.assertEqual(data, self.samples[subset][file_name][0])
            self.assertEqual(sample_rate, _SAMPLE_RATE)

    @parameterized.expand([("music",), ("noise",), ("speech",)])
    def test_musan_metadata(self, subset):
        dataset = Musan(self.dataset_dir, subset)
        for idx in range(len(dataset)):
            file_path, sample_rate, file_name = dataset.get_metadata(idx)
            self.assertTrue(file_name in self.samples[subset])
            self.assertEqual(file_path, self.samples[subset][file_name][1])
            self.assertEqual(sample_rate, _SAMPLE_RATE)
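One possible way to exercise the new test locally is to invoke pytest on the test module; the path below is an assumption based on the torchaudio_unittest imports above and is not part of this diff.

# Hypothetical invocation; the exact test-file location is not shown in this excerpt.
import pytest

pytest.main(["test/torchaudio_unittest/prototype/datasets/musan_test.py", "-v"])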
from .musan import Musan

__all__ = ["Musan"]
from pathlib import Path
from typing import Tuple, Union

import torch
from torch.utils.data import Dataset
from torchaudio.datasets.utils import _load_waveform

_SUBSETS = ["music", "noise", "speech"]
_SAMPLE_RATE = 16_000


class Musan(Dataset):
    r"""*MUSAN* :cite:`musan2015` dataset.

    Args:
        root (str or Path): Root directory where the dataset's top-level directory exists.
        subset (str): Subset of the dataset to use. Options: [``"music"``, ``"noise"``, ``"speech"``].
    """

    def __init__(self, root: Union[str, Path], subset: str):
        if subset not in _SUBSETS:
            raise ValueError(f"Invalid subset '{subset}' given. Please provide one of {_SUBSETS}.")
        subset_path = Path(root) / subset
        # Each sample sits two levels below the subset directory: <subset>/<source>/<file>.
        self._walker = [str(p) for p in subset_path.glob("*/*.*")]

    def get_metadata(self, n: int) -> Tuple[str, int, str]:
        r"""Get metadata for the n-th sample in the dataset. Returns filepath instead of waveform,
        but otherwise returns the same fields as :py:func:`__getitem__`.

        Args:
            n (int): Index of sample to be loaded.

        Returns:
            (str, int, str):
                str
                    Path to audio.
                int
                    Sample rate.
                str
                    File name.
        """
        audio_path = self._walker[n]
        return audio_path, _SAMPLE_RATE, Path(audio_path).name

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str]:
        r"""Return the n-th sample in the dataset.

        Args:
            n (int): Index of sample to be loaded.

        Returns:
            (torch.Tensor, int, str):
                torch.Tensor
                    Waveform.
                int
                    Sample rate.
                str
                    File name.
        """
        audio_path, sample_rate, filename = self.get_metadata(n)
        path = Path(audio_path)
        return _load_waveform(path.parent, path.name, sample_rate), sample_rate, filename

    def __len__(self) -> int:
        return len(self._walker)
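Because get_metadata returns the file path rather than a decoded waveform, callers can enumerate the corpus cheaply. A short sketch under the same placeholder-root assumption as above:

from torchaudio.prototype.datasets import Musan

dataset = Musan("/data/musan", subset="speech")
# Build a (path, sample_rate, file_name) manifest without decoding any audio.
manifest = [dataset.get_metadata(i) for i in range(len(dataset))]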