Commit 45c7d05a authored by hwangjeff's avatar hwangjeff Committed by Facebook GitHub Bot

Introduce MUSAN dataset (#2888)

Summary:
Introduces the MUSAN dataset (https://www.openslr.org/17/), which contains music, speech, and noise recordings.
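A minimal usage sketch of the class added here (not part of the diff): the root path below is a placeholder, and it assumes MUSAN has already been downloaded from the OpenSLR link above and extracted so that its music, noise, and speech subdirectories sit directly under that root; the class itself does not download anything.

from torchaudio.prototype.datasets import Musan

# Placeholder path; point this at the extracted MUSAN directory.
dataset = Musan("/data/musan", subset="noise")
waveform, sample_rate, filename = dataset[0]  # (torch.Tensor, int, str)
print(filename, sample_rate, waveform.shape)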

Pull Request resolved: https://github.com/pytorch/audio/pull/2888

Reviewed By: xiaohui-zhang

Differential Revision: D41762164

Pulled By: hwangjeff

fbshipit-source-id: 14d5baaa4d40f065dd5d99bf7f2e0a73aa6c31a9
parent e97f3a32
@@ -93,6 +93,7 @@ model implementations and application components.
   :hidden:

   prototype
   prototype.datasets
   prototype.functional
   prototype.models
   prototype.pipelines
.. py:module:: torchaudio.prototype.datasets

torchaudio.prototype.datasets
=============================

.. currentmodule:: torchaudio.prototype.datasets

.. autosummary::
   :toctree: generated
   :nosignatures:
   :template: autosummary/dataset_class.rst

   Musan
@@ -17,6 +17,7 @@ imported explicitly, e.g.

    import torchaudio.prototype.models

.. toctree::

   prototype.datasets
   prototype.functional
   prototype.models
   prototype.pipelines
@@ -472,3 +472,10 @@ abstract = {End-to-end spoken language translation (SLT) has recently gained pop
  pages={3586--3589},
  doi={10.21437/Interspeech.2015-711}
}

@misc{musan2015,
  author = {David Snyder and Guoguo Chen and Daniel Povey},
  title = {{MUSAN}: {A} {M}usic, {S}peech, and {N}oise {C}orpus},
  year = {2015},
  eprint = {1510.08484},
  note = {arXiv:1510.08484v1}
}
import os
from collections import defaultdict

from parameterized import parameterized
from torchaudio.prototype.datasets import Musan
from torchaudio_unittest.common_utils import get_whitenoise, save_wav, TempDirMixin, TorchaudioTestCase

_SUBSET_TO_SUBDIRS = {
    "music": ["fma", "fma-western-art", "hd-classical", "jamendo", "rfm"],
    "noise": ["free-sound", "sound-bible"],
    "speech": ["librivox", "us-gov"],
}
_SAMPLE_RATE = 16_000

def _get_mock_dataset(dataset_dir):
    """
    Creates the following directory structure:
        music
            fma
            fma-western-art
            hd-classical
            jamendo
            rfm
        noise
            free-sound
            sound-bible
        speech
            librivox
            us-gov

    Then, within each leaf subdirectory, adds a WAV file containing white noise sampled at 16 kHz.
    """
    mocked_samples = {}
    seed = 0
    os.makedirs(dataset_dir, exist_ok=True)
    for subset, subdirs in _SUBSET_TO_SUBDIRS.items():
        subset_samples = defaultdict(dict)
        for subdir in subdirs:
            subdir_path = os.path.join(dataset_dir, subset, subdir)
            os.makedirs(subdir_path, exist_ok=True)
            file_name = f"{subset}_{subdir}.wav"
            file_path = os.path.join(subdir_path, file_name)
            # Use a distinct seed per file so every mocked waveform is unique.
            data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=10.00, n_channels=1, dtype="float32", seed=seed)
            save_wav(file_path, data, _SAMPLE_RATE)
            subset_samples[file_name] = (data, file_path)
            seed += 1
        mocked_samples[subset] = subset_samples
    return mocked_samples

class MusanTest(TempDirMixin, TorchaudioTestCase):
    @classmethod
    def setUpClass(cls):
        # Mock files live under <temp>/musan/<subset>/<subdir>/, so the "musan"
        # directory itself is what the dataset expects as ``root``.
        cls.dataset_dir = os.path.join(cls.get_base_temp_dir(), "musan")
        cls.samples = _get_mock_dataset(cls.dataset_dir)

    @parameterized.expand([("music",), ("noise",), ("speech",)])
    def test_musan(self, subset):
        dataset = Musan(self.dataset_dir, subset)
        for data, sample_rate, file_name in dataset:
            self.assertTrue(file_name in self.samples[subset])
            self.assertEqual(data, self.samples[subset][file_name][0])
            self.assertEqual(sample_rate, _SAMPLE_RATE)

    @parameterized.expand([("music",), ("noise",), ("speech",)])
    def test_musan_metadata(self, subset):
        dataset = Musan(self.dataset_dir, subset)
        for idx in range(len(dataset)):
            file_path, sample_rate, file_name = dataset.get_metadata(idx)
            self.assertTrue(file_name in self.samples[subset])
            self.assertEqual(file_path, self.samples[subset][file_name][1])
            self.assertEqual(sample_rate, _SAMPLE_RATE)
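One possible way to exercise the new test locally is to invoke pytest on the test module; the path below is an assumption based on the torchaudio_unittest imports above and is not part of this diff.

# Hypothetical invocation; the exact test-file location is not shown in this excerpt.
import pytest

pytest.main(["test/torchaudio_unittest/prototype/datasets/musan_test.py", "-v"])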
from .musan import Musan

__all__ = ["Musan"]
from pathlib import Path
from typing import Tuple, Union

import torch
from torch.utils.data import Dataset
from torchaudio.datasets.utils import _load_waveform

_SUBSETS = ["music", "noise", "speech"]
_SAMPLE_RATE = 16_000


class Musan(Dataset):
    r"""*MUSAN* :cite:`musan2015` dataset.

    Args:
        root (str or Path): Root directory where the dataset's top-level directory exists.
        subset (str): Subset of the dataset to use. Options: [``"music"``, ``"noise"``, ``"speech"``].
    """

    def __init__(self, root: Union[str, Path], subset: str):
        if subset not in _SUBSETS:
            raise ValueError(f"Invalid subset '{subset}' given. Please provide one of {_SUBSETS}.")
        subset_path = Path(root) / subset
        # Each sample sits two levels below the subset directory: <subset>/<source>/<file>.
        self._walker = [str(p) for p in subset_path.glob("*/*.*")]

    def get_metadata(self, n: int) -> Tuple[str, int, str]:
        r"""Get metadata for the n-th sample in the dataset. Returns filepath instead of waveform,
        but otherwise returns the same fields as :py:func:`__getitem__`.

        Args:
            n (int): Index of sample to be loaded.

        Returns:
            (str, int, str):
                str
                    Path to audio.
                int
                    Sample rate.
                str
                    File name.
        """
        audio_path = self._walker[n]
        return audio_path, _SAMPLE_RATE, Path(audio_path).name

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str]:
        r"""Return the n-th sample in the dataset.

        Args:
            n (int): Index of sample to be loaded.

        Returns:
            (torch.Tensor, int, str):
                torch.Tensor
                    Waveform.
                int
                    Sample rate.
                str
                    File name.
        """
        audio_path, sample_rate, filename = self.get_metadata(n)
        path = Path(audio_path)
        return _load_waveform(path.parent, path.name, sample_rate), sample_rate, filename

    def __len__(self) -> int:
        return len(self._walker)
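Because get_metadata returns the file path rather than a decoded waveform, callers can enumerate the corpus cheaply. A short sketch under the same placeholder-root assumption as above:

from torchaudio.prototype.datasets import Musan

dataset = Musan("/data/musan", subset="speech")
# Build a (path, sample_rate, file_name) manifest without decoding any audio.
manifest = [dataset.get_metadata(i) for i in range(len(dataset))]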