Commit 21b2d139 authored by Zhaoheng Ni's avatar Zhaoheng Ni Committed by Facebook GitHub Bot
Browse files

Add VoxCeleb1 dataset (#2349)

Summary:
This PR adds two dataset classes of VoxCeleb1 corpus.
- `VoxCeleb1Identification`
Each data sample contains the waveform, sample rate, speaker id, and the file id.
- `VoxCeleb1Verification`
Each data sample contains a pair of waveforms, sample rate, the label indicating if they are from the same speaker, and the file ids.

Pull Request resolved: https://github.com/pytorch/audio/pull/2349

Reviewed By: carolineechen

Differential Revision: D35927921

Pulled By: nateanl

fbshipit-source-id: 3e07ddd329178777698841565053eb59befe6449
parent 49551eed
......@@ -114,6 +114,22 @@ VCTK_092
:special-members: __getitem__
VoxCeleb1Identification
~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: VoxCeleb1Identification
:members:
:special-members: __getitem__
VoxCeleb1Verification
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: VoxCeleb1Verification
:members:
:special-members: __getitem__
DR_VCTK
~~~~~~~~
......
......@@ -367,6 +367,7 @@
year = 2019,
doi = {10.5281/zenodo.3338373},
url = {https://doi.org/10.5281/zenodo.3338373}
}
@inproceedings{fluent,
author = {Loren Lugosch and Mirco Ravanelli and Patrick Ignoto and Vikrant Singh Tomar and Yoshua Bengio},
editor = {Gernot Kubin and Zdravko Kacic},
......@@ -375,3 +376,9 @@
pages = {814--818},
year = {2019},
}
@article{nagrani2017voxceleb,
title={Voxceleb: a large-scale speaker identification dataset},
author={Nagrani, Arsha and Chung, Joon Son and Zisserman, Andrew},
journal={arXiv preprint arXiv:1706.08612},
year={2017}
}
import os
from torchaudio.datasets import voxceleb1
from torchaudio_unittest.common_utils import get_whitenoise, save_wav, TempDirMixin, TorchaudioTestCase
# Number of mocked speaker directories ("id10<0..2>") in the fake corpus.
_NUM_SPEAKERS = 3
# Number of mocked YouTube-session sub-directories created per speaker.
_NUM_YOUTUBE = 5
def _save_sample(dataset_dir: str, sample_rate: int, speaker_id: int, youtube_id: int, utterance_id: int, seed: int):
    """Generate one white-noise clip and store it in the mocked VoxCeleb1 layout.

    The clip is written to ``<dataset_dir>/id10<speaker_id>/Zxhsj<youtube_id>/<utterance_id>.wav``.

    Args:
        dataset_dir (str): The directory of the dataset.
        sample_rate (int): Sample rate of waveform.
        speaker_id (int): The index of speaker sub directory.
        youtube_id (int): The index of youtube sub directory.
        utterance_id (int): The utterance index.
        seed (int): The seed to generate the waveform.
    Returns:
        Tuple[torch.Tensor, int, int, str, str]
            The waveform Tensor, sample rate, speaker label, file_name, and the file path.
    """
    # Prepend a fixed random-looking token so the directory resembles a real
    # YouTube video id.
    speaker_dir = "id10" + str(speaker_id)
    clip_dir = "Zxhsj" + str(youtube_id)
    out_dir = os.path.join(dataset_dir, speaker_dir, clip_dir)
    os.makedirs(out_dir, exist_ok=True)
    wav_path = os.path.join(out_dir, f"{utterance_id}.wav")
    waveform = get_whitenoise(
        sample_rate=sample_rate,
        duration=0.01,
        n_channels=1,
        seed=seed,
    )
    save_wav(wav_path, waveform, sample_rate)
    # The dataset reports a dash-joined file id and reads the meta file with
    # '/'-separated relative paths, so build both forms explicitly.
    file_name = f"{speaker_dir}-{clip_dir}-{utterance_id}"
    rel_path = f"{speaker_dir}/{clip_dir}/{utterance_id}.wav"
    return waveform, sample_rate, speaker_id, file_name, rel_path
def get_mock_iden_dataset(root_dir: str, meta_file: str):
    """Get the mocked dataset for VoxCeleb1Identification dataset.

    Args:
        root_dir (str): Directory to the mocked dataset
        meta_file (str): The file name which stores the file list.
    Returns:
        Tuple[List, List, List]:
            The mocked samples for train, dev, and test subsets.
    """
    os.makedirs(root_dir, exist_ok=True)
    wav_dir = os.path.join(root_dir, "wav")
    os.makedirs(wav_dir, exist_ok=True)
    mocked_train_samples, mocked_dev_samples, mocked_test_samples = [], [], []
    sample_rate = 16000
    seed = 0
    idx = 1
    with open(os.path.join(root_dir, meta_file), "w") as f:
        for speaker_id in range(_NUM_SPEAKERS):
            for youtube_id in range(_NUM_YOUTUBE):
                waveform, sample_rate, speaker_id, file_name, file_path = _save_sample(
                    wav_dir, sample_rate, speaker_id, youtube_id, idx, seed
                )
                sample = (waveform, sample_rate, speaker_id, file_name)
                # Round-robin the utterances over the three subsets so every
                # subset is non-empty. The previous condition `idx % 1 == 0`
                # was always true, which routed every sample to "train" and
                # left the dev/test lists empty, making the dev/test subset
                # tests vacuous.
                if idx % 3 == 1:
                    mocked_train_samples.append(sample)
                    f.write(f"1 {file_path}\n")
                elif idx % 3 == 2:
                    mocked_dev_samples.append(sample)
                    f.write(f"2 {file_path}\n")
                else:
                    mocked_test_samples.append(sample)
                    f.write(f"3 {file_path}\n")
                idx += 1
    return (
        mocked_train_samples,
        mocked_dev_samples,
        mocked_test_samples,
    )
def get_mock_veri_dataset(root_dir: str, meta_file: str):
    """Get the mocked dataset for VoxCeleb1Verification dataset.

    Args:
        root_dir (str): Directory to the mocked dataset
        meta_file (str): The file name which stores the file list.
    Returns:
        List[Sample]:
            The mocked samples.
    """
    os.makedirs(root_dir, exist_ok=True)
    wav_dir = os.path.join(root_dir, "wav")
    os.makedirs(wav_dir, exist_ok=True)
    mocked_samples = []
    sample_rate = 16000
    seed = 0
    idx = 1
    with open(os.path.join(root_dir, meta_file), "w") as f:
        for speaker_id1 in range(_NUM_SPEAKERS):
            for speaker_id2 in range(_NUM_SPEAKERS):
                for youtube_id in range(_NUM_YOUTUBE):
                    waveform_spk1, sample_rate, _, file_name_spk1, file_path_spk1 = _save_sample(
                        wav_dir, sample_rate, speaker_id1, youtube_id, idx, seed
                    )
                    # Bug fix: the second utterance must come from
                    # ``speaker_id2`` (the original passed ``speaker_id1``,
                    # so both clips of every pair belonged to the same
                    # speaker while the label claimed otherwise).
                    waveform_spk2, sample_rate, _, file_name_spk2, file_path_spk2 = _save_sample(
                        wav_dir, sample_rate, speaker_id2, youtube_id, idx + 1, seed
                    )
                    # label == 1 iff both utterances are from the same speaker.
                    if speaker_id1 == speaker_id2:
                        label = 1
                    else:
                        label = 0
                    sample = (waveform_spk1, waveform_spk2, sample_rate, label, file_name_spk1, file_name_spk2)
                    mocked_samples.append(sample)
                    f.write(f"{label} {file_path_spk1} {file_path_spk2}\n")
                    idx += 2
    return mocked_samples
class TestVoxCeleb1Identification(TempDirMixin, TorchaudioTestCase):
    """Exercise VoxCeleb1Identification against a mocked on-disk corpus."""

    root_dir = None
    backend = "default"
    meta_file = "iden_list.txt"
    train_samples = {}
    dev_samples = {}
    test_samples = {}

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        subsets = get_mock_iden_dataset(cls.root_dir, cls.meta_file)
        cls.train_samples, cls.dev_samples, cls.test_samples = subsets

    def _testVoxCeleb1Identification(self, dataset, data_samples):
        # Walk the dataset and compare each yielded tuple field-by-field
        # against the mocked reference samples.
        num_seen = 0
        for n, item in enumerate(dataset):
            waveform, sample_rate, speaker_id, file_id = item
            expected = data_samples[n]
            self.assertEqual(waveform, expected[0])
            assert sample_rate == expected[1]
            assert speaker_id == expected[2]
            assert file_id == expected[3]
            num_seen += 1
        # The dataset must yield exactly as many samples as were mocked.
        assert num_seen == len(data_samples)

    def testVoxCeleb1SubsetTrain(self):
        dataset = voxceleb1.VoxCeleb1Identification(self.root_dir, subset="train", meta_url=self.meta_file)
        self._testVoxCeleb1Identification(dataset, self.train_samples)

    def testVoxCeleb1SubsetDev(self):
        dataset = voxceleb1.VoxCeleb1Identification(self.root_dir, subset="dev", meta_url=self.meta_file)
        self._testVoxCeleb1Identification(dataset, self.dev_samples)

    def testVoxCeleb1SubsetTest(self):
        dataset = voxceleb1.VoxCeleb1Identification(self.root_dir, subset="test", meta_url=self.meta_file)
        self._testVoxCeleb1Identification(dataset, self.test_samples)
class TestVoxCeleb1Verification(TempDirMixin, TorchaudioTestCase):
    """Exercise VoxCeleb1Verification against a mocked on-disk corpus."""

    root_dir = None
    backend = "default"
    meta_file = "veri_test.txt"
    # Pairs produced by ``get_mock_veri_dataset`` in ``setUpClass``.
    # (The unused ``train_samples``/``dev_samples``/``test_samples`` dict
    # attributes copied from the identification test have been removed.)
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.samples = get_mock_veri_dataset(cls.root_dir, cls.meta_file)

    def testVoxCeleb1Verification(self):
        dataset = voxceleb1.VoxCeleb1Verification(self.root_dir, meta_url=self.meta_file)
        num_samples = 0
        for i, (waveform_spk1, waveform_spk2, sample_rate, label, file_id_spk1, file_id_spk2) in enumerate(dataset):
            self.assertEqual(waveform_spk1, self.samples[i][0])
            self.assertEqual(waveform_spk2, self.samples[i][1])
            assert sample_rate == self.samples[i][2]
            assert label == self.samples[i][3]
            assert file_id_spk1 == self.samples[i][4]
            assert file_id_spk2 == self.samples[i][5]
            num_samples += 1
        # The dataset must yield exactly as many pairs as were mocked.
        assert num_samples == len(self.samples)
......@@ -14,6 +14,7 @@ from .quesst14 import QUESST14
from .speechcommands import SPEECHCOMMANDS
from .tedlium import TEDLIUM
from .vctk import VCTK_092
from .voxceleb1 import VoxCeleb1Identification, VoxCeleb1Verification
from .yesno import YESNO
......@@ -35,4 +36,6 @@ __all__ = [
"QUESST14",
"MUSDB_HQ",
"FluentSpeechCommands",
"VoxCeleb1Identification",
"VoxCeleb1Verification",
]
import os
from pathlib import Path
from typing import List, Tuple, Union
import torchaudio
from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio.datasets.utils import extract_archive
# Download locations and SHA256 checksum prefixes for the VoxCeleb1 audio
# archives. The "dev" archive is distributed as four split chunks that are
# downloaded separately and concatenated into one zip before extraction;
# "test" is a single zip.
_ARCHIVE_CONFIGS = {
    "dev": {
        "archive_name": "vox1_dev_wav.zip",
        "urls": [
            "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa",
            "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab",
            "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac",
            "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad",
        ],
        "checksums": [
            "21ec6ca843659ebc2fdbe04b530baa4f191ad4b0971912672d92c158f32226a0",
            "311d21e0c8cbf33573a4fce6c80e5a279d80736274b381c394319fc557159a04",
            "92b64465f2b2a3dc0e4196ae8dd6828cbe9ddd1f089419a11e4cbfe2e1750df0",
            "00e6190c770b27f27d2a3dd26ee15596b17066b715ac111906861a7d09a211a5",
        ],
    },
    "test": {
        "archive_name": "vox1_test_wav.zip",
        "url": "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip",
        "checksum": "8de57f347fe22b2c24526e9f444f689ecf5096fc2a92018cf420ff6b5b15eaea",
    },
}
# Meta file mapping each utterance to the train/dev/test identification split.
_IDEN_SPLIT_URL = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/iden_split.txt"
# Meta file listing verification trial pairs with same/different-speaker labels.
_VERI_TEST_URL = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test.txt"
def _download_extract_wavs(root: str):
    """Download the VoxCeleb1 dev and test audio archives into ``root`` and extract them.

    Args:
        root (str): Directory the archives are downloaded to and extracted in.
    """
    for archive in ["dev", "test"]:
        archive_name = _ARCHIVE_CONFIGS[archive]["archive_name"]
        archive_path = os.path.join(root, archive_name)
        # The zip file of dev data is split into 4 chunks.
        # Download and combine them into one file before extraction.
        if archive == "dev":
            urls = _ARCHIVE_CONFIGS[archive]["urls"]
            checksums = _ARCHIVE_CONFIGS[archive]["checksums"]
            with open(archive_path, "wb") as f:
                for url, checksum in zip(urls, checksums):
                    file_path = os.path.join(root, os.path.basename(url))
                    # hash_prefix makes download_url_to_file verify the chunk's SHA256.
                    download_url_to_file(url, file_path, hash_prefix=checksum)
                    with open(file_path, "rb") as f_split:
                        f.write(f_split.read())
        else:
            url = _ARCHIVE_CONFIGS[archive]["url"]
            checksum = _ARCHIVE_CONFIGS[archive]["checksum"]
            download_url_to_file(url, archive_path, hash_prefix=checksum)
        extract_archive(archive_path)
def _get_flist(root: str, file_path: str, subset: str) -> List[str]:
f_list = []
if subset == "train":
index = 1
elif subset == "dev":
index = 2
else:
index = 3
with open(file_path, "r") as f:
for line in f:
id, path = line.split()
if int(id) == index:
f_list.append(path)
return sorted(f_list)
def _get_paired_flist(root: str, veri_test_path: str):
f_list = []
with open(veri_test_path, "r") as f:
for line in f:
label, path1, path2 = line.split()
f_list.append((label, path1, path2))
return f_list
def _get_file_id(file_path: str, _ext_audio: str):
speaker_id, youtube_id, utterance_id = file_path.split("/")[-3:]
utterance_id = utterance_id.replace(_ext_audio, "")
file_id = "-".join([speaker_id, youtube_id, utterance_id])
return file_id
class VoxCeleb1(Dataset):
    """Create *VoxCeleb1* [:footcite:`nagrani2017voxceleb`] Dataset.

    Base class shared by the identification and verification variants; it only
    locates (or downloads) the ``wav`` directory. Subclasses implement
    ``__getitem__`` and ``__len__``.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (Default: ``False``).
    """

    # Extension of every audio file in the corpus.
    _ext_audio = ".wav"

    def __init__(self, root: Union[str, Path], download: bool = False) -> None:
        # Normalize a possible Path object into a plain string.
        root = os.fspath(root)
        self._path = os.path.join(root, "wav")
        if os.path.isdir(self._path):
            return
        if download:
            _download_extract_wavs(root)
        else:
            raise RuntimeError(
                f"Dataset not found at {self._path}. Please set `download=True` to download the dataset."
            )

    def __getitem__(self, n: int):
        # Subclass responsibility.
        raise NotImplementedError

    def __len__(self) -> int:
        # Subclass responsibility.
        raise NotImplementedError
class VoxCeleb1Identification(VoxCeleb1):
    """Create *VoxCeleb1* [:footcite:`nagrani2017voxceleb`] Dataset for speaker identification task.

    Each data sample contains the waveform, sample rate, speaker id, and the file id.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        subset (str, optional): Subset of the dataset to use. Options: ["train", "dev", "test"]. (Default: ``"train"``)
        meta_url (str, optional): The url of the meta file that lists subset labels and file paths.
            Each row has the format ``subset file_path``, e.g. ``1 id10006/nLEBBc9oIFs/00003.wav``,
            where ``1``, ``2``, ``3`` denote the ``train``, ``dev``, and ``test`` subsets, respectively.
            (Default: ``"https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/iden_split.txt"``)
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (Default: ``False``).
    """

    def __init__(
        self, root: Union[str, Path], subset: str = "train", meta_url: str = _IDEN_SPLIT_URL, download: bool = False
    ) -> None:
        super().__init__(root, download)
        assert subset in ["train", "dev", "test"], "`subset` must be one of ['train', 'dev', 'test']"
        # Fetch the split meta file (e.g. iden_split.txt) into root unless a
        # file of the same name is already there.
        meta_list_path = os.path.join(root, os.path.basename(meta_url))
        if not os.path.exists(meta_list_path):
            download_url_to_file(meta_url, meta_list_path)
        self._flist = _get_flist(self._path, meta_list_path, subset)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, int, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            (Tensor, int, int, str):
                ``(waveform, sample_rate, speaker_id, file_id)``
        """
        relative_path = self._flist[n]
        file_id = _get_file_id(relative_path, self._ext_audio)
        # The leading file-id field is "id10XXX"; drop the first three
        # characters and parse the rest as the integer speaker label.
        speaker_id = int(file_id.partition("-")[0][3:])
        waveform, sample_rate = torchaudio.load(os.path.join(self._path, relative_path))
        return (waveform, sample_rate, speaker_id, file_id)

    def __len__(self) -> int:
        return len(self._flist)
class VoxCeleb1Verification(VoxCeleb1):
    """Create *VoxCeleb1* [:footcite:`nagrani2017voxceleb`] Dataset for speaker verification task.

    Each data sample contains a pair of waveforms, sample rate, the label indicating if they are
    from the same speaker, and the file ids.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        meta_url (str, optional): The url of the meta file that lists utterance pairs and labels.
            Each row has the format ``label file_path1 file_path2``, e.g.
            ``1 id10270/x6uYqmx31kE/00001.wav id10270/8jEAjG6SegY/00008.wav``, where
            ``1`` means the two utterances are from the same speaker and ``0`` means not.
            (Default: ``"https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test.txt"``)
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (Default: ``False``).
    """

    def __init__(self, root: Union[str, Path], meta_url: str = _VERI_TEST_URL, download: bool = False) -> None:
        super().__init__(root, download)
        # Fetch the trial-pair meta file (e.g. veri_test.txt) into root unless
        # a file of the same name is already there.
        meta_list_path = os.path.join(root, os.path.basename(meta_url))
        if not os.path.exists(meta_list_path):
            download_url_to_file(meta_url, meta_list_path)
        self._flist = _get_paired_flist(self._path, meta_list_path)

    def __getitem__(self, n: int) -> Tuple[Tensor, Tensor, int, int, str, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded.

        Returns:
            (Tensor, Tensor, int, int, str, str):
                ``(waveform_spk1, waveform_spk2, sample_rate, label, file_id_spk1, file_id_spk2)``
        """
        raw_label, path_spk1, path_spk2 = self._flist[n]
        file_id_spk1 = _get_file_id(path_spk1, self._ext_audio)
        file_id_spk2 = _get_file_id(path_spk2, self._ext_audio)
        waveform_spk1, rate_spk1 = torchaudio.load(os.path.join(self._path, path_spk1))
        waveform_spk2, rate_spk2 = torchaudio.load(os.path.join(self._path, path_spk2))
        # Both utterances of a trial pair are expected to share one sample rate.
        assert rate_spk1 == rate_spk2
        return (waveform_spk1, waveform_spk2, rate_spk1, int(raw_label), file_id_spk1, file_id_spk2)

    def __len__(self) -> int:
        return len(self._flist)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment