"vscode:/vscode.git/clone" did not exist on "bdb8b2b7f1bd1a56d20889e87d56302e46000ad8"
Unverified Commit 2205cc9e authored by JianwuXu, committed by GitHub

Update VCTK_092 interface and add tests (#875)

* Tweak docstring, audio_ext, load method signature and constructor of VCTK_092

* Add test for VCTK_092 dataset.
parent 4bfebd85
@@ -93,8 +93,6 @@ VCTK_092
 ~~~~~~~~

 .. autoclass:: VCTK_092
-  :members: __getitem__
-  :special-members:

 YESNO
import os

from torchaudio.datasets import vctk

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)

# Used to generate a unique utterance for each dummy audio file
UTTERANCE = [
    'Please call Stella',
    'Ask her to bring these things',
    'with her from the store',
    'Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob',
    'We also need a small plastic snake and a big toy frog for the kids',
    'She can scoop these things into three red bags, and we will go meet her Wednesday at the train station',
    'When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow',
    'The rainbow is a division of white light into many beautiful colors',
    'These take the shape of a long round arch, with its path high above, and its two ends \
apparently beyond the horizon',
    'There is, according to legend, a boiling pot of gold at one end',
]


class TestVCTK(TempDirMixin, TorchaudioTestCase):
    backend = 'default'

    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        dataset_dir = os.path.join(cls.root_dir, 'VCTK-Corpus-0.92')
        os.makedirs(dataset_dir, exist_ok=True)
        sample_rate = 48000
        seed = 0

        for speaker in range(225, 230):
            speaker_id = 'p' + str(speaker)
            audio_dir = os.path.join(dataset_dir, 'wav48_silence_trimmed', speaker_id)
            os.makedirs(audio_dir, exist_ok=True)
            file_dir = os.path.join(dataset_dir, 'txt', speaker_id)
            os.makedirs(file_dir, exist_ok=True)

            for utterance_id in range(1, 11):
                filename = f'{speaker_id}_{utterance_id:03d}_mic2'
                audio_file_path = os.path.join(audio_dir, filename + '.wav')

                data = get_whitenoise(
                    sample_rate=sample_rate,
                    duration=0.01,
                    n_channels=1,
                    dtype='float32',
                    seed=seed,
                )
                save_wav(audio_file_path, data, sample_rate)

                txt_file_path = os.path.join(file_dir, filename[:-5] + '.txt')
                utterance = UTTERANCE[utterance_id - 1]
                with open(txt_file_path, 'w') as f:
                    f.write(utterance)

                sample = (
                    normalize_wav(data),
                    sample_rate,
                    utterance,
                    speaker_id,
                    utterance_id,
                )
                cls.samples.append(sample)
                seed += 1

    def test_vctk(self):
        dataset = vctk.VCTK_092(self.root_dir, audio_ext=".wav")

        num_samples = 0
        for i, (data, sample_rate, utterance, speaker_id, utterance_id) in enumerate(dataset):
            self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.samples[i][1]
            assert utterance == self.samples[i][2]
            assert speaker_id == self.samples[i][3]
            assert int(utterance_id) == self.samples[i][4]
            num_samples += 1

        assert num_samples == len(self.samples)
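For reference, a sketch of the dummy directory tree that `setUpClass` builds (derived from the loops above; the real VCTK 0.92 archive ships FLAC audio, which is why the test constructs the dataset with `audio_ext=".wav"`):

# <temp root>/VCTK-Corpus-0.92/
#     txt/p225/p225_001.txt ... p225_010.txt
#     wav48_silence_trimmed/p225/p225_001_mic2.wav ... p225_010_mic2.wav
#     (and likewise for speakers p226 through p229)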
@@ -2,7 +2,7 @@ from .commonvoice import COMMONVOICE
 from .librispeech import LIBRISPEECH
 from .speechcommands import SPEECHCOMMANDS
 from .utils import bg_iterator, diskcache_iterator
-from .vctk import VCTK
+from .vctk import VCTK, VCTK_092
 from .gtzan import GTZAN
 from .yesno import YESNO
 from .ljspeech import LJSPEECH
@@ -14,6 +14,7 @@ __all__ = (
     "LIBRISPEECH",
     "SPEECHCOMMANDS",
     "VCTK",
+    "VCTK_092",
     "YESNO",
     "LJSPEECH",
     "GTZAN",
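With the export added above, the new class resolves at the package level; a minimal sketch:

from torchaudio.datasets import VCTK_092  # now exported at the package level
import torchaudio.datasets

assert "VCTK_092" in torchaudio.datasets.__all__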
@@ -15,11 +15,12 @@ from torchaudio.datasets.utils import (
 URL = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
 FOLDER_IN_ARCHIVE = "VCTK-Corpus"
 _CHECKSUMS = {
-    "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip":
-    "8a6ba2946b36fcbef0212cad601f4bfa"
+    "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip": "8a6ba2946b36fcbef0212cad601f4bfa"
 }

-Sample = namedtuple('Sample', ['waveform', 'sample_rate', 'utterance', 'speaker_id', 'utterance_id'])
+Sample = namedtuple(
+    "Sample", ["waveform", "sample_rate", "utterance", "speaker_id", "utterance_id"]
+)


 def load_vctk_item(fileid: str,
@@ -142,23 +143,45 @@ class VCTK(Dataset):

 class VCTK_092(Dataset):
-    """
-    Create a Dataset for VCTK 0.92, the latest version of the VCTK dataset.
-    Each item is a tuple of the form: (waveform, sample_rate, utterance, speaker_id, utterance_id)
-    Folder `p315` will be ignored due to the non-existent corresponding text files.
-    For more information about the dataset visit: https://datashare.is.ed.ac.uk/handle/10283/3443
+    """Create VCTK 0.92 Dataset
+
+    An item is a ``namedtuple`` of (``waveform``, ``sample_rate``, ``utterance``,
+    ``speaker_id``, ``utterance_id``)
+
+    Args:
+        root (str): Root directory where the dataset's top level directory is found.
+        mic_id (str): Microphone ID. Either ``"mic1"`` or ``"mic2"``
+        download (bool, optional): Download the dataset if not found in the given directory.
+        url (str, optional): URL from which the dataset is downloaded.
+        audio_ext (str, optional): Custom audio extension if dataset is converted to non-default audio format.
+
+    Note:
+        * All the speeches from speaker ``p315`` will be skipped due to the lack of the corresponding text files.
+        * All the speeches from ``p280`` will be skipped for ``mic_id="mic2"`` due to the lack of the audio files.
+        * Some of the speeches from speaker ``p362`` will be skipped due to the lack of the audio files.
+        * See Also: https://datashare.is.ed.ac.uk/handle/10283/3443
     """

     def __init__(
-        self, root: str, url: str = URL, download: bool = False, mic_id: str = "mic2"
-    ) -> None:
-        archive = os.path.join(root, os.path.basename("VCTK-Corpus-0.92.zip"))
+        self,
+        root: str,
+        mic_id: str = "mic2",
+        download: bool = False,
+        url: str = URL,
+        audio_ext=".flac",
+    ):
+        if mic_id not in ["mic1", "mic2"]:
+            raise RuntimeError(
+                f'`mic_id` has to be either "mic1" or "mic2". Found: {mic_id}'
+            )
+
+        archive = os.path.join(root, "VCTK-Corpus-0.92.zip")
         self._path = os.path.join(root, "VCTK-Corpus-0.92")
         self._txt_dir = os.path.join(self._path, "txt")
         self._audio_dir = os.path.join(self._path, "wav48_silence_trimmed")
         self._mic_id = mic_id
+        self._audio_ext = audio_ext

         if download:
             if not os.path.isdir(self._path):
@@ -187,16 +210,18 @@ class VCTK_092(Dataset):
         different parameters required by the user.
         """
         for speaker_id in self._speaker_ids:
+            if speaker_id == "p280" and mic_id == "mic2":
+                continue
             utterance_dir = os.path.join(self._txt_dir, speaker_id)
             for utterance_file in sorted(
                 f for f in os.listdir(utterance_dir) if f.endswith(".txt")
             ):
                 utterance_id = os.path.splitext(utterance_file)[0]
                 audio_path_mic = os.path.join(
-                    self._audio_dir, speaker_id, f"{utterance_id}_{mic_id}.flac"
+                    self._audio_dir,
+                    speaker_id,
+                    f"{utterance_id}_{mic_id}{self._audio_ext}",
                 )
-                if speaker_id == "p280" and mic_id == "mic2":
-                    break
                 if speaker_id == "p362" and not os.path.isfile(audio_path_mic):
                     continue
                 self._sample_ids.append(utterance_id.split("_"))
@@ -208,12 +233,14 @@ class VCTK_092(Dataset):
     def _load_audio(self, file_path) -> Tuple[Tensor, int]:
         return torchaudio.load(file_path)

-    def load_sample(self, speaker_id: str, utterance_id: str, mic_id: str) -> Sample:
+    def _load_sample(self, speaker_id: str, utterance_id: str, mic_id: str) -> Sample:
         utterance_path = os.path.join(
             self._txt_dir, speaker_id, f"{speaker_id}_{utterance_id}.txt"
         )
         audio_path = os.path.join(
-            self._audio_dir, speaker_id, f"{speaker_id}_{utterance_id}_{mic_id}.flac"
+            self._audio_dir,
+            speaker_id,
+            f"{speaker_id}_{utterance_id}_{mic_id}{self._audio_ext}",
         )

         # Reading text
@@ -226,7 +253,7 @@ class VCTK_092(Dataset):
     def __getitem__(self, n: int) -> Sample:
         speaker_id, utterance_id = self._sample_ids[n]
-        return self.load_sample(speaker_id, utterance_id, self._mic_id)
+        return self._load_sample(speaker_id, utterance_id, self._mic_id)

     def __len__(self) -> int:
         return len(self._sample_ids)
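A hedged usage sketch of the updated interface; the `./data` root and the `download=True` flag are illustrative choices, not part of this change:

from torchaudio.datasets import VCTK_092

# mic_id defaults to "mic2" and audio_ext defaults to ".flac", per the constructor above.
dataset = VCTK_092("./data", mic_id="mic2", download=True)

# Each item is the module's Sample namedtuple.
waveform, sample_rate, utterance, speaker_id, utterance_id = dataset[0]
print(sample_rate, speaker_id, utterance_id, utterance)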