"vscode:/vscode.git/clone" did not exist on "bdb8b2b7f1bd1a56d20889e87d56302e46000ad8"
Unverified Commit 2205cc9e authored by JianwuXu, committed by GitHub

Update VCTK_092 interface and add tests (#875)

* Tweak docstring, audio_ext, load method signature and constructor of VCTK_092

* Add test for VCTK_092 dataset.
parent 4bfebd85
@@ -93,8 +93,6 @@ VCTK_092
 ~~~~~~~~

 .. autoclass:: VCTK_092
-  :members: __getitem__
-  :special-members:

 YESNO
import os

from torchaudio.datasets import vctk

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)

# Used to generate a unique utterance for each dummy audio file
UTTERANCE = [
    'Please call Stella',
    'Ask her to bring these things',
    'with her from the store',
    'Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob',
    'We also need a small plastic snake and a big toy frog for the kids',
    'She can scoop these things into three red bags, and we will go meet her Wednesday at the train station',
    'When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow',
    'The rainbow is a division of white light into many beautiful colors',
    'These take the shape of a long round arch, with its path high above, and its two ends \
apparently beyond the horizon',
    'There is, according to legend, a boiling pot of gold at one end',
]


class TestVCTK(TempDirMixin, TorchaudioTestCase):
    backend = 'default'

    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        dataset_dir = os.path.join(cls.root_dir, 'VCTK-Corpus-0.92')
        os.makedirs(dataset_dir, exist_ok=True)
        sample_rate = 48000
        seed = 0

        for speaker in range(225, 230):
            speaker_id = 'p' + str(speaker)
            audio_dir = os.path.join(dataset_dir, 'wav48_silence_trimmed', speaker_id)
            os.makedirs(audio_dir, exist_ok=True)
            file_dir = os.path.join(dataset_dir, 'txt', speaker_id)
            os.makedirs(file_dir, exist_ok=True)

            for utterance_id in range(1, 11):
                filename = f'{speaker_id}_{utterance_id:03d}_mic2'
                audio_file_path = os.path.join(audio_dir, filename + '.wav')

                data = get_whitenoise(
                    sample_rate=sample_rate,
                    duration=0.01,
                    n_channels=1,
                    dtype='float32',
                    seed=seed,
                )
                save_wav(audio_file_path, data, sample_rate)

                txt_file_path = os.path.join(file_dir, filename[:-5] + '.txt')
                utterance = UTTERANCE[utterance_id - 1]
                with open(txt_file_path, 'w') as f:
                    f.write(utterance)

                sample = (
                    normalize_wav(data),
                    sample_rate,
                    utterance,
                    speaker_id,
                    utterance_id,
                )
                cls.samples.append(sample)
                seed += 1

    def test_vctk(self):
        dataset = vctk.VCTK_092(self.root_dir, audio_ext=".wav")

        num_samples = 0
        for i, (data, sample_rate, utterance, speaker_id, utterance_id) in enumerate(dataset):
            self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.samples[i][1]
            assert utterance == self.samples[i][2]
            assert speaker_id == self.samples[i][3]
            assert int(utterance_id) == self.samples[i][4]
            num_samples += 1

        assert num_samples == len(self.samples)
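For reference, a sketch of the dummy directory tree that `setUpClass` builds (derived from the loops above; the real VCTK 0.92 archive ships FLAC audio, which is why the test constructs the dataset with `audio_ext=".wav"`):

# <temp root>/VCTK-Corpus-0.92/
#     txt/p225/p225_001.txt ... p225_010.txt
#     wav48_silence_trimmed/p225/p225_001_mic2.wav ... p225_010_mic2.wav
#     (and likewise for speakers p226 through p229)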
@@ -2,7 +2,7 @@ from .commonvoice import COMMONVOICE
 from .librispeech import LIBRISPEECH
 from .speechcommands import SPEECHCOMMANDS
 from .utils import bg_iterator, diskcache_iterator
-from .vctk import VCTK
+from .vctk import VCTK, VCTK_092
 from .gtzan import GTZAN
 from .yesno import YESNO
 from .ljspeech import LJSPEECH
@@ -14,6 +14,7 @@ __all__ = (
     "LIBRISPEECH",
     "SPEECHCOMMANDS",
     "VCTK",
+    "VCTK_092",
     "YESNO",
     "LJSPEECH",
     "GTZAN",
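With the export added above, the new class resolves at the package level; a minimal sketch:

from torchaudio.datasets import VCTK_092  # now exported at the package level
import torchaudio.datasets

assert "VCTK_092" in torchaudio.datasets.__all__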
@@ -15,11 +15,12 @@ from torchaudio.datasets.utils import (
 URL = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
 FOLDER_IN_ARCHIVE = "VCTK-Corpus"
 _CHECKSUMS = {
-    "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip":
-    "8a6ba2946b36fcbef0212cad601f4bfa"
+    "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip": "8a6ba2946b36fcbef0212cad601f4bfa"
 }

-Sample = namedtuple('Sample', ['waveform', 'sample_rate', 'utterance', 'speaker_id', 'utterance_id'])
+Sample = namedtuple(
+    "Sample", ["waveform", "sample_rate", "utterance", "speaker_id", "utterance_id"]
+)


 def load_vctk_item(fileid: str,
@@ -142,23 +143,45 @@ class VCTK(Dataset):

 class VCTK_092(Dataset):
-    """
-    Create a Dataset for VCTK 0.92, the latest version of the VCTK dataset.
-    Each item is a tuple of the form: (waveform, sample_rate, utterance, speaker_id, utterance_id)
-    Folder `p315` will be ignored due to the non-existent corresponding text files.
-    For more information about the dataset visit: https://datashare.is.ed.ac.uk/handle/10283/3443
+    """Create VCTK 0.92 Dataset
+
+    An item is a ``namedtuple`` of (``waveform``, ``sample_rate``, ``utterance``,
+    ``speaker_id``, ``utterance_id``)
+
+    Args:
+        root (str): Root directory where the dataset's top level directory is found.
+        mic_id (str): Microphone ID. Either ``"mic1"`` or ``"mic2"``
+        download (bool, optional): Download the dataset if not found in the given directory.
+        url (str, optional): URL from which the dataset is downloaded.
+        audio_ext (str, optional): Custom audio extension if dataset is converted to non-default audio format.
+
+    Note:
+        * All the speeches from speaker ``p315`` will be skipped due to the lack of the corresponding text files.
+        * All the speeches from ``p280`` will be skipped for ``mic_id="mic2"`` due to the lack of the audio files.
+        * Some of the speeches from speaker ``p362`` will be skipped due to the lack of the audio files.
+        * See Also: https://datashare.is.ed.ac.uk/handle/10283/3443
     """

     def __init__(
-        self, root: str, url: str = URL, download: bool = False, mic_id: str = "mic2"
-    ) -> None:
-        archive = os.path.join(root, os.path.basename("VCTK-Corpus-0.92.zip"))
+        self,
+        root: str,
+        mic_id: str = "mic2",
+        download: bool = False,
+        url: str = URL,
+        audio_ext=".flac",
+    ):
+        if mic_id not in ["mic1", "mic2"]:
+            raise RuntimeError(
+                f'`mic_id` has to be either "mic1" or "mic2". Found: {mic_id}'
+            )
+
+        archive = os.path.join(root, "VCTK-Corpus-0.92.zip")
         self._path = os.path.join(root, "VCTK-Corpus-0.92")
         self._txt_dir = os.path.join(self._path, "txt")
         self._audio_dir = os.path.join(self._path, "wav48_silence_trimmed")
         self._mic_id = mic_id
+        self._audio_ext = audio_ext

         if download:
             if not os.path.isdir(self._path):
@@ -187,16 +210,18 @@ class VCTK_092(Dataset):
         different parameters required by the user.
         """
         for speaker_id in self._speaker_ids:
+            if speaker_id == "p280" and mic_id == "mic2":
+                continue
             utterance_dir = os.path.join(self._txt_dir, speaker_id)
             for utterance_file in sorted(
                 f for f in os.listdir(utterance_dir) if f.endswith(".txt")
             ):
                 utterance_id = os.path.splitext(utterance_file)[0]
                 audio_path_mic = os.path.join(
-                    self._audio_dir, speaker_id, f"{utterance_id}_{mic_id}.flac"
+                    self._audio_dir,
+                    speaker_id,
+                    f"{utterance_id}_{mic_id}{self._audio_ext}",
                 )
-                if speaker_id == "p280" and mic_id == "mic2":
-                    break
                 if speaker_id == "p362" and not os.path.isfile(audio_path_mic):
                     continue
                 self._sample_ids.append(utterance_id.split("_"))
@@ -208,12 +233,14 @@ class VCTK_092(Dataset):
     def _load_audio(self, file_path) -> Tuple[Tensor, int]:
         return torchaudio.load(file_path)

-    def load_sample(self, speaker_id: str, utterance_id: str, mic_id: str) -> Sample:
+    def _load_sample(self, speaker_id: str, utterance_id: str, mic_id: str) -> Sample:
         utterance_path = os.path.join(
             self._txt_dir, speaker_id, f"{speaker_id}_{utterance_id}.txt"
         )
         audio_path = os.path.join(
-            self._audio_dir, speaker_id, f"{speaker_id}_{utterance_id}_{mic_id}.flac"
+            self._audio_dir,
+            speaker_id,
+            f"{speaker_id}_{utterance_id}_{mic_id}{self._audio_ext}",
         )

         # Reading text
@@ -226,7 +253,7 @@ class VCTK_092(Dataset):
     def __getitem__(self, n: int) -> Sample:
         speaker_id, utterance_id = self._sample_ids[n]
-        return self.load_sample(speaker_id, utterance_id, self._mic_id)
+        return self._load_sample(speaker_id, utterance_id, self._mic_id)

     def __len__(self) -> int:
         return len(self._sample_ids)
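A hedged usage sketch of the updated interface; the `./data` root and the `download=True` flag are illustrative choices, not part of this change:

from torchaudio.datasets import VCTK_092

# mic_id defaults to "mic2" and audio_ext defaults to ".flac", per the constructor above.
dataset = VCTK_092("./data", mic_id="mic2", download=True)

# Each item is the module's Sample namedtuple.
waveform, sample_rate, utterance, speaker_id, utterance_id = dataset[0]
print(sample_rate, speaker_id, utterance_id, utterance)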