Unverified Commit 4b8aad7a authored by jimchen90's avatar jimchen90 Committed by GitHub
Browse files

Add LibriTTS dataset (#790)



* Add libritts

Add LibriTTS dataset draft

* Add libritts

Use two separate ids for utterance_id.

* Update output form

Use full_id as utterance_id.

* Update format

Add space and test black format

* Update test method

* Add audio and text test

Generate audio and text files on-the-fly in the test

* Update format

* Fix test error and remove assets libritts

The test error is fixed by sorting the samples by the 4th element instead of the 2nd element. Since the files are generated on-the-fly, the LibriTTS files in assets are removed.

* Add seed in `get_whitenoise` function

* Change utterance to text

Change `_utterance` to `_text`.
Co-authored-by: Ji Chen <jimchen90@devfair0160.h2.fair>
parent 209858ea
...@@ -57,6 +57,14 @@ LIBRISPEECH ...@@ -57,6 +57,14 @@ LIBRISPEECH
:special-members: :special-members:
LIBRITTS
~~~~~~~~
.. autoclass:: LIBRITTS
:members: __getitem__
:special-members:
LJSPEECH LJSPEECH
~~~~~~~~ ~~~~~~~~
......
...@@ -10,6 +10,7 @@ from torchaudio.datasets.yesno import YESNO ...@@ -10,6 +10,7 @@ from torchaudio.datasets.yesno import YESNO
from torchaudio.datasets.ljspeech import LJSPEECH from torchaudio.datasets.ljspeech import LJSPEECH
from torchaudio.datasets.gtzan import GTZAN from torchaudio.datasets.gtzan import GTZAN
from torchaudio.datasets.cmuarctic import CMUARCTIC from torchaudio.datasets.cmuarctic import CMUARCTIC
from torchaudio.datasets.libritts import LIBRITTS
from .common_utils import ( from .common_utils import (
TempDirMixin, TempDirMixin,
...@@ -110,5 +111,67 @@ class TestYesNo(TempDirMixin, TorchaudioTestCase): ...@@ -110,5 +111,67 @@ class TestYesNo(TempDirMixin, TorchaudioTestCase):
assert label == expected_label assert label == expected_label
class TestLibriTTS(TempDirMixin, TorchaudioTestCase):
    """Test LIBRITTS dataset loading against fixture files generated on-the-fly."""
    backend = 'default'
    root_dir = None
    data = []
    # Each entry: [speaker_id, chapter_id, segment_id, utterance_id].
    # Joined with "_" they form the basename shared by the .wav and .txt files.
    utterance_ids = [
        [19, 198, '000000', '000000'],
        [26, 495, '000004', '000000'],
    ]
    original_text = 'this is the original text.'
    normalized_text = 'this is the normalized text.'

    @classmethod
    def setUpClass(cls):
        """Create a minimal LibriTTS directory tree with synthetic audio/text files."""
        cls.root_dir = cls.get_base_temp_dir()
        base_dir = os.path.join(cls.root_dir, 'LibriTTS', 'train-clean-100')
        for i, utterance_id in enumerate(cls.utterance_ids):
            # Compute the shared basename once instead of re-joining per file.
            full_id = '_'.join(str(u) for u in utterance_id)
            file_dir = os.path.join(base_dir, str(utterance_id[0]), str(utterance_id[1]))
            os.makedirs(file_dir, exist_ok=True)

            # Deterministic audio per sample via the seed.
            path = os.path.join(file_dir, f'{full_id}.wav')
            data = get_whitenoise(sample_rate=8000, duration=6, n_channels=1, dtype='int16', seed=i)
            save_wav(path, data, 8000)
            cls.data.append(normalize_wav(data))

            # Context managers guarantee the text files are flushed and closed
            # before the dataset under test reads them.
            with open(os.path.join(file_dir, f'{full_id}.original.txt'), 'w') as f:
                f.write(cls.original_text)
            with open(os.path.join(file_dir, f'{full_id}.normalized.txt'), 'w') as f:
                f.write(cls.normalized_text)

    def test_libritts(self):
        dataset = LIBRITTS(self.root_dir)
        samples = list(dataset)
        # Sort by speaker_id (element 4) so iteration order matches utterance_ids.
        samples.sort(key=lambda s: s[4])
        for i, (waveform,
                sample_rate,
                original_text,
                normalized_text,
                speaker_id,
                chapter_id,
                utterance_id) in enumerate(samples):
            expected_ids = self.utterance_ids[i]
            expected_data = self.data[i]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == 8000
            assert speaker_id == expected_ids[0]
            assert chapter_id == expected_ids[1]
            assert original_text == self.original_text
            assert normalized_text == self.normalized_text
            assert utterance_id == '_'.join(str(u) for u in expected_ids[-4:])
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -7,6 +7,7 @@ from .gtzan import GTZAN ...@@ -7,6 +7,7 @@ from .gtzan import GTZAN
from .yesno import YESNO from .yesno import YESNO
from .ljspeech import LJSPEECH from .ljspeech import LJSPEECH
from .cmuarctic import CMUARCTIC from .cmuarctic import CMUARCTIC
from .libritts import LIBRITTS
__all__ = ( __all__ = (
"COMMONVOICE", "COMMONVOICE",
...@@ -17,6 +18,7 @@ __all__ = ( ...@@ -17,6 +18,7 @@ __all__ = (
"LJSPEECH", "LJSPEECH",
"GTZAN", "GTZAN",
"CMUARCTIC", "CMUARCTIC",
"LIBRITTS"
"diskcache_iterator", "diskcache_iterator",
"bg_iterator", "bg_iterator",
) )
import os
from typing import Tuple
import torchaudio
from torch import Tensor
from torch.utils.data import Dataset
from torchaudio.datasets.utils import (
download_url,
extract_archive,
walk_files,
)
URL = "train-clean-100"
FOLDER_IN_ARCHIVE = "LibriTTS"
_CHECKSUMS = {
"http://www.openslr.org/60/dev-clean.tar.gz": "0c3076c1e5245bb3f0af7d82087ee207",
"http://www.openslr.org/60/dev-other.tar.gz": "815555d8d75995782ac3ccd7f047213d",
"http://www.openslr.org/60/test-clean.tar.gz": "7bed3bdb047c4c197f1ad3bc412db59f",
"http://www.openslr.org/60/test-other.tar.gz": "ae3258249472a13b5abef2a816f733e4",
"http://www.openslr.org/60/train-clean-100.tar.gz": "4a8c202b78fe1bc0c47916a98f3a2ea8",
"http://www.openslr.org/60/train-clean-360.tar.gz": "a84ef10ddade5fd25df69596a2767b2d",
"http://www.openslr.org/60/train-other-500.tar.gz": "7b181dd5ace343a5f38427999684aa6f",
}
def load_libritts_item(
    fileid: str,
    path: str,
    ext_audio: str,
    ext_original_txt: str,
    ext_normalized_txt: str,
) -> Tuple[Tensor, int, str, str, int, int, str]:
    """Load one LibriTTS sample identified by ``fileid``.

    Args:
        fileid (str): Sample id of the form
            ``"<speaker>_<chapter>_<segment>_<utterance>"``; it is also the
            basename shared by the ``.wav`` and text files.
        path (str): Root of the extracted subset; files live under
            ``<path>/<speaker>/<chapter>/``.
        ext_audio (str): Audio file extension (e.g. ``".wav"``).
        ext_original_txt (str): Original-text file extension.
        ext_normalized_txt (str): Normalized-text file extension.

    Returns:
        Tuple of ``(waveform, sample_rate, original_text, normalized_text,
        speaker_id, chapter_id, utterance_id)``.

    Raises:
        ValueError: If ``fileid`` does not contain exactly four "_"-separated
            fields.
    """
    # Only the first two fields name directories; the full fileid is both the
    # returned utterance_id and the file basename.
    speaker_id, chapter_id, _, _ = fileid.split("_")
    utterance_id = fileid

    file_dir = os.path.join(path, speaker_id, chapter_id)
    original_text = os.path.join(file_dir, utterance_id + ext_original_txt)
    normalized_text = os.path.join(file_dir, utterance_id + ext_normalized_txt)
    file_audio = os.path.join(file_dir, utterance_id + ext_audio)

    # Load audio
    waveform, sample_rate = torchaudio.load(file_audio)

    # Load original text (first line only; any trailing newline is kept).
    with open(original_text) as ft:
        original_text = ft.readline()

    # Load normalized text
    with open(normalized_text, "r") as ft:
        normalized_text = ft.readline()

    return (
        waveform,
        sample_rate,
        original_text,
        normalized_text,
        int(speaker_id),
        int(chapter_id),
        utterance_id,
    )
class LIBRITTS(Dataset):
    """Create a Dataset for LibriTTS.

    Each item is a tuple of the form:
    ``(waveform, sample_rate, original_text, normalized_text, speaker_id,
    chapter_id, utterance_id)``

    Args:
        root (str): Directory where the dataset is found or downloaded to.
        url (str): Download URL, or the name of one of the subsets
            ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``,
            ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"``,
            ``"train-other-500"``. (default: ``"train-clean-100"``)
        folder_in_archive (str): Top-level directory of the extracted dataset.
            (default: ``"LibriTTS"``)
        download (bool): Whether to download the dataset if it is not found
            at the root path. (default: ``False``)
    """

    _ext_original_txt = ".original.txt"
    _ext_normalized_txt = ".normalized.txt"
    _ext_audio = ".wav"

    def __init__(
        self,
        root: str,
        url: str = URL,
        folder_in_archive: str = FOLDER_IN_ARCHIVE,
        download: bool = False,
    ) -> None:

        if url in [
            "dev-clean",
            "dev-other",
            "test-clean",
            "test-other",
            "train-clean-100",
            "train-clean-360",
            "train-other-500",
        ]:
            ext_archive = ".tar.gz"
            base_url = "http://www.openslr.org/resources/60/"

            # Build the URL with plain concatenation: os.path.join is for
            # filesystem paths and is not portable for URLs (backslash
            # separator on Windows).
            url = base_url + url + ext_archive

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        # Strip ".tar.gz" to recover the subset name, e.g. "train-clean-100".
        basename = basename.split(".")[0]
        folder_in_archive = os.path.join(folder_in_archive, basename)

        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    # NOTE(review): _CHECKSUMS keys omit the "resources/" path
                    # segment present in the URL built above, so this lookup
                    # appears to always return None and skip verification —
                    # confirm and align the keys before relying on it.
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)

        # Collect every audio file id (path suffix stripped) up front so
        # __len__ / __getitem__ are cheap.
        walker = walk_files(
            self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True
        )
        self._walker = list(walker)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int, int, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): Index of the sample to load.

        Returns:
            ``(waveform, sample_rate, original_text, normalized_text,
            speaker_id, chapter_id, utterance_id)``
        """
        fileid = self._walker[n]
        return load_libritts_item(
            fileid,
            self._path,
            self._ext_audio,
            self._ext_original_txt,
            self._ext_normalized_txt,
        )

    def __len__(self) -> int:
        """Return the number of samples in the dataset."""
        return len(self._walker)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment