LJ Speech dataset (#439)

* LJ Speech dataset
* refactoring as per @vincentqb's suggestions

LJ Speech dataset (#439)
* LJ Speech dataset
* refactoring as per @vincentqb's suggestions
32bae85c · Taras Sereda · GitHub · 445e14d1 · 32bae85c · 32bae85c
Unverified Commit 32bae85c authored Feb 20, 2020 by Taras Sereda Committed by GitHub Feb 20, 2020
5 changed files
--- a/test/assets/LJSpeech-1.1/metadata.csv
+++ b/test/assets/LJSpeech-1.1/metadata.csv
+LJ001-0001|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition
--- a/test/assets/LJSpeech-1.1/wavs/LJ001-0001.wav
+++ b/test/assets/LJSpeech-1.1/wavs/LJ001-0001.wav
--- a/test/test_datasets.py
+++ b/test/test_datasets.py
@@ -6,6 +6,7 @@ from torchaudio.datasets.librispeech import LIBRISPEECH
 from torchaudio.datasets.utils import diskcache_iterator, bg_iterator
 from torchaudio.datasets.vctk import VCTK
 from torchaudio.datasets.yesno import YESNO
+from torchaudio.datasets.ljspeech import LJSPEECH
 import common_utils
@@ -47,6 +48,10 @@ class TestDatasets(unittest.TestCase):
        for d in data:
            pass
+    def test_ljspeech(self):
+        # Smoke test: building the dataset and indexing its first item must not raise.
+        data = LJSPEECH(self.path)
+        data[0]
 if __name__ == "__main__":
    unittest.main()
--- a/torchaudio/datasets/__init__.py
+++ b/torchaudio/datasets/__init__.py
@@ -3,12 +3,14 @@ from .librispeech import LIBRISPEECH
 from .utils import bg_iterator, diskcache_iterator
 from .vctk import VCTK
 from .yesno import YESNO
+from .ljspeech import LJSPEECH
 __all__ = (
    "COMMONVOICE",
    "LIBRISPEECH",
    "VCTK",
    "YESNO",
+    "LJSPEECH",
    "diskcache_iterator",
    "bg_iterator",
 )
--- a/torchaudio/datasets/ljspeech.py
+++ b/torchaudio/datasets/ljspeech.py
+import os
+import csv
+import torchaudio
+from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader
+from torch.utils.data import Dataset
+# Default archive location; used as the default ``url`` argument of LJSPEECH.
+URL = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"
+# Default sub-folder of the dataset folder from which audio files are loaded.
+FOLDER_IN_ARCHIVE = "wavs"
+def load_ljspeech_item(line, path, ext_audio):
+    """Load one LJSpeech sample described by a metadata row.
+
+    Args:
+        line: Metadata row with exactly three fields, in order:
+            file id, transcript, normalized transcript.
+        path: Directory containing the audio files.
+        ext_audio: Extension appended to the file id to form the audio
+            file name (e.g. ".wav").
+
+    Returns:
+        Tuple ``(waveform, sample_rate, transcript, normalized_transcript)``,
+        where ``waveform`` and ``sample_rate`` come from ``torchaudio.load``.
+
+    Raises:
+        ValueError: If ``line`` does not contain exactly three fields.
+    """
+    # Validate explicitly: an ``assert`` is stripped when Python runs with -O,
+    # and ValueError is what the unpacking below would raise in that case.
+    if len(line) != 3:
+        raise ValueError(
+            "Expected 3 fields in metadata row, got {}: {!r}".format(len(line), line)
+        )
+    fileid, transcript, normalized_transcript = line
+    fileid_audio = os.path.join(path, fileid + ext_audio)
+    # Load audio
+    waveform, sample_rate = torchaudio.load(fileid_audio)
+    return (
+        waveform,
+        sample_rate,
+        transcript,
+        normalized_transcript,
+    )
+class LJSPEECH(Dataset):
+    """Dataset for LJSpeech-1.1.
+
+    Each item is a tuple of the form:
+    waveform, sample_rate, transcript, normalized_transcript
+
+    Args:
+        root: Directory that holds the archive and the dataset folder.
+        url: Location of the dataset archive. Its base name, with the
+            ``.tar.bz2`` extension removed, names the dataset folder.
+        folder_in_archive: Sub-folder of the dataset folder holding the audio.
+        download: If true, fetch the archive (when it is not already in
+            ``root``) and extract it, unless the audio folder already exists.
+    """
+    _ext_audio = ".wav"
+    _ext_archive = '.tar.bz2'
+    def __init__(
+            self, root, url=URL, folder_in_archive=FOLDER_IN_ARCHIVE, download=False
+    ):
+        archive_name = os.path.basename(url)
+        archive = os.path.join(root, archive_name)
+        # Dataset folder name is the archive name without its extension.
+        basename = archive_name.split(self._ext_archive)[0]
+        self._path = os.path.join(root, basename, folder_in_archive)
+        self._metadata_path = os.path.join(root, basename, 'metadata.csv')
+        # Only touch the network/disk when the audio folder is missing.
+        if download and not os.path.isdir(self._path):
+            if not os.path.isfile(archive):
+                download_url(url, root)
+            extract_archive(archive)
+        # LJSpeech metadata is UTF-8; do not depend on the platform default
+        # encoding. ``newline=""`` is what the csv module expects of its input.
+        with open(self._metadata_path, "r", encoding="utf-8", newline="") as metadata:
+            walker = unicode_csv_reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE)
+            # Materialize the rows so items can be indexed after the file closes.
+            self._walker = list(walker)
+    def __getitem__(self, n):
+        """Return the n-th sample as (waveform, sample_rate, transcript, normalized_transcript)."""
+        line = self._walker[n]
+        return load_ljspeech_item(line, self._path, self._ext_audio)
+    def __len__(self):
+        """Return the number of metadata rows."""
+        return len(self._walker)