Using Path and glob instead of walk_files (#1069)

- yesno
- librispeech
- libritts
- speechcommands

Co-authored-by: krishnakalyan3 <skalyan@cloudera.com>
Co-authored-by: Vincent Quenneville-Belair <vincentqb@gmail.com>

Using Path and glob instead of walk_files (#1069)
- yesno
- librispeech
- libritts
- speechcommands

Co-authored-by: krishnakalyan3 <skalyan@cloudera.com>
Co-authored-by: Vincent Quenneville-Belair <vincentqb@gmail.com>
d25a4ddf · Krishna Kalyan · GitHub · 79c97fb0 · d25a4ddf · d25a4ddf
Unverified commit d25a4ddf, authored Dec 15, 2020 by Krishna Kalyan; committed by GitHub on Dec 15, 2020.
4 changed files
--- a/torchaudio/datasets/librispeech.py
+++ b/torchaudio/datasets/librispeech.py
@@ -8,7 +8,6 @@ from torch.utils.data import Dataset
 from torchaudio.datasets.utils import (
    download_url,
    extract_archive,
-    walk_files,
 )

 URL = "train-clean-100"
@@ -125,10 +124,7 @@ class LIBRISPEECH(Dataset):
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)

-        walker = walk_files(
-            self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True
-        )
-        self._walker = list(walker)
+        self._walker = sorted(str(p.stem) for p in Path(self._path).glob('*/*/*' + self._ext_audio))

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
        """Load the n-th sample from the dataset.

--- a/torchaudio/datasets/libritts.py
+++ b/torchaudio/datasets/libritts.py
@@ -8,7 +8,6 @@ from torch.utils.data import Dataset
 from torchaudio.datasets.utils import (
    download_url,
    extract_archive,
-    walk_files,
 )

 URL = "train-clean-100"
@@ -126,10 +125,7 @@ class LIBRITTS(Dataset):
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)

-        walker = walk_files(
-            self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True
-        )
-        self._walker = list(walker)
+        self._walker = sorted(str(p.stem) for p in Path(self._path).glob('*/*/*' + self._ext_audio))

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int, int, str]:
        """Load the n-th sample from the dataset.

--- a/torchaudio/datasets/speechcommands.py
+++ b/torchaudio/datasets/speechcommands.py
@@ -8,7 +8,6 @@ from torch import Tensor
 from torchaudio.datasets.utils import (
    download_url,
    extract_archive,
-    walk_files
 )

 FOLDER_IN_ARCHIVE = "SpeechCommands"
@@ -110,7 +109,7 @@ class SPEECHCOMMANDS(Dataset):
            self._walker = _load_list(self._path, "testing_list.txt")
        elif subset == "training":
            excludes = set(_load_list(self._path, "validation_list.txt", "testing_list.txt"))
-            walker = walk_files(self._path, suffix=".wav", prefix=True)
+            walker = sorted(str(p) for p in Path(self._path).glob('*/*.wav'))
            self._walker = [
                w for w in walker
                if HASH_DIVIDER in w
@@ -118,7 +117,7 @@ class SPEECHCOMMANDS(Dataset):
                and os.path.normpath(w) not in excludes
            ]
        else:
-            walker = walk_files(self._path, suffix=".wav", prefix=True)
+            walker = sorted(str(p) for p in Path(self._path).glob('*/*.wav'))
            self._walker = [w for w in walker if HASH_DIVIDER in w and EXCEPT_FOLDER not in w]

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int]:

--- a/torchaudio/datasets/yesno.py
+++ b/torchaudio/datasets/yesno.py
@@ -9,7 +9,6 @@ from torch.utils.data import Dataset
 from torchaudio.datasets.utils import (
    download_url,
    extract_archive,
-    walk_files
 )

 URL = "http://www.openslr.org/resources/1/waves_yesno.tar.gz"
@@ -85,10 +84,7 @@ class YESNO(Dataset):
                "Dataset not found. Please use `download=True` to download it."
            )

-        walker = walk_files(
-            self._path, suffix=self._ext_audio, prefix=False, remove_suffix=True
-        )
-        self._walker = list(walker)
+        self._walker = sorted(str(p.stem) for p in Path(self._path).glob('*' + self._ext_audio))

    def __getitem__(self, n: int) -> Tuple[Tensor, int, List[int]]:
        """Load the n-th sample from the dataset.