Unverified Commit 8187dc0a authored by Aziz, committed by GitHub

Remove walk_files (#1111)

The use of `walk_files` made it ambiguous who is responsible for locating
the correct set of files (the Dataset class, or the utility?).
In fact, recursively globbing everything is not the right approach when implementing a
Dataset: if you are accessing a specific dataset, then
its directory structure and file locations are already determined, so there is no need to recurse to an arbitrary depth.
Each Dataset implementation should glob exactly the set of files it requires.
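As a rough illustration of the intended pattern (the dataset class, directory layout, and file extension below are hypothetical and not part of this change), a Dataset can glob its own known locations directly instead of walking the whole tree:

```python
from pathlib import Path

from torch.utils.data import Dataset


class MyAudioDataset(Dataset):
    """Hypothetical dataset illustrating the intended pattern.

    The dataset knows its own layout (assumed here to be ``<root>/clips/*.wav``),
    so it globs exactly the files it needs; no arbitrary-depth recursion required.
    """

    def __init__(self, root: str) -> None:
        # Glob only the known location within the dataset directory.
        self._files = sorted(Path(root).glob("clips/*.wav"))

    def __getitem__(self, index: int) -> Path:
        return self._files[index]

    def __len__(self) -> int:
        return len(self._files)
```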
parent 37692d8d
import os
from pathlib import Path
from torchaudio.datasets import utils as dataset_utils
from torchaudio.datasets.commonvoice import COMMONVOICE
@@ -11,44 +8,6 @@ from torchaudio_unittest.common_utils import (
)
class TestWalkFiles(TempDirMixin, TorchaudioTestCase):
    root = None
    expected = None

    def _add_file(self, *parts):
        path = self.get_temp_path(*parts)
        self.expected.append(path)
        Path(path).touch()

    def setUp(self):
        self.root = self.get_temp_path()
        self.expected = []
        # level 1
        for filename in ['a.txt', 'b.txt', 'c.txt']:
            self._add_file(filename)
        # level 2
        for dir1 in ['d1', 'd2', 'd3']:
            for filename in ['d.txt', 'e.txt', 'f.txt']:
                self._add_file(dir1, filename)
            # level 3
            for dir2 in ['d1', 'd2', 'd3']:
                for filename in ['g.txt', 'h.txt', 'i.txt']:
                    self._add_file(dir1, dir2, filename)
        print('\n'.join(self.expected))

    def test_walk_files(self):
        """walk_files should traverse files in alphabetical order"""
        n_ites = 0
        for i, path in enumerate(dataset_utils.walk_files(self.root, '.txt', prefix=True)):
            found = os.path.join(self.root, path)
            assert found == self.expected[i]
            n_ites += 1
        assert n_ites == len(self.expected)


class TestIterator(TorchaudioTestCase):
    backend = 'default'
    path = get_asset_path('CommonVoice', 'cv-corpus-4-2019-12-10', 'tt')
......
import errno
import hashlib
import logging
import os
import sys
import tarfile
import threading
import urllib
import urllib.request
import zipfile
from _io import TextIOWrapper
from queue import Queue
from typing import Any, Iterable, List, Optional, Tuple, Union
from typing import Any, Iterable, List, Optional
import torch
import urllib
import urllib.request
from torch.utils.data import Dataset
from torch.utils.model_zoo import tqdm
@@ -203,42 +200,6 @@ def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bo
raise NotImplementedError("We currently only support tar.gz, tgz, and zip achives.")
def walk_files(root: str,
               suffix: Union[str, Tuple[str]],
               prefix: bool = False,
               remove_suffix: bool = False) -> Iterable[str]:
    """List recursively all files ending with a suffix at a given root

    Args:
        root (str): Path to directory whose folders need to be listed
        suffix (str or tuple): Suffix of the files to match, e.g. '.png' or ('.jpg', '.png').
            It uses the Python "str.endswith" method and is passed directly
        prefix (bool, optional): If true, prepends the full path to each result, otherwise
            only returns the name of the files found (Default: ``False``)
        remove_suffix (bool, optional): If true, removes the suffix from each result,
            otherwise returns the result as found (Default: ``False``).
    """
    root = os.path.expanduser(root)
    for dirpath, dirs, files in os.walk(root):
        dirs.sort()
        # `dirs` is the list used by os.walk; by sorting it in-place here, we change the
        # behavior of os.walk to traverse sub directories alphabetically
        # see also
        # https://stackoverflow.com/questions/6670029/can-i-force-python3s-os-walk-to-visit-directories-in-alphabetical-order-how#comment71993866_6670926
        files.sort()
        for f in files:
            if f.endswith(suffix):
                if remove_suffix:
                    f = f[: -len(suffix)]
                if prefix:
                    f = os.path.join(dirpath, f)
                yield f
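For context, a minimal usage sketch of the helper being removed (the directory layout and file names are hypothetical):

```python
# Hypothetical layout:
#   ./data/a.wav
#   ./data/sub/b.wav
for path in walk_files('./data', suffix='.wav', prefix=True):
    # With prefix=True each yielded item includes its directory,
    # e.g. './data/a.wav' then './data/sub/b.wav' (alphabetical traversal).
    print(path)
```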
class _DiskCache(Dataset):
    """
    Wrap a dataset so that, whenever a new item is returned, it is saved to disk.
......