Unverified Commit 8187dc0a authored by Aziz, committed by GitHub

Remove walk_files (#1111)

The use of `walk_files` made it ambiguous who is responsible for locating
the correct set of files (the Dataset class, or the utility?).
In fact, recursively globbing everything is not the right approach when implementing a
Dataset: if you are accessing a specific dataset, then
its directory structure and file locations are already determined, so there is no need to recurse to an arbitrary depth.
Each Dataset implementation should glob exactly the set of files it requires.
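As a rough illustration of the intended pattern (the dataset class, directory layout, and file extension below are hypothetical and not part of this change), a Dataset can glob its own known locations directly instead of walking the whole tree:

```python
from pathlib import Path

from torch.utils.data import Dataset


class MyAudioDataset(Dataset):
    """Hypothetical dataset illustrating the intended pattern.

    The dataset knows its own layout (assumed here to be ``<root>/clips/*.wav``),
    so it globs exactly the files it needs; no arbitrary-depth recursion required.
    """

    def __init__(self, root: str) -> None:
        # Glob only the known location within the dataset directory.
        self._files = sorted(Path(root).glob("clips/*.wav"))

    def __getitem__(self, index: int) -> Path:
        return self._files[index]

    def __len__(self) -> int:
        return len(self._files)
```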
parent 37692d8d
import os
from pathlib import Path
from torchaudio.datasets import utils as dataset_utils
from torchaudio.datasets.commonvoice import COMMONVOICE
@@ -11,44 +8,6 @@ from torchaudio_unittest.common_utils import (
)
class TestWalkFiles(TempDirMixin, TorchaudioTestCase):
    root = None
    expected = None

    def _add_file(self, *parts):
        path = self.get_temp_path(*parts)
        self.expected.append(path)
        Path(path).touch()

    def setUp(self):
        self.root = self.get_temp_path()
        self.expected = []
        # level 1
        for filename in ['a.txt', 'b.txt', 'c.txt']:
            self._add_file(filename)
        # level 2
        for dir1 in ['d1', 'd2', 'd3']:
            for filename in ['d.txt', 'e.txt', 'f.txt']:
                self._add_file(dir1, filename)
            # level 3
            for dir2 in ['d1', 'd2', 'd3']:
                for filename in ['g.txt', 'h.txt', 'i.txt']:
                    self._add_file(dir1, dir2, filename)
        print('\n'.join(self.expected))

    def test_walk_files(self):
        """walk_files should traverse files in alphabetical order"""
        n_ites = 0
        for i, path in enumerate(dataset_utils.walk_files(self.root, '.txt', prefix=True)):
            found = os.path.join(self.root, path)
            assert found == self.expected[i]
            n_ites += 1
        assert n_ites == len(self.expected)


class TestIterator(TorchaudioTestCase):
    backend = 'default'
    path = get_asset_path('CommonVoice', 'cv-corpus-4-2019-12-10', 'tt')
......
import errno
import hashlib
import logging
import os
import sys
import tarfile
import threading
import urllib
import urllib.request
import zipfile
from _io import TextIOWrapper
from queue import Queue
from typing import Any, Iterable, List, Optional, Tuple, Union
from typing import Any, Iterable, List, Optional
import torch
import urllib
import urllib.request
from torch.utils.data import Dataset
from torch.utils.model_zoo import tqdm
@@ -203,42 +200,6 @@ def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bo
raise NotImplementedError("We currently only support tar.gz, tgz, and zip achives.")
def walk_files(root: str,
               suffix: Union[str, Tuple[str]],
               prefix: bool = False,
               remove_suffix: bool = False) -> Iterable[str]:
    """List recursively all files ending with a suffix at a given root

    Args:
        root (str): Path to directory whose folders need to be listed
        suffix (str or tuple): Suffix of the files to match, e.g. '.png' or ('.jpg', '.png').
            It uses the Python "str.endswith" method and is passed directly
        prefix (bool, optional): If true, prepends the full path to each result, otherwise
            only returns the name of the files found (Default: ``False``)
        remove_suffix (bool, optional): If true, removes the suffix from each result,
            otherwise returns the result as found (Default: ``False``).
    """
    root = os.path.expanduser(root)
    for dirpath, dirs, files in os.walk(root):
        dirs.sort()
        # `dirs` is the list used by os.walk; by sorting it in-place here, we change the
        # behavior of os.walk to traverse sub directories alphabetically
        # see also
        # https://stackoverflow.com/questions/6670029/can-i-force-python3s-os-walk-to-visit-directories-in-alphabetical-order-how#comment71993866_6670926
        files.sort()
        for f in files:
            if f.endswith(suffix):
                if remove_suffix:
                    f = f[: -len(suffix)]
                if prefix:
                    f = os.path.join(dirpath, f)
                yield f
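For context, a minimal usage sketch of the helper being removed (the directory layout and file names are hypothetical):

```python
# Hypothetical layout:
#   ./data/a.wav
#   ./data/sub/b.wav
for path in walk_files('./data', suffix='.wav', prefix=True):
    # With prefix=True each yielded item includes its directory,
    # e.g. './data/a.wav' then './data/sub/b.wav' (alphabetical traversal).
    print(path)
```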
class _DiskCache(Dataset):
    """
    Wrap a dataset so that, whenever a new item is returned, it is saved to disk.
......